From 1f1d7bb01a08813e93b22ded1059ae9af1449285 Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Sat, 22 Jan 2022 17:37:43 -0500
Subject: [PATCH 01/56] Modify dataset

---
 tuplex/python/tuplex/dataset.py | 82 +++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py
index a2b8c0b33..aa5b1ca12 100644
--- a/tuplex/python/tuplex/dataset.py
+++ b/tuplex/python/tuplex/dataset.py
@@ -28,6 +28,88 @@ class DataSet:
     def __init__(self):
         self._dataSet = None
 
+    def getDataLen(self):
+        data = self.collect()
+        if len(data) == 0:
+            return 0, 0
+        else:
+            return len(data), len(data[0])
+
+    def revTake(self, nRows = 5):
+        return self.collect()[-nRows:]
+
+    def _repr_html_(self):
+        rows_list = self.take()
+        total_row_cnt, total_col_cnt = self.getDataLen()
+        print('rowlist')
+        print(rows_list)
+        if len(rows_list) == 0:
+            header = '<tr></tr>\n'
+            rows = '<tr></tr>\n'
+        else:
+            header = '<th></th>\n'
+
+            if self.columns != None:
+                for x in self.columns:
+                    header += f'  <th>{x}</th>\n'
+            else:
+                for i in range(len(rows_list[0])):
+                    header += f'  <th>column {i + 1}</th>\n'
+
+            rows = ''
+            for i, r in enumerate(rows_list):
+                rows += '  <tr>\n'
+                rows += f'    <th>{i}</th>\n'
+                for data in r:
+                    rows += f'    <td>{data}</td>\n'
+                rows += '  </tr>\n'
+
+            # add the ...
+            rows += '  <tr>\n'
+            rows += '    <th>...</th>\n'
+            for i in range(total_col_cnt):
+                rows += '    <td>...</td>\n'
+            rows += '  </tr>\n'
+
+            lastData = self.revTake()
+            for i, r in enumerate(lastData):
+                rows += '  <tr>\n'
+                rows += f'    <th>{total_row_cnt - len(lastData) + i}</th>\n'
+                for data in r:
+                    rows += f'    <td>{data}</td>\n'
+                rows += '  </tr>\n'
+
+        html_template = (
+            '<div>\n'
+            '<table border="1" class="dataframe">\n'
+            '  <thead>\n'
+            '    <tr style="text-align: right;">\n'
+            f'{header}'
+            '    </tr>\n'
+            '  </thead>\n'
+            '  <tbody>\n'
+            f'{rows}'
+            '  </tbody>\n'
+            '</table>\n'
+            f'<p><i>{total_row_cnt} rows × {total_col_cnt} columns</i></p>\n'
+            '</div>'
+        )
+
+        return html_template
+
     def unique(self):
         """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement.
         Returns:

From 0b1e7677a6e6a93f4de812da3eb37c091689f490 Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Thu, 27 Jan 2022 22:12:09 -0500
Subject: [PATCH 02/56] Add in takeLast operator

---
 tuplex/core/include/DataSet.h                 |  2 +
 .../include/logical/LogicalOperatorType.h     |  1 +
 .../core/include/logical/TakeLastOperator.h   | 51 ++++++++++++++
 tuplex/core/src/DataSet.cc                    | 24 +++++++
 tuplex/core/src/logical/TaskLastOperator.cc   | 43 ++++++++++++
 tuplex/core/src/physical/PhysicalPlan.cc      |  7 +-
 tuplex/python/include/PythonDataSet.h         |  1 +
 tuplex/python/src/PythonBindings.cc           |  1 +
 tuplex/python/src/PythonDataSet.cc            | 69 +++++++++++++++++++
 tuplex/python/tuplex/dataset.py               | 17 +++++
 10 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100644 tuplex/core/include/logical/TakeLastOperator.h
 create mode 100644 tuplex/core/src/logical/TaskLastOperator.cc

diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h
index 899032723..429d8c6a7 100644
--- a/tuplex/core/include/DataSet.h
+++ b/tuplex/core/include/DataSet.h
@@ -269,6 +269,8 @@ namespace tuplex {
 
         virtual std::vector<Row> takeAsVector(int64_t numElements, std::ostream &os = std::cout);
 
+        virtual std::shared_ptr<ResultSet> takeLast(int64_t numElements, std::ostream &os = std::cout);
+
         /*!
          * saves dataset to file. There are multiple options to control the behavior
          * ==> 1.) files can be split across multiple ones. Specify number of files to split rows to
diff --git a/tuplex/core/include/logical/LogicalOperatorType.h b/tuplex/core/include/logical/LogicalOperatorType.h
index 594252820..b6a1c788b 100644
--- a/tuplex/core/include/logical/LogicalOperatorType.h
+++ b/tuplex/core/include/logical/LogicalOperatorType.h
@@ -17,6 +17,7 @@ namespace tuplex {
         MAP,
         FILTER,
         TAKE, // i.e. output to python / in memory
+        TAKELAST,
         PARALLELIZE, // i.e. input from python
         FILEINPUT,
         RESOLVE,
diff --git a/tuplex/core/include/logical/TakeLastOperator.h b/tuplex/core/include/logical/TakeLastOperator.h
new file mode 100644
index 000000000..28896e513
--- /dev/null
+++ b/tuplex/core/include/logical/TakeLastOperator.h
@@ -0,0 +1,51 @@
+//--------------------------------------------------------------------------------------------------------------------//
+//                                                                                                                      //
+//                                    Tuplex: Blazing Fast Python Data Science                                          //
+//                                                                                                                      //
+//                                                                                                                      //
+//  (c) 2017 - 2021, Tuplex team                                                                                        //
+//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
+//  License: Apache 2.0                                                                                                 //
+//--------------------------------------------------------------------------------------------------------------------//
+
+#ifndef TUPLEX_TAKELASTOPERATOR_H
+#define TUPLEX_TAKELASTOPERATOR_H
+
+
+#include "LogicalOperator.h"
+
+namespace tuplex {
+    class TakeLastOperator : public LogicalOperator {
+    private:
+        int64_t _limit;
+    public:
+        LogicalOperator *clone() override;
+
+    public:
+        TakeLastOperator(LogicalOperator *parent, const int64_t numElements);
+
+        std::string name() override {
+            if(_limit < 0 || std::numeric_limits<int64_t>::max() == _limit)
+                return "collect";
+            return "take";
+        }
+        LogicalOperatorType type() const override { return LogicalOperatorType::TAKELAST; }
+
+        bool isActionable() override { return true; }
+
+        bool isDataSource() override { return false; }
+
+        bool good() const override;
+
+        int64_t limit() { return _limit; }
+
+
+        std::vector<Row> getSample(const size_t num) const override;
+
+        Schema getInputSchema() const override { return getOutputSchema(); }
+
+        std::vector<std::string> columns() const override;
+    };
+}
+
+#endif //TUPLEX_TAKELASTOPERATOR_H
\ No newline at end of file
diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc
index a53a14094..66a6a548c 100644
--- a/tuplex/core/src/DataSet.cc
+++ b/tuplex/core/src/DataSet.cc
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <logical/TakeLastOperator.h>
 #include
 #include
 #include
@@ -102,6 +103,29 @@ namespace tuplex {
         return v;
     }
 
+    std::shared_ptr<ResultSet> DataSet::takeLast(int64_t numElements, std::ostream &os) {
+        // error dataset?
+        if (isError())
+            throw std::runtime_error("is error dataset!");
+
+        // negative numbers mean get all elements!
+        if (numElements < 0)
+            numElements = std::numeric_limits<int64_t>::max();
+
+        // create a take node
+        assert(_context);
+        LogicalOperator *op = _context->addOperator(new TakeLastOperator(this->_operator, numElements));
+        DataSet *dsptr = _context->createDataSet(op->getOutputSchema());
+        dsptr->_operator = op;
+        op->setDataSet(dsptr);
+
+        // perform action.
+        assert(this->_context);
+        auto rs = op->compute(*this->_context);
+
+        return rs;
+    }
+
     void DataSet::tofile(tuplex::FileFormat fmt, const tuplex::URI &uri,
                          const tuplex::UDF &udf, size_t fileCount, size_t shardSize,
                          const std::unordered_map<std::string, std::string> &outputOptions, size_t limit,
diff --git a/tuplex/core/src/logical/TaskLastOperator.cc b/tuplex/core/src/logical/TaskLastOperator.cc
new file mode 100644
index 000000000..92295efb3
--- /dev/null
+++ b/tuplex/core/src/logical/TaskLastOperator.cc
@@ -0,0 +1,43 @@
+//--------------------------------------------------------------------------------------------------------------------//
+//                                                                                                                      //
+//                                    Tuplex: Blazing Fast Python Data Science                                          //
+//                                                                                                                      //
+//                                                                                                                      //
+//  (c) 2017 - 2021, Tuplex team                                                                                        //
+//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
+//  License: Apache 2.0                                                                                                 //
+//--------------------------------------------------------------------------------------------------------------------//
+
+#include <logical/TakeLastOperator.h>
+#include
+
+namespace tuplex {
+    TakeLastOperator::TakeLastOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) {
+        // take schema from parent node
+        setSchema(this->parent()->getOutputSchema());
+    }
+
+    bool TakeLastOperator::good() const {
+        return _limit >= -1;
+    }
+
+    std::vector<Row> TakeLastOperator::getSample(const size_t num) const {
+        // take sample from parent
+        return parent()->getSample(num);
+    }
+
+    std::vector<std::string> TakeLastOperator::columns() const {
+        assert(parent());
+        return parent()->columns();
+    }
+
+    LogicalOperator *TakeLastOperator::clone() {
+        // create clone of this operator
+        auto copy = new TakeLastOperator(parent()->clone(), _limit);
+
+        copy->setDataSet(getDataSet()); // weak ptr to old dataset...
+        copy->copyMembers(this);
+        assert(getID() == copy->getID());
+        return copy;
+    }
+}
\ No newline at end of file
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 2399edf6f..87a73a712 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -208,7 +208,9 @@ namespace tuplex {
         if(ops.back()->isActionable()) {
             if(ops.back()->type() == LogicalOperatorType::FILEOUTPUT)
                 outputMode = EndPointMode::FILE;
-            else if(ops.back()->type() == LogicalOperatorType::TAKE || ops.back()->type() == LogicalOperatorType::CACHE) {
+            else if(ops.back()->type() == LogicalOperatorType::TAKE ||
+                    ops.back()->type() == LogicalOperatorType::TAKELAST ||
+                    ops.back()->type() == LogicalOperatorType::CACHE) {
                 // memory?
                 outputMode = EndPointMode::MEMORY;
             } else
@@ -382,6 +384,9 @@ namespace tuplex {
         if(outputNode->type() == LogicalOperatorType::TAKE) {
             auto top = static_cast<TakeOperator*>(outputNode);
             builder.setOutputLimit(top->limit());
+        } else if (outputNode->type() == LogicalOperatorType::TAKELAST) {
+            auto top = static_cast<TakeLastOperator*>(outputNode);
+            builder.setOutputLimit(top->limit());
         }
 
         // @TODO: add slowPip builder to this process...
diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h
index 665d68856..58827ea33 100644
--- a/tuplex/python/include/PythonDataSet.h
+++ b/tuplex/python/include/PythonDataSet.h
@@ -78,6 +78,7 @@ namespace tuplex {
 
         py::object collect();
         py::object take(const int64_t numRows);
+        boost::python::object takeLast(const int64_t numRows);
         void show(const int64_t numRows=-1);
 
         // DataFrame like operations
diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc
index 6b3683853..4d0b1f4e9 100644
--- a/tuplex/python/src/PythonBindings.cc
+++ b/tuplex/python/src/PythonBindings.cc
@@ -43,6 +43,7 @@ PYMODULE {
             .def("show", &tuplex::PythonDataSet::show)
             .def("collect", &tuplex::PythonDataSet::collect)
             .def("take", &tuplex::PythonDataSet::take)
+            .def("takeLast", &tuplex::PythonDataSet::takeLast)
             .def("map", &tuplex::PythonDataSet::map)
             .def("resolve", &tuplex::PythonDataSet::resolve)
             .def("ignore", &tuplex::PythonDataSet::ignore)
diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc
index 36f9a392b..2e54deec5 100644
--- a/tuplex/python/src/PythonDataSet.cc
+++ b/tuplex/python/src/PythonDataSet.cc
@@ -176,6 +176,75 @@ namespace tuplex {
         }
     }
 
+    boost::python::object PythonDataSet::takeLast(const int64_t numRows) {
+        // make sure a dataset is wrapped
+        assert(this->_dataset);
+
+        // is callee error dataset? if so return list with error string
+        if (this->_dataset->isError()) {
+            ErrorDataSet *eds = static_cast<ErrorDataSet*>(this->_dataset);
+            boost::python::list L;
+            L.append(eds->getError());
+            // Logger::instance().flushAll();
+            Logger::instance().flushToPython();
+            return L;
+        } else {
+            std::stringstream ss;
+
+            // release GIL & hand over everything to Tuplex
+            assert(PyGILState_Check()); // make sure this thread holds the GIL!
+            python::unlockGIL();
+
+            std::shared_ptr<ResultSet> rs;
+            std::string err_message = "";
+            try {
+                rs = _dataset->takeLast(numRows, ss);
+                if(!rs)
+                    throw std::runtime_error("invalid result set");
+                // if there are more than 1 million (100k in debug mode) elements print message...
+                if (rs->rowCount() > LARGE_RESULT_SIZE)
+                    Logger::instance().logger("python").info("transferring "
+                                                             + std::to_string(rs->rowCount()) +
+                                                             " elements back to Python. This might take a while...");
+            } catch(const std::exception& e) {
+                err_message = e.what();
+                Logger::instance().defaultLogger().error(err_message);
+            } catch(...) {
+                err_message = "unknown C++ exception occurred, please change type.";
+                Logger::instance().defaultLogger().error(err_message);
+            }
+
+            // reacquire GIL
+            python::lockGIL();
+
+            // error? then return list of error string
+            if(!rs || !err_message.empty()) {
+                // Logger::instance().flushAll();
+                Logger::instance().flushToPython();
+                auto listObj = PyList_New(1);
+                PyList_SetItem(listObj, 0, python::PyString_FromString(err_message.c_str()));
+                auto list = boost::python::object(boost::python::borrowed<>(listObj));
+                return list;
+            }
+
+            // collect results & transfer them back to python
+            // new version, directly interact with the interpreter
+            Timer timer;
+            // build python list object from resultset
+            auto listObj = resultSetToCPython(rs.get(), numRows);
+            Logger::instance().logger("python").info("Data transfer back to python took "
+                                                     + std::to_string(timer.time()) + " seconds");
+            // Logger::instance().flushAll();
+            Logger::instance().flushToPython();
+
+            // print errors
+            if (ss.str().length() > 0)
+                PySys_FormatStdout("%s", ss.str().c_str());
+
+            return boost::python::object(boost::python::handle<>(listObj));
+        }
+    }
+
     PythonDataSet PythonDataSet::map(const std::string &lambda_code, const std::string &pickled_code,
                                      const py::object& closure) {
         auto& logger = Logger::instance().logger("python");
diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py
index aa5b1ca12..a1d838526 100644
--- a/tuplex/python/tuplex/dataset.py
+++ b/tuplex/python/tuplex/dataset.py
@@ -208,6 +208,23 @@ def take(self, nrows=5):
 
         return self._dataSet.take(nrows)
 
+    def takeLast(self, nrows=5):
+        """ action that generates a physical plan, processes data and collects the last ``nrows`` rows as a list of tuples.
+
+        Args:
+            nrows (int): number of rows to collect. Per default ``5``.
+        Returns:
+            (list): A list of tuples
+
+        """
+
+        assert isinstance(nrows, int), 'num rows must be an integer'
+        assert nrows > 0, 'please specify a number greater than zero'
+
+        assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
+
+        return self._dataSet.takeLast(nrows)
+
     def show(self, nrows=None):
         """ action that generates a physical plan, processes data and prints results as nicely formatted
         ASCII table to stdout.
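At this point the new action is wired through every layer: a TAKELAST logical operator, a DataSet::takeLast() action in the C++ core, a boost::python binding, and a takeLast() method on the Python DataSet. The sketch below illustrates the intended Python-facing semantics only; it is a hypothetical session (names and values invented, assuming a working build), not code from the series, and takeLast is expected to agree with collect()[-n:]:

    from tuplex import Context

    ctx = Context()
    ds = ctx.parallelize([1, 2, 3, 4, 5, 6])

    ds.take(2)        # first rows: [1, 2]
    ds.takeLast(2)    # intended: same as ds.collect()[-2:], i.e. [5, 6]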
From eafb76d2a4e6461a3c2e035c8b0e5abed76d9a7e Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Thu, 27 Jan 2022 22:25:39 -0500
Subject: [PATCH 03/56] (wip) add reverse limit in partition

---
 tuplex/core/include/Partition.h            | 15 +++++++
 tuplex/core/src/physical/TransformStage.cc | 51 ++++++++++++++------
 2 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h
index 9bc7fc54c..5a66023fd 100644
--- a/tuplex/core/include/Partition.h
+++ b/tuplex/core/include/Partition.h
@@ -248,6 +248,21 @@ namespace tuplex {
             _mutex.unlock();
         }
 
+        void setNumLastRows(const size_t numRows) {
+            // TODO: set another value instead
+            _mutex.lock();
+
+            _numRows = numRows;
+
+            // save to memptr
+            if(_arena) {
+                *((int64_t*)_arena) = numRows;
+            }
+
+            _mutex.unlock();
+        }
+
+
         int64_t getDataSetID() const { return _dataSetID; }
 
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index b61f9cbe2..9cd15694a 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -139,21 +139,46 @@ namespace tuplex {
         }
 
         // check output limit, adjust partitions if necessary
-        size_t numOutputRows = 0;
-        for (auto partition : partitions) {
-            numOutputRows += partition->getNumRows();
-            if (numOutputRows >= outputLimit()) {
-                // clip last partition & leave loop
-                auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
-                assert(clipped <= partition->getNumRows());
-                partition->setNumRows(clipped);
-                if (clipped > 0)
-                    limitedPartitions.push_back(partition);
-                break;
-            } else {
-                // put full partition to output set
-                limitedPartitions.push_back(partition);
-            }
-        }
+        // TODO: add reverse outputLimit condition here
+        if (true) {
+            size_t numOutputRows = 0;
+            for (auto partition : partitions) {
+                numOutputRows += partition->getNumRows();
+                if (numOutputRows >= outputLimit()) {
+                    // clip last partition & leave loop
+                    auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
+                    assert(clipped <= partition->getNumRows());
+                    partition->setNumRows(clipped);
+                    if (clipped > 0)
+                        limitedPartitions.push_back(partition);
+                    break;
+                } else {
+                    // put full partition to output set
+                    limitedPartitions.push_back(partition);
+                }
+            }
+        } else {
+            size_t numOutputRows = 0;
+            for (auto partitionIt = partitions.rbegin();
+                 partitionIt != partitions.rend(); partitionIt++) {
+                numOutputRows += (*partitionIt)->getNumRows();
+                if (numOutputRows >= outputLimit()) {
+                    // clip last partition & leave loop
+                    auto clipped = outputLimit() - (numOutputRows - (*partitionIt)->getNumRows());
+                    assert(clipped <= (*partitionIt)->getNumRows());
+
+                    // TODO: do backward clip here instead
+                    (*partitionIt)->setNumRows(clipped);
+                    if (clipped > 0)
+                        limitedPartitions.push_back(*partitionIt);
+                    break;
+                } else {
+                    // put full partition to output set
+                    limitedPartitions.push_back(*partitionIt);
+                }
+            }
+
+            std::reverse(limitedPartitions.begin(), limitedPartitions.end());
+        }
     }

From cb47a4da16279cbee77a76664d4702f5e38f7b5d Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Fri, 11 Feb 2022 09:20:06 -0500
Subject: [PATCH 04/56] Remove row count

---
 tuplex/python/tuplex/dataset.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py
index a1d838526..976a751f4 100644
--- a/tuplex/python/tuplex/dataset.py
+++ b/tuplex/python/tuplex/dataset.py
@@ -28,19 +28,19 @@ class DataSet:
     def __init__(self):
         self._dataSet = None
 
-    def getDataLen(self):
+    def getColumnSize(self):
         data = self.collect()
         if len(data) == 0:
             return 0, 0
         else:
-            return len(data), len(data[0])
+            return len(data[0])
 
     def revTake(self, nRows = 5):
         return self.collect()[-nRows:]
 
     def _repr_html_(self):
         rows_list = self.take()
-        total_row_cnt, total_col_cnt = self.getDataLen()
+        total_col_cnt = self.getColumnSize()
         print('rowlist')
         print(rows_list)
         if len(rows_list) == 0:
@@ -74,7 +74,7 @@ def _repr_html_(self):
         lastData = self.revTake()
         for i, r in enumerate(lastData):
             rows += '  <tr>\n'
-            rows += f'    <th>{total_row_cnt - len(lastData) + i}</th>\n'
+            rows += f'    <th>{0 - len(lastData) + i}</th>\n'
             for data in r:
                 rows += f'    <td>{data}</td>\n'
             rows += '  </tr>\n'
@@ -104,7 +104,7 @@ def _repr_html_(self):
             f'{rows}'
             '  </tbody>\n'
             '</table>\n'
-            f'<p><i>{total_row_cnt} rows × {total_col_cnt} columns</i></p>\n'
+            f'<p><i>{total_col_cnt} columns</i></p>\n'
             '</div>'
         )
 

From d879bcd0a9d95e4f3e9812635c44769dfcc8e74e Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Tue, 15 Feb 2022 23:30:47 -0500
Subject: [PATCH 05/56] refactor TakeOperator

---
 tuplex/core/include/DataSet.h                     |  3 +-
 tuplex/core/include/EmptyDataset.h                |  2 +-
 tuplex/core/include/ErrorDataSet.h                |  2 +-
 .../include/logical/LogicalOperatorType.h         |  1 -
 .../core/include/logical/TakeLastOperator.h       | 51 ------------
 tuplex/core/include/logical/TakeOperator.h        | 10 ++-
 tuplex/core/src/DataSet.cc                        | 36 ++-------
 tuplex/core/src/EmptyDataset.cc                   |  4 +-
 tuplex/core/src/ErrorDataSet.cc                   |  4 +-
 tuplex/core/src/logical/TakeOperator.cc           |  6 +-
 tuplex/core/src/logical/TaskLastOperator.cc       | 43 -----------
 tuplex/core/src/physical/PhysicalPlan.cc          |  4 -
 tuplex/core/src/physical/StageBuilder.cc          |  3 +-
 tuplex/core/src/physical/TransformStage.cc        | 51 ++++---------
 tuplex/python/include/PythonDataSet.h             |  3 +-
 tuplex/python/src/PythonBindings.cc               |  1 -
 tuplex/python/src/PythonDataSet.cc                | 73 +-----------------
 tuplex/python/tuplex/dataset.py                   | 23 +-----
 18 files changed, 44 insertions(+), 276 deletions(-)
 delete mode 100644 tuplex/core/include/logical/TakeLastOperator.h
 delete mode 100644 tuplex/core/src/logical/TaskLastOperator.cc

diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h
index 429d8c6a7..65a766a87 100644
--- a/tuplex/core/include/DataSet.h
+++ b/tuplex/core/include/DataSet.h
@@ -263,13 +263,12 @@ namespace tuplex {
         // these are actions that cause execution
         virtual std::shared_ptr<ResultSet> collect(std::ostream &os = std::cout);
 
-        virtual std::shared_ptr<ResultSet> take(int64_t numElements, std::ostream &os = std::cout);
+        virtual std::shared_ptr<ResultSet> take(int64_t numTop, int64_t numBottom, std::ostream &os = std::cout);
 
         virtual std::vector<Row> collectAsVector(std::ostream &os = std::cout);
 
         virtual std::vector<Row> takeAsVector(int64_t numElements, std::ostream &os = std::cout);
 
-        virtual std::shared_ptr<ResultSet> takeLast(int64_t numElements, std::ostream &os = std::cout);
 
         /*!
         * saves dataset to file. There are multiple options to control the behavior
diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h
index b3c1ed7af..0f8a1f52c 100644
--- a/tuplex/core/include/EmptyDataset.h
+++ b/tuplex/core/include/EmptyDataset.h
@@ -70,7 +70,7 @@ namespace tuplex {
         virtual std::shared_ptr<ResultSet> collect(std::ostream& os) override;
 
         // take / collect will print out the error only
-        virtual std::shared_ptr<ResultSet> take(int64_t numElements, std::ostream& os) override;
+        virtual std::shared_ptr<ResultSet> take(int64_t numTop, int64_t numBottom, std::ostream& os) override;
 
         //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override;
         virtual std::vector<Row> collectAsVector(std::ostream& os) override;
diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h
index 2f46d8638..34fc60685 100644
--- a/tuplex/core/include/ErrorDataSet.h
+++ b/tuplex/core/include/ErrorDataSet.h
@@ -90,7 +90,7 @@ namespace tuplex {
         std::shared_ptr<ResultSet> collect(std::ostream& os) override;
 
         // take / collect will print out the error only
-        std::shared_ptr<ResultSet> take(int64_t numElements, std::ostream& os) override;
+        std::shared_ptr<ResultSet> take(int64_t numTop, int64_t numBottom, std::ostream& os) override;
 
         //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override;
         std::vector<Row> collectAsVector(std::ostream& os) override;
diff --git a/tuplex/core/include/logical/LogicalOperatorType.h b/tuplex/core/include/logical/LogicalOperatorType.h
index b6a1c788b..594252820 100644
--- a/tuplex/core/include/logical/LogicalOperatorType.h
+++ b/tuplex/core/include/logical/LogicalOperatorType.h
@@ -17,7 +17,6 @@ namespace tuplex {
         MAP,
         FILTER,
         TAKE, // i.e. output to python / in memory
-        TAKELAST,
         PARALLELIZE, // i.e. input from python
         FILEINPUT,
         RESOLVE,
diff --git a/tuplex/core/include/logical/TakeLastOperator.h b/tuplex/core/include/logical/TakeLastOperator.h
deleted file mode 100644
index 28896e513..000000000
--- a/tuplex/core/include/logical/TakeLastOperator.h
+++ /dev/null
@@ -1,51 +0,0 @@
-//--------------------------------------------------------------------------------------------------------------------//
-//                                                                                                                      //
-//                                    Tuplex: Blazing Fast Python Data Science                                          //
-//                                                                                                                      //
-//                                                                                                                      //
-//  (c) 2017 - 2021, Tuplex team                                                                                        //
-//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
-//  License: Apache 2.0                                                                                                 //
-//--------------------------------------------------------------------------------------------------------------------//
-
-#ifndef TUPLEX_TAKELASTOPERATOR_H
-#define TUPLEX_TAKELASTOPERATOR_H
-
-
-#include "LogicalOperator.h"
-
-namespace tuplex {
-    class TakeLastOperator : public LogicalOperator {
-    private:
-        int64_t _limit;
-    public:
-        LogicalOperator *clone() override;
-
-    public:
-        TakeLastOperator(LogicalOperator *parent, const int64_t numElements);
-
-        std::string name() override {
-            if(_limit < 0 || std::numeric_limits<int64_t>::max() == _limit)
-                return "collect";
-            return "take";
-        }
-        LogicalOperatorType type() const override { return LogicalOperatorType::TAKELAST; }
-
-        bool isActionable() override { return true; }
-
-        bool isDataSource() override { return false; }
-
-        bool good() const override;
-
-        int64_t limit() { return _limit; }
-
-
-        std::vector<Row> getSample(const size_t num) const override;
-
-        Schema getInputSchema() const override { return getOutputSchema(); }
-
-        std::vector<std::string> columns() const override;
-    };
-}
-
-#endif //TUPLEX_TAKELASTOPERATOR_H
\ No newline at end of file
diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h
index 8d0d6dcab..20c035a74 100644
--- a/tuplex/core/include/logical/TakeOperator.h
+++ b/tuplex/core/include/logical/TakeOperator.h
@@ -17,15 +17,16 @@ namespace tuplex {
 
     class TakeOperator : public LogicalOperator {
     private:
-        int64_t _limit;
+        int64_t _limitTop;
+        int64_t _limitBottom;
    public:
         LogicalOperator *clone() override;
 
     public:
-        TakeOperator(LogicalOperator *parent, const int64_t numElements);
+        TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom);
 
         std::string name() override {
-            if(_limit < 0 || std::numeric_limits<int64_t>::max() == _limit)
+            if(_limitTop < 0 || std::numeric_limits<int64_t>::max() == _limitTop)
                 return "collect";
             return "take";
         }
@@ -37,8 +38,9 @@ namespace tuplex {
 
         bool good() const override;
 
-        int64_t limit() { return _limit; }
+        int64_t limit() { return _limitTop; }
 
+        bool limitBottom() { return _limitBottom; }
 
         std::vector<Row> getSample(const size_t num) const override;
 
diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc
index 66a6a548c..3de903d1c 100644
--- a/tuplex/core/src/DataSet.cc
+++ b/tuplex/core/src/DataSet.cc
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include <logical/TakeLastOperator.h>
 #include
 #include
 #include
@@ -39,21 +38,21 @@ namespace tuplex {
     }
 
     std::shared_ptr<ResultSet> DataSet::collect(std::ostream &os) {
-        return take(-1, os);
+        return take(-1, false, os);
     }
 
-    std::shared_ptr<ResultSet> DataSet::take(int64_t numElements, std::ostream &os) {
+    std::shared_ptr<ResultSet> DataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) {
         // error dataset?
         if (isError())
             throw std::runtime_error("is error dataset!");
 
         // negative numbers mean get all elements!
-        if (numElements < 0)
-            numElements = std::numeric_limits<int64_t>::max();
+        if (numTop < 0)
+            numTop = std::numeric_limits<int64_t>::max();
 
         // create a take node
         assert(_context);
-        LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numElements));
+        LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numTop, numBottom));
         DataSet *dsptr = _context->createDataSet(op->getOutputSchema());
         dsptr->_operator = op;
         op->setDataSet(dsptr);
@@ -72,7 +71,7 @@ namespace tuplex {
 
     // -1 means to retrieve all elements
     std::vector<Row> DataSet::takeAsVector(int64_t numElements, std::ostream &os) {
-        auto rs = take(numElements, os);
+        auto rs = take(numElements, false, os);
 
         Timer timer;
 #warning "limiting should make this hack irrelevant..."
@@ -102,29 +102,6 @@ namespace tuplex {
         return v;
     }
 
-    std::shared_ptr<ResultSet> DataSet::takeLast(int64_t numElements, std::ostream &os) {
-        // error dataset?
-        if (isError())
-            throw std::runtime_error("is error dataset!");
-
-        // negative numbers mean get all elements!
-        if (numElements < 0)
-            numElements = std::numeric_limits<int64_t>::max();
-
-        // create a take node
-        assert(_context);
-        LogicalOperator *op = _context->addOperator(new TakeLastOperator(this->_operator, numElements));
-        DataSet *dsptr = _context->createDataSet(op->getOutputSchema());
-        dsptr->_operator = op;
-        op->setDataSet(dsptr);
-
-        // perform action.
-        assert(this->_context);
-        auto rs = op->compute(*this->_context);
-
-        return rs;
-    }
-
     void DataSet::tofile(tuplex::FileFormat fmt, const tuplex::URI &uri,
                          const tuplex::UDF &udf, size_t fileCount, size_t shardSize,
                          const std::unordered_map<std::string, std::string> &outputOptions, size_t limit,
diff --git a/tuplex/core/src/EmptyDataset.cc b/tuplex/core/src/EmptyDataset.cc
index 984fa904f..7504e8499 100644
--- a/tuplex/core/src/EmptyDataset.cc
+++ b/tuplex/core/src/EmptyDataset.cc
@@ -11,7 +11,7 @@
 #include
 
 namespace tuplex {
-    std::shared_ptr<ResultSet> EmptyDataset::take(int64_t numElements, std::ostream &os) {
+    std::shared_ptr<ResultSet> EmptyDataset::take(int64_t numTop, int64_t numBottom, std::ostream &os) {
         return std::make_shared<ResultSet>();
     }
 
@@ -20,7 +20,7 @@ namespace tuplex {
     }
 
     std::shared_ptr<ResultSet> EmptyDataset::collect(std::ostream &os) {
-        return take(0, os);
+        return take(0, false, os);
     }
 
     std::vector<Row> EmptyDataset::collectAsVector(std::ostream &os) {
diff --git a/tuplex/core/src/ErrorDataSet.cc b/tuplex/core/src/ErrorDataSet.cc
index 57c03ffba..9d19594f2 100644
--- a/tuplex/core/src/ErrorDataSet.cc
+++ b/tuplex/core/src/ErrorDataSet.cc
@@ -23,7 +23,7 @@ namespace tuplex {
         return takeAsVector(0, os);
     }
 
-    std::shared_ptr<ResultSet> ErrorDataSet::take(int64_t numElements, std::ostream &os) {
+    std::shared_ptr<ResultSet> ErrorDataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) {
 
         // return empty vector and print err message
         Logger::instance().logger("core").error(this->_error);
@@ -31,7 +31,7 @@ namespace tuplex {
     }
 
     std::shared_ptr<ResultSet> ErrorDataSet::collect(std::ostream &os) {
-        return take(0, os);
+        return take(0, false, os);
     }
 
     void
diff --git a/tuplex/core/src/logical/TakeOperator.cc b/tuplex/core/src/logical/TakeOperator.cc
index aa7c49668..e588b5e97 100644
--- a/tuplex/core/src/logical/TakeOperator.cc
+++ b/tuplex/core/src/logical/TakeOperator.cc
@@ -12,13 +12,13 @@
 #include
 
 namespace tuplex {
-    TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) {
+    TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom) : LogicalOperator::LogicalOperator(parent), _limitTop(numTop), _limitBottom(numBottom) {
         // take schema from parent node
         setSchema(this->parent()->getOutputSchema());
     }
 
     bool TakeOperator::good() const {
-        return _limit >= -1;
+        return _limitTop >= -1 && _limitBottom >= -1;
     }
 
     std::vector<Row> TakeOperator::getSample(const size_t num) const {
@@ -33,7 +33,7 @@ namespace tuplex {
 
     LogicalOperator *TakeOperator::clone() {
         // create clone of this operator
-        auto copy = new TakeOperator(parent()->clone(), _limit);
+        auto copy = new TakeOperator(parent()->clone(), _limitTop, _limitBottom);
 
         copy->setDataSet(getDataSet()); // weak ptr to old dataset...
        copy->copyMembers(this);
         assert(getID() == copy->getID());
         return copy;
     }
}
diff --git a/tuplex/core/src/logical/TaskLastOperator.cc b/tuplex/core/src/logical/TaskLastOperator.cc
deleted file mode 100644
index 92295efb3..000000000
--- a/tuplex/core/src/logical/TaskLastOperator.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-//--------------------------------------------------------------------------------------------------------------------//
-//                                                                                                                      //
-//                                    Tuplex: Blazing Fast Python Data Science                                          //
-//                                                                                                                      //
-//                                                                                                                      //
-//  (c) 2017 - 2021, Tuplex team                                                                                        //
-//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
-//  License: Apache 2.0                                                                                                 //
-//--------------------------------------------------------------------------------------------------------------------//
-
-#include <logical/TakeLastOperator.h>
-#include
-
-namespace tuplex {
-    TakeLastOperator::TakeLastOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) {
-        // take schema from parent node
-        setSchema(this->parent()->getOutputSchema());
-    }
-
-    bool TakeLastOperator::good() const {
-        return _limit >= -1;
-    }
-
-    std::vector<Row> TakeLastOperator::getSample(const size_t num) const {
-        // take sample from parent
-        return parent()->getSample(num);
-    }
-
-    std::vector<std::string> TakeLastOperator::columns() const {
-        assert(parent());
-        return parent()->columns();
-    }
-
-    LogicalOperator *TakeLastOperator::clone() {
-        // create clone of this operator
-        auto copy = new TakeLastOperator(parent()->clone(), _limit);
-
-        copy->setDataSet(getDataSet()); // weak ptr to old dataset...
-        copy->copyMembers(this);
-        assert(getID() == copy->getID());
-        return copy;
-    }
-}
\ No newline at end of file
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 87a73a712..17a4c7c0e 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -209,7 +209,6 @@ namespace tuplex {
             if(ops.back()->type() == LogicalOperatorType::FILEOUTPUT)
                 outputMode = EndPointMode::FILE;
             else if(ops.back()->type() == LogicalOperatorType::TAKE ||
-                    ops.back()->type() == LogicalOperatorType::TAKELAST ||
                     ops.back()->type() == LogicalOperatorType::CACHE) {
                 // memory?
                 outputMode = EndPointMode::MEMORY;
             } else
@@ -384,9 +383,6 @@ namespace tuplex {
         if(outputNode->type() == LogicalOperatorType::TAKE) {
             auto top = static_cast<TakeOperator*>(outputNode);
             builder.setOutputLimit(top->limit());
-        } else if (outputNode->type() == LogicalOperatorType::TAKELAST) {
-            auto top = static_cast<TakeLastOperator*>(outputNode);
-            builder.setOutputLimit(top->limit());
         }
 
         // @TODO: add slowPip builder to this process...
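After this refactor a single TAKE operator carries both a head and a tail limit, and the PhysicalPlan change above hands the pair to the stage builder. The intended end-to-end behavior, mirroring the unit tests added in PATCH 06 below, can be sketched in Python as follows (illustrative values only, assuming a Context named ctx; not code from the patch):

    from tuplex import Context

    ctx = Context()
    ds = ctx.parallelize([1, 2, 3, 4, 5, 6])

    ds.take(3)       # head only:     [1, 2, 3]
    ds.take(1, 1)    # head and tail: [1, 6]
    ds.take(2, 3)    # head and tail: [1, 2, 4, 5, 6]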
diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc
index 72f01e2b8..0bf509ed1 100644
--- a/tuplex/core/src/physical/StageBuilder.cc
+++ b/tuplex/core/src/physical/StageBuilder.cc
@@ -457,7 +457,8 @@ namespace tuplex {
                     break;
                 }
                 case LogicalOperatorType::TAKE: {
-                    opt_ops.push_back(new TakeOperator(lastParent, dynamic_cast<TakeOperator*>(node)->limit()));
+                    auto takeOp = dynamic_cast<TakeOperator*>(node);
+                    opt_ops.push_back(new TakeOperator(lastParent, takeOp->limit(), takeOp->limitBottom()));
                     opt_ops.back()->setID(node->getID());
                     break;
                 }
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index 9cd15694a..b61f9cbe2 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -139,46 +139,21 @@ namespace tuplex {
         }
 
         // check output limit, adjust partitions if necessary
-        // TODO: add reverse outputLimit condition here
-        if (true) {
-            size_t numOutputRows = 0;
-            for (auto partition : partitions) {
-                numOutputRows += partition->getNumRows();
-                if (numOutputRows >= outputLimit()) {
-                    // clip last partition & leave loop
-                    auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
-                    assert(clipped <= partition->getNumRows());
-                    partition->setNumRows(clipped);
-                    if (clipped > 0)
-                        limitedPartitions.push_back(partition);
-                    break;
-                } else {
-                    // put full partition to output set
-                    limitedPartitions.push_back(partition);
-                }
-            }
-        } else {
-            size_t numOutputRows = 0;
-            for (auto partitionIt = partitions.rbegin();
-                 partitionIt != partitions.rend(); partitionIt++) {
-                numOutputRows += (*partitionIt)->getNumRows();
-                if (numOutputRows >= outputLimit()) {
-                    // clip last partition & leave loop
-                    auto clipped = outputLimit() - (numOutputRows - (*partitionIt)->getNumRows());
-                    assert(clipped <= (*partitionIt)->getNumRows());
-
-                    // TODO: do backward clip here instead
-                    (*partitionIt)->setNumRows(clipped);
-                    if (clipped > 0)
-                        limitedPartitions.push_back(*partitionIt);
-                    break;
-                } else {
-                    // put full partition to output set
-                    limitedPartitions.push_back(*partitionIt);
-                }
-            }
-
-            std::reverse(limitedPartitions.begin(), limitedPartitions.end());
-        }
+        size_t numOutputRows = 0;
+        for (auto partition : partitions) {
+            numOutputRows += partition->getNumRows();
+            if (numOutputRows >= outputLimit()) {
+                // clip last partition & leave loop
+                auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
+                assert(clipped <= partition->getNumRows());
+                partition->setNumRows(clipped);
+                if (clipped > 0)
+                    limitedPartitions.push_back(partition);
+                break;
+            } else {
+                // put full partition to output set
+                limitedPartitions.push_back(partition);
+            }
+        }
     }
 
diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h
index 58827ea33..23b09314d 100644
--- a/tuplex/python/include/PythonDataSet.h
+++ b/tuplex/python/include/PythonDataSet.h
@@ -77,8 +77,7 @@ namespace tuplex {
         PythonDataSet resolve(const int64_t exceptionCode, const std::string& lambda_code,
                               const std::string& pickled_code, const py::object& closure=py::object());
         py::object collect();
-        py::object take(const int64_t numRows);
-        boost::python::object takeLast(const int64_t numRows);
+        py::object take(const int64_t numTop, const int64_t numBottom);
         void show(const int64_t numRows=-1);
 
         // DataFrame like operations
diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc
index 4d0b1f4e9..6b3683853 100644
--- a/tuplex/python/src/PythonBindings.cc
+++ b/tuplex/python/src/PythonBindings.cc
@@ -43,7 +43,6 @@ PYMODULE {
             .def("show", &tuplex::PythonDataSet::show)
             .def("collect", &tuplex::PythonDataSet::collect)
             .def("take", &tuplex::PythonDataSet::take)
-            .def("takeLast", &tuplex::PythonDataSet::takeLast)
             .def("map", &tuplex::PythonDataSet::map)
             .def("resolve", &tuplex::PythonDataSet::resolve)
             .def("ignore", &tuplex::PythonDataSet::ignore)
diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc
index 2e54deec5..853b910db 100644
--- a/tuplex/python/src/PythonDataSet.cc
+++ b/tuplex/python/src/PythonDataSet.cc
@@ -107,7 +107,7 @@ namespace tuplex {
         }
     }
 
-    py::object PythonDataSet::take(const int64_t numRows) {
+    py::object PythonDataSet::take(const int64_t numTop, const int64_t numBottom) {
         // make sure a dataset is wrapped
         assert(this->_dataset);
 
@@ -162,7 +162,7 @@ namespace tuplex {
             // new version, directly interact with the interpreter
             Timer timer;
             // build python list object from resultset
-            auto listObj = resultSetToCPython(rs.get(), numRows);
+            auto listObj = resultSetToCPython(rs.get(), numTop);
             Logger::instance().logger("python").info("Data transfer back to python took "
                                                      + std::to_string(timer.time()) + " seconds");
             // Logger::instance().flushAll();
@@ -176,75 +176,6 @@ namespace tuplex {
         }
     }
 
-    boost::python::object PythonDataSet::takeLast(const int64_t numRows) {
-        // make sure a dataset is wrapped
-        assert(this->_dataset);
-
-        // is callee error dataset? if so return list with error string
-        if (this->_dataset->isError()) {
-            ErrorDataSet *eds = static_cast<ErrorDataSet*>(this->_dataset);
-            boost::python::list L;
-            L.append(eds->getError());
-            // Logger::instance().flushAll();
-            Logger::instance().flushToPython();
-            return L;
-        } else {
-            std::stringstream ss;
-
-            // release GIL & hand over everything to Tuplex
-            assert(PyGILState_Check()); // make sure this thread holds the GIL!
-            python::unlockGIL();
-
-            std::shared_ptr<ResultSet> rs;
-            std::string err_message = "";
-            try {
-                rs = _dataset->takeLast(numRows, ss);
-                if(!rs)
-                    throw std::runtime_error("invalid result set");
-                // if there are more than 1 million (100k in debug mode) elements print message...
-                if (rs->rowCount() > LARGE_RESULT_SIZE)
-                    Logger::instance().logger("python").info("transferring "
-                                                             + std::to_string(rs->rowCount()) +
-                                                             " elements back to Python. This might take a while...");
-            } catch(const std::exception& e) {
-                err_message = e.what();
-                Logger::instance().defaultLogger().error(err_message);
-            } catch(...) {
-                err_message = "unknown C++ exception occurred, please change type.";
-                Logger::instance().defaultLogger().error(err_message);
-            }
-
-            // reacquire GIL
-            python::lockGIL();
-
-            // error? then return list of error string
-            if(!rs || !err_message.empty()) {
-                // Logger::instance().flushAll();
-                Logger::instance().flushToPython();
-                auto listObj = PyList_New(1);
-                PyList_SetItem(listObj, 0, python::PyString_FromString(err_message.c_str()));
-                auto list = boost::python::object(boost::python::borrowed<>(listObj));
-                return list;
-            }
-
-            // collect results & transfer them back to python
-            // new version, directly interact with the interpreter
-            Timer timer;
-            // build python list object from resultset
-            auto listObj = resultSetToCPython(rs.get(), numRows);
-            Logger::instance().logger("python").info("Data transfer back to python took "
-                                                     + std::to_string(timer.time()) + " seconds");
-            // Logger::instance().flushAll();
-            Logger::instance().flushToPython();
-
-            // print errors
-            if (ss.str().length() > 0)
-                PySys_FormatStdout("%s", ss.str().c_str());
-
-            return boost::python::object(boost::python::handle<>(listObj));
-        }
-    }
-
     PythonDataSet PythonDataSet::map(const std::string &lambda_code, const std::string &pickled_code,
                                      const py::object& closure) {
         auto& logger = Logger::instance().logger("python");
diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py
index a1d838526..1046505f2 100644
--- a/tuplex/python/tuplex/dataset.py
+++ b/tuplex/python/tuplex/dataset.py
@@ -191,7 +191,7 @@ def collect(self):
         assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
         return self._dataSet.collect()
 
-    def take(self, nrows=5):
+    def take(self, nrows=5, nbottom=0):
         """ action that generates a physical plan, processes data and collects the top results then as list of tuples.
 
         Args:
@@ -203,27 +203,12 @@ def take(self, nrows=5):
 
         assert isinstance(nrows, int), 'num rows must be an integer'
         assert nrows > 0, 'please specify a number greater than zero'
+        assert isinstance(nbottom, int), 'num bottom last must be an integer'
+        assert nbottom >= 0, 'please specify a number greater or equal to zero'
 
         assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
 
-        return self._dataSet.take(nrows)
-
-    def takeLast(self, nrows=5):
-        """ action that generates a physical plan, processes data and collects the last ``nrows`` rows as a list of tuples.
-
-        Args:
-            nrows (int): number of rows to collect. Per default ``5``.
-        Returns:
-            (list): A list of tuples
-
-        """
-
-        assert isinstance(nrows, int), 'num rows must be an integer'
-        assert nrows > 0, 'please specify a number greater than zero'
-
-        assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
-
-        return self._dataSet.takeLast(nrows)
+        return self._dataSet.take(nrows, nbottom)
 
     def show(self, nrows=None):
         """ action that generates a physical plan, processes data and prints results as nicely formatted
         ASCII table to stdout.

From a6f31ddb42e635df259de7c8f2057215118b6335 Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Wed, 16 Feb 2022 12:17:36 -0500
Subject: [PATCH 06/56] Add unit tests

---
 tuplex/test/core/TakeTest.cc | 125 +++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 tuplex/test/core/TakeTest.cc

diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc
new file mode 100644
index 000000000..08b648f34
--- /dev/null
+++ b/tuplex/test/core/TakeTest.cc
@@ -0,0 +1,125 @@
+//--------------------------------------------------------------------------------------------------------------------//
+//                                                                                                                      //
+//                                    Tuplex: Blazing Fast Python Data Science                                          //
+//                                                                                                                      //
+//                                                                                                                      //
+//  (c) 2017 - 2021, Tuplex team                                                                                        //
+//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
+//  License: Apache 2.0                                                                                                 //
+//--------------------------------------------------------------------------------------------------------------------//
+
+#include
+#include "TestUtils.h"
+
+class TakeTest : public PyTest {};
+
+TEST_F(TakeTest, takeTopTest) {
+    using namespace tuplex;
+    auto opt = testOptions();
+    Context context(opt);
+
+    auto rs = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0);
+
+    ASSERT_EQ(rs->rowCount(), 1);
+    auto v = rs->getRows(1);
+
+    EXPECT_EQ(v[0].getInt(0), 1);
+
+    auto rs2 = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0);
+
+    ASSERT_EQ(rs2->rowCount(), 3);
+    auto v2 = rs2->getRows(3);
+
+    EXPECT_EQ(v2[0].getInt(0), 1);
+    EXPECT_EQ(v2[1].getInt(0), 2);
+    EXPECT_EQ(v2[2].getInt(0), 3);
+
+    auto rs3 = context.parallelize(
+            {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(5, 0);
+
+    ASSERT_EQ(rs3->rowCount(), 5);
+    auto v3 = rs3->getRows(5);
+
+    EXPECT_EQ(v3[0].getString(0), "hello");
+    EXPECT_EQ(v3[1].getString(0), "world");
+    EXPECT_EQ(v3[2].getString(0), "! :)");
+    EXPECT_EQ(v3[3].getString(0), "world");
+    EXPECT_EQ(v3[4].getString(0), "hello");
+
+}
+
+TEST_F(TakeTest, takeBottomTest) {
+    using namespace tuplex;
+    auto opt = testOptions();
+    Context context(opt);
+
+    auto rs = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1);
+
+    ASSERT_EQ(rs->rowCount(), 1);
+    auto v = rs->getRows(1);
+
+    EXPECT_EQ(v[0].getInt(0), 6);
+
+    auto rs2 = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3);
+
+    ASSERT_EQ(rs2->rowCount(), 3);
+    auto v2 = rs2->getRows(3);
+
+    EXPECT_EQ(v2[0].getInt(0), 4);
+    EXPECT_EQ(v2[1].getInt(0), 5);
+    EXPECT_EQ(v2[2].getInt(0), 6);
+
+    auto rs3 = context.parallelize(
+            {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(0, 5);
+
+    ASSERT_EQ(rs3->rowCount(), 5);
+    auto v3 = rs3->getRows(5);
+
+    EXPECT_EQ(v3[0].getString(0), "world");
+    EXPECT_EQ(v3[1].getString(0), "hello");
+    EXPECT_EQ(v3[2].getString(0), "!");
+    EXPECT_EQ(v3[3].getString(0), "! :)");
+    EXPECT_EQ(v3[4].getString(0), "!");
+
+}
+
+TEST_F(TakeTest, takeBothTest) {
+    using namespace tuplex;
+    auto opt = testOptions();
+    Context context(opt);
+
+    auto rs = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1);
+
+    ASSERT_EQ(rs->rowCount(), 2);
+    auto v = rs->getRows(2);
+
+    EXPECT_EQ(v[0].getInt(0), 1);
+    EXPECT_EQ(v[1].getInt(0), 6);
+
+    auto rs2 = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1);
+
+    ASSERT_EQ(rs2->rowCount(), 3);
+    auto v2 = rs2->getRows(3);
+
+    EXPECT_EQ(v2[0].getInt(0), 1);
+    EXPECT_EQ(v2[1].getInt(0), 2);
+    EXPECT_EQ(v2[2].getInt(0), 6);
+
+    auto rs3 = context.parallelize(
+            {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(2, 3);
+
+    ASSERT_EQ(rs3->rowCount(), 5);
+    auto v3 = rs3->getRows(5);
+
+    EXPECT_EQ(v3[0].getString(0), "hello");
+    EXPECT_EQ(v3[1].getString(0), "world");
+    EXPECT_EQ(v3[2].getString(0), "!");
+    EXPECT_EQ(v3[3].getString(0), "! :)");
+    EXPECT_EQ(v3[4].getString(0), "!");
+}
\ No newline at end of file

From 07b87fdc4eaf35f320933d1b7cc1b43c536fd946 Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Thu, 24 Feb 2022 23:29:11 -0500
Subject: [PATCH 07/56] add bottom limit to transform stage (wip)

---
 tuplex/core/include/logical/TakeOperator.h | 2 +-
 tuplex/core/src/physical/PhysicalPlan.cc   | 2 ++
 tuplex/core/src/physical/TransformStage.cc | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h
index 20c035a74..b5dd5db6e 100644
--- a/tuplex/core/include/logical/TakeOperator.h
+++ b/tuplex/core/include/logical/TakeOperator.h
@@ -40,7 +40,7 @@ namespace tuplex {
 
         int64_t limit() { return _limitTop; }
 
-        bool limitBottom() { return _limitBottom; }
+        int64_t limitBottom() { return _limitBottom; }
 
         std::vector<Row> getSample(const size_t num) const override;
 
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 17a4c7c0e..3985fe1ab 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -383,6 +383,8 @@ namespace tuplex {
         if(outputNode->type() == LogicalOperatorType::TAKE) {
             auto top = static_cast<TakeOperator*>(outputNode);
             builder.setOutputLimit(top->limit());
+            // TODO: work here
+            ...
         }
 
         // @TODO: add slowPip builder to this process...
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index b61f9cbe2..6eb3f2e1f 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -142,6 +142,8 @@ namespace tuplex {
         size_t numOutputRows = 0;
         for (auto partition : partitions) {
             numOutputRows += partition->getNumRows();
+            // TODO(march): work here
+            ...
             if (numOutputRows >= outputLimit()) {
                 // clip last partition & leave loop
                 auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());

From b2beb88f0ef2414a00e81ef22783791849ef27bf Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Fri, 4 Mar 2022 11:02:22 -0500
Subject: [PATCH 08/56] more physical stage update (wip)

---
 tuplex/core/include/Partition.h               | 41 ++++++---
 tuplex/core/include/physical/StageBuilder.h   | 10 ++-
 tuplex/core/include/physical/TransformStage.h | 10 ++-
 tuplex/core/include/physical/TransformTask.h  |  4 +
 tuplex/core/src/ee/local/LocalBackend.cc      | 17 ++--
 tuplex/core/src/physical/PhysicalPlan.cc      |  4 +-
 tuplex/core/src/physical/ResultSet.cc         |  3 +-
 tuplex/core/src/physical/StageBuilder.cc      |  5 +-
 tuplex/core/src/physical/TransformStage.cc    | 85 ++++++++++++++++---
 tuplex/core/src/physical/TransformTask.cc     |  2 +
 10 files changed, 138 insertions(+), 43 deletions(-)

diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h
index 5a66023fd..2eba22764 100644
--- a/tuplex/core/include/Partition.h
+++ b/tuplex/core/include/Partition.h
@@ -69,6 +69,7 @@ namespace tuplex {
         void loadFromFile(const URI& uri);
 
         int64_t _numRows;
+        int64_t _numSkip; // number of rows to skip, currently only used at the output (Result set)
         uint64_t _bytesWritten;
 
         Schema _schema; //! Schema of the partition. May be optimized away later.
@@ -110,6 +111,24 @@ namespace tuplex {
             setNumRows(0);
         }
 
+        explicit Partition(Partition* part) :
+                _owner(part->_owner),
+                _arena(part->_arena),
+                _size(part->_size),
+                _uuid(part->_uuid),
+                _active(false),
+                _immortal(false),
+                _locked(false),
+                _numRows(part->_numRows),
+                _bytesWritten(part->_bytesWritten),
+                _schema(part->_schema),
+                _dataSetID(part->_dataSetID),
+                _contextID(part->_contextID),
+                _swappedToFile(part->_swappedToFile) {
+
+            // TODO(march): to actually allocate memory here?
+        }
+
         ~Partition() {
             assert(!_locked);
         }
@@ -157,7 +176,7 @@ namespace tuplex {
          * return how much capacity is left, i.e. how many bytes can be actually written
         * @return
         */
-        size_t capacity() { return _size - sizeof(int64_t); }
+        size_t capacity() const { return _size - sizeof(int64_t); }
 
        uniqueid_t uuid() const { return _uuid; }
@@ -248,6 +267,19 @@ namespace tuplex {
             _mutex.unlock();
         }
 
+        size_t getNumSkip() {
+            size_t res = 0;
+            _mutex.lock();
+            res = _numSkip;
+            _mutex.unlock();
+            return res;
+        }
+
+        void setNumSkip(const size_t numSkip) {
+            _mutex.lock();
+            _numSkip = numSkip;
+            _mutex.unlock();
+        }
 
        int64_t getDataSetID() const { return _dataSetID; }
 
diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h
index 63b94bd57..e678ead3d 100644
--- a/tuplex/core/include/physical/StageBuilder.h
+++ b/tuplex/core/include/physical/StageBuilder.h
@@ -76,8 +76,9 @@ namespace tuplex {
             void addFileInput(FileInputOperator* csvop);
             void addFileOutput(FileOutputOperator* fop);
 
-            inline void setOutputLimit(size_t limit) {
-                _outputLimit = limit;
+            inline void setOutputLimit(size_t topLimit, size_t bottomLimit) {
+                _outputTopLimit = topLimit;
+                _outputBottomLimit = bottomLimit;
             }
 
             TransformStage* build(PhysicalPlan* plan, IBackend* backend);
@@ -134,7 +135,8 @@ namespace tuplex {
             FileFormat _outputFileFormat;
             int64_t _outputNodeID;
             int64_t _inputNodeID;
-            size_t _outputLimit;
+            size_t _outputTopLimit;
+            size_t _outputBottomLimit;
             LogicalOperator* _inputNode;
             std::vector _columnsToRead;
@@ -157,7 +159,7 @@ namespace tuplex {
             int64_t outputDataSetID() const;
 
             inline bool hasOutputLimit() const {
-                return _outputLimit < std::numeric_limits<size_t>::max();
+                return _outputTopLimit < std::numeric_limits<size_t>::max() || _outputBottomLimit > 0;
             }
 
             inline char csvOutputDelimiter() const {
diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h
index 22d7f5fb4..e63eaec31 100644
--- a/tuplex/core/include/physical/TransformStage.h
+++ b/tuplex/core/include/physical/TransformStage.h
@@ -111,14 +111,15 @@ namespace tuplex {
          * @param outputLimit
         */
         void setOutputLimit(size_t outputLimit) {
-            _outputLimit = outputLimit;
+            _outputTopLimit = outputLimit;
 
             // @TODO: move this logic to physical plan!
             // pushdown limit
             //pushDownOutputLimit();
         }
 
-        size_t outputLimit() const { return _outputLimit; }
+        size_t outputTopLimit() const { return _outputTopLimit; }
+        size_t outputBottomLimit() const { return _outputBottomLimit; }
         size_t inputLimit() const { return _inputLimit; }
 
         /*!
@@ -442,7 +443,8 @@ namespace tuplex {
         std::vector<Partition*> _inputPartitions; //! memory input partitions for this task.
         size_t _inputLimit; //! limit number of input rows (inf per default)
-        size_t _outputLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default)
+        size_t _outputTopLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default)
+        size_t _outputBottomLimit; //! output limit, set e.g. by take, to_csv etc. (0 per default)
 
         std::shared_ptr<ResultSet> _rs; //! result set
 
@@ -479,7 +481,7 @@ namespace tuplex {
         python::Type _hashOutputBucketType;
 
         bool hasOutputLimit() const {
-            return _outputLimit < std::numeric_limits<size_t>::max();
+            return _outputTopLimit < std::numeric_limits<size_t>::max();
         }
     };
 }
diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h
index 2868ba668..c3b9dbeb4 100644
--- a/tuplex/core/include/physical/TransformTask.h
+++ b/tuplex/core/include/physical/TransformTask.h
@@ -183,6 +183,7 @@ namespace tuplex {
         HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually!
 
         void setOutputLimit(size_t limit) { _outLimit = limit; resetOutputLimitCounter(); }
+        void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); }
         void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; }
 
         void execute() override;
@@ -250,6 +251,8 @@ namespace tuplex {
         size_t output_rows_written() const { return _numOutputRowsWritten; }
         size_t output_limit() const { return _outLimit; }
 
+        size_t output_bottom_limit() const { return _outBottomLimit; }
+
     private:
         void resetSinks();
         void resetSources();
@@ -277,6 +280,7 @@ namespace tuplex {
         std::unordered_map<std::string, std::string> _outOptions;
 
         size_t _outLimit; // limits how many rows to write at max
+        size_t _outBottomLimit; // limits how many last rows to write at max
         size_t _outSkipRows; // how many rows at start to skip
 
         // memory source variables
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc
index bed96ec5a..47da4dc23 100644
--- a/tuplex/core/src/ee/local/LocalBackend.cc
+++ b/tuplex/core/src/ee/local/LocalBackend.cc
@@ -486,6 +486,7 @@ namespace tuplex {
 
         // check what type of input the pipeline has (memory or files)
         if(tstage->fileInputMode()) {
+            // TODO(march): deal with file input
             // files
             // input is multiple files, use split file strategy here.
             // and issue tasks to executor workqueue!
@@ -550,7 +551,7 @@ namespace tuplex {
 
                     task->sinkExceptionsToMemory(inputSchema);
                     task->setStageID(tstage->getID());
-                    task->setOutputLimit(tstage->outputLimit());
+                    task->setOutputLimit(tstage->outputTopLimit());
                     // add to tasks
                     tasks.emplace_back(std::move(task));
                 } else {
@@ -584,7 +585,7 @@ namespace tuplex {
                     }
                     task->sinkExceptionsToMemory(inputSchema);
                     task->setStageID(tstage->getID());
-                    task->setOutputLimit(tstage->outputLimit());
+                    task->setOutputLimit(tstage->outputTopLimit());
                     // add to tasks
                     tasks.emplace_back(std::move(task));
                     num_parts++;
@@ -621,7 +622,7 @@ namespace tuplex {
                 }
                 task->sinkExceptionsToMemory(inputSchema);
                 task->setStageID(tstage->getID());
-                task->setOutputLimit(tstage->outputLimit());
+                task->setOutputLimit(tstage->outputTopLimit());
 
                 // add to tasks
                 tasks.emplace_back(std::move(task));
@@ -683,7 +684,11 @@ namespace tuplex {
                 task->setInputExceptions(tstage->inputExceptions());
                 task->sinkExceptionsToMemory(inputSchema);
                 task->setStageID(tstage->getID());
-                task->setOutputLimit(tstage->outputLimit());
+                task->setOutputLimit(tstage->outputTopLimit());
+                if (tstage->outputBottomLimit()) {
+                    // TODO(march): work here
+                    task->setOutputBottomLimit(tstage->outputBottomLimit());
+                }
 
                 tasks.emplace_back(std::move(task));
                 numInputRows += partition->getNumRows();
@@ -837,6 +842,7 @@ namespace tuplex {
     }
 
     void LocalBackend::executeTransformStage(tuplex::TransformStage *tstage) {
+        // TODO(march): work here
 
         Timer stageTimer;
         Timer timer; // for detailed measurements.
@@ -1529,6 +1535,7 @@ namespace tuplex {
 #endif
 
         // add all tasks to queue
+        // TODO(march): question here
         for(auto& task : tasks)
            wq.addTask(task);
        // clear
        tasks.clear();
@@ -1955,7 +1962,7 @@ namespace tuplex {
         // now simply go over the partitions and write the full buffers out
 
         // check all the params from TrafoStage
-        size_t limit = tstage->outputLimit();
+        size_t limit = tstage->outputTopLimit();
         size_t splitSize = tstage->splitSize();
         size_t numOutputFiles = tstage->numOutputFiles();
         URI uri = tstage->outputURI();
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 3985fe1ab..9c22837ad 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -382,9 +382,7 @@ namespace tuplex {
         // set limit if output node has a limit (currently only TakeOperator)
         if(outputNode->type() == LogicalOperatorType::TAKE) {
             auto top = static_cast<TakeOperator*>(outputNode);
-            builder.setOutputLimit(top->limit());
-            // TODO: work here
-            ...
+            builder.setOutputLimit(top->limit(), top->limitBottom());
         }
 
         // @TODO: add slowPip builder to this process...
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc
index 0f7bf7319..5e15867f7 100644
--- a/tuplex/core/src/physical/ResultSet.cc
+++ b/tuplex/core/src/physical/ResultSet.cc
@@ -98,7 +98,7 @@ namespace tuplex {
         Partition *first = _partitions.front();
         assert(_schema == first->schema());
 
-        auto numRows = first->getNumRows();
+        auto numRows = first->getNumRows() - first->getNumSkip();
         _rowsRetrieved += numRows;
 
         _partitions.pop_front();
@@ -183,6 +183,7 @@ namespace tuplex {
     }
 
     Row ResultSet::getNextRow() {
+        // TODO(march): logic in skip row count here
         // merge rows from objects
         if(!_pyobjects.empty()) {
             auto row_number = std::get<0>(_pyobjects.front());
diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc
index 0bf509ed1..bc814182b 100644
--- a/tuplex/core/src/physical/StageBuilder.cc
+++ b/tuplex/core/src/physical/StageBuilder.cc
@@ -50,7 +50,7 @@ namespace tuplex {
             : _stageNumber(stage_number), _isRootStage(rootStage), _allowUndefinedBehavior(allowUndefinedBehavior),
               _generateParser(generateParser), _normalCaseThreshold(normalCaseThreshold), _sharedObjectPropagation(sharedObjectPropagation),
               _nullValueOptimization(nullValueOptimization), _updateInputExceptions(updateInputExceptions),
-              _inputNode(nullptr), _outputLimit(std::numeric_limits<size_t>::max()) {
+              _inputNode(nullptr), _outputTopLimit(std::numeric_limits<size_t>::max()), _outputBottomLimit(0) {
     }
 
     void StageBuilder::generatePythonCode() {
@@ -1426,7 +1426,8 @@ namespace tuplex {
         // no limit operator yet...
 
         // get limit
-        stage->_outputLimit = _outputLimit;
+        stage->_outputTopLimit = _outputTopLimit;
+        stage->_outputBottomLimit = _outputBottomLimit;
 
         // copy input/output configurations
         stage->_fileInputParameters = _fileInputParameters;
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index 6eb3f2e1f..ff54c2db3 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -48,7 +48,8 @@ namespace tuplex {
                                    int64_t number,
                                    bool allowUndefinedBehavior) : PhysicalStage::PhysicalStage(plan, backend, number),
                                                                   _inputLimit(std::numeric_limits<size_t>::max()),
-                                                                  _outputLimit(std::numeric_limits<size_t>::max()),
+                                                                  _outputTopLimit(std::numeric_limits<size_t>::max()),
+                                                                  _outputBottomLimit(0),
                                                                   _aggMode(AggregateType::AGG_NONE) {
 
         // TODO: is this code out of date? + is allowUndefinedBehavior needed here?
@@ -129,7 +130,7 @@ namespace tuplex {
         if (partitions.empty() && interpreterRows.empty() && generalCase.empty())
             _rs = emptyResultSet();
         else {
-            std::vector<Partition*> limitedPartitions;
+            std::vector<Partition*> limitedPartitions, limitedTailPartitions;
             auto schema = Schema::UNKNOWN;
 
             if(!partitions.empty()) {
@@ -138,31 +139,91 @@ namespace tuplex {
                 assert(schema == partition->schema());
             }
 
-            // check output limit, adjust partitions if necessary
-            size_t numOutputRows = 0;
-            for (auto partition : partitions) {
-                numOutputRows += partition->getNumRows();
-                // TODO(march): work here
-                ...
-                if (numOutputRows >= outputLimit()) {
-                    // clip last partition & leave loop
-                    auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
-                    assert(clipped <= partition->getNumRows());
-                    partition->setNumRows(clipped);
-                    if (clipped > 0)
-                        limitedPartitions.push_back(partition);
-                    break;
-                } else {
-                    // put full partition to output set
-                    limitedPartitions.push_back(partition);
-                }
-            }
+            // check top output limit, adjust partitions if necessary
+            size_t numTopOutputRows = 0;
+            Partition* lastTopPart = nullptr;
+            size_t clippedTop = 0;
+            for (auto partition : partitions) {
+                numTopOutputRows += partition->getNumRows();
+                lastTopPart = partition;
+                if (numTopOutputRows >= outputTopLimit()) {
+                    // clip last partition & leave loop
+                    clippedTop = outputTopLimit() - (numTopOutputRows - partition->getNumRows());
+                    assert(clippedTop <= partition->getNumRows());
+                    break;
+                } else if (partition == partitions.back()) {
+                    // last partition, mark full row, but don't put to output set yet to avoid double put
+                    clippedTop = partition->getNumRows();
+                    break;
+                } else {
+                    // put full partition to output set
+                    limitedPartitions.push_back(partition);
+                }
+            }
+
+            // check the bottom output limit, adjust partitions if necessary
+            size_t numBottomOutputRows = 0;
+            size_t clippedBottom = 0;
+            for (auto it = partitions.rbegin(); it != partitions.rend(); it++) {
+                auto partition = *it;
+                numBottomOutputRows += partition->getNumRows();
+
+                if (partition == lastTopPart) {
+                    // the bottom and the top partitions are overlapping
+                    clippedBottom = outputBottomLimit() - (numBottomOutputRows - partition->getNumRows());
+                    if (clippedTop + clippedBottom >= partition->getNumRows()) {
+                        // if top and bottom range intersect, use full partitions
+                        clippedTop = partition->getNumRows();
+                        clippedBottom = 0;
+                    }
+                    break;
+                } else if (numBottomOutputRows >= outputBottomLimit()) {
+                    // clip last partition & leave loop
+                    auto clipped = outputBottomLimit() - (numBottomOutputRows - partition->getNumRows());
+                    assert(clipped <= partition->getNumRows());
+                    partition->setNumSkip(partition->getNumRows() - clipped);
+                    partition->setNumRows(clipped);
+                    if (clipped > 0)
+                        limitedTailPartitions.push_back(partition);
+                    break;
+                } else {
+                    // put full partition to output set
+                    limitedTailPartitions.push_back(partition);
+                }
+            }
+
+            // push the middle partition
+            if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) {
+                assert(clippedTop + clippedBottom <= lastTopPart->getNumRows());
+
+                // TODO(march): to work on this (split into two partitions)
+                // split into two partitions with both top and bottom are in the same partition
+                Partition* lastBottomPart = nullptr;
+                if (clippedBottom != 0) {
+                    lastBottomPart = new Partition(lastTopPart);
+                    lastBottomPart->setNumSkip(lastBottomPart->getNumRows() - clippedBottom);
+                    lastBottomPart->setNumRows(clippedBottom);
+                }
+
+                lastTopPart->setNumRows(clippedTop);
+
+                limitedPartitions.push_back(lastTopPart);
+
+                if (lastBottomPart != nullptr) {
+                    limitedPartitions.push_back(lastBottomPart);
+                }
+            }
+
+            // merge the head and tail partitions
+            std::reverse(limitedTailPartitions.begin(), limitedTailPartitions.end());
+            limitedPartitions.insert(limitedPartitions.end(), limitedTailPartitions.begin(), limitedTailPartitions.end());
         }
 
         // put ALL partitions to result set
+        // TODO(march): handle overlapping case
         _rs = std::make_shared<ResultSet>(schema, limitedPartitions, generalCase, partitionToExceptionsMap, interpreterRows,
-                                          outputLimit());
+                                          outputTopLimit() + outputBottomLimit());
     }
 
diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc
index c560c4af4..d05e7ce50 100644
--- a/tuplex/core/src/physical/TransformTask.cc
+++ b/tuplex/core/src/physical/TransformTask.cc
@@ -514,6 +514,7 @@ namespace tuplex {
         _outFile.reset(nullptr);
         _outPrefix.reset();
         _outLimit = std::numeric_limits<size_t>::max(); // write all rows
+        _outBottomLimit = 0;
         _outSkipRows = 0; // skip no rows
 
         // reset memory sink
@@ -619,6 +620,7 @@ namespace tuplex {
 
         auto functor = reinterpret_cast(_functor);
 
+        // TODO(march): question here?
         // go over all input partitions.
         for(const auto &inputPartition : _inputPartitions) {
             // lock ptr, extract number of rows ==> store them

From 89cee2ee24d6f45f6d92ef92187e9eb0ee733846 Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Fri, 4 Mar 2022 11:02:22 -0500
Subject: [PATCH 09/56] Quick push

---
 tuplex/core/include/Executor.h                | 21 ++++-
 tuplex/core/include/Partition.h               | 18 ----
 tuplex/core/include/physical/ResultSet.h      |  2 +
 tuplex/core/include/physical/TransformTask.h  |  6 +-
 tuplex/core/src/Executor.cc                   | 97 +++++++++++---------
 tuplex/core/src/ee/local/LocalBackend.cc      | 38 ++++----
 tuplex/core/src/physical/TransformStage.cc    |  1 +
 tuplex/core/src/physical/TransformTask.cc     | 17 ++--
 tuplex/utils/include/mt/ITask.h               | 26 ++----
 9 files changed, 112 insertions(+), 114 deletions(-)

diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h
index 0bca412be..3631f7e7d 100644
--- a/tuplex/core/include/Executor.h
+++ b/tuplex/core/include/Executor.h
@@ -44,12 +44,19 @@ namespace tuplex {
     */
    class WorkQueue {
    private:
-        std::atomic_bool _done; // protects against data races
+        std::atomic_bool _done{}; // protects against data races
        ExecutorTaskQueueType _queue;
        std::mutex _completedTasksMutex;
        std::vector<IExecutorTask*> _completedTasks;
-        std::atomic_int _numPendingTasks;
-        std::atomic_int _numCompletedTasks;
+        std::atomic_int _numPendingTasks{};
+        std::atomic_int _numCompletedTasks{};
+
+        // mapping from order number -> row count if the task is finished
+        std::mutex _rowsDoneMutex;
+        std::map<size_t, size_t> _rowsDone;
+
+        std::atomic_int _frontRowsLimit{};
+        std::atomic_int _bottomRowsLimit{};
 
    public:
        WorkQueue();
@@ -74,6 +81,14 @@ namespace tuplex {
 
        size_t numCompletedTasks() const { return _numCompletedTasks; }
 
+        size_t frontRowsLimit() const {
+            return _frontRowsLimit;
+        };
+
+        size_t bottomRowsLimit() const {
+            return _bottomRowsLimit;
+        };
+
        /*!
         * stop working on this queue & dump all tasks
         */
diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h
index 2eba22764..24b79cc8f 100644
--- a/tuplex/core/include/Partition.h
+++ b/tuplex/core/include/Partition.h
@@ -111,24 +111,6 @@ namespace tuplex {
            setNumRows(0);
        }
 
-        explicit Partition(Partition* part) :
-                _owner(part->_owner),
-                _arena(part->_arena),
-                _size(part->_size),
-                _uuid(part->_uuid),
-                _active(false),
-                _immortal(false),
-                _locked(false),
-                _numRows(part->_numRows),
-                _bytesWritten(part->_bytesWritten),
-                _schema(part->_schema),
-                _dataSetID(part->_dataSetID),
-                _contextID(part->_contextID),
-                _swappedToFile(part->_swappedToFile) {
-
-            // TODO(march): to actually allocate memory here?
- } - ~Partition() { assert(!_locked); } diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h index e94b8f1ae..5e69fef3a 100644 --- a/tuplex/core/include/physical/ResultSet.h +++ b/tuplex/core/include/physical/ResultSet.h @@ -36,6 +36,8 @@ namespace tuplex { size_t _rowsRetrieved; size_t _totalRowCounter; // used for merging in rows! size_t _maxRows; + size_t _maxRowsTop; + size_t _maxRowsBottom; Schema _schema; void removeFirstPartition(); diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index c3b9dbeb4..d065e86d3 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -182,7 +182,7 @@ namespace tuplex { void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! - void setOutputLimit(size_t limit) { _outLimit = limit; resetOutputLimitCounter(); } + void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; } void execute() override; @@ -250,7 +250,7 @@ namespace tuplex { double wallTime() const override { return _wallTime; } size_t output_rows_written() const { return _numOutputRowsWritten; } - size_t output_limit() const { return _outLimit; } + size_t output_top_limit() const { return _outTopLimit; } size_t output_bottom_limit() const { return _outBottomLimit; } private: @@ -279,7 +279,7 @@ namespace tuplex { Buffer _outPrefix; std::unordered_map _outOptions; - size_t _outLimit; // limits how many rows to write at max + size_t _outTopLimit; // limits how many rows to write at max size_t _outBottomLimit; // limits how many last rows to write at max size_t _outSkipRows; // how many rows at start to skip diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 845b78e6a..1cc818010 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -32,8 +32,12 @@ namespace tuplex { std::vector WorkQueue::popCompletedTasks() { TRACE_LOCK("workQueue"); - std::lock_guard lock(_completedTasksMutex); + _taskDoneMutex.lock(); + _taskDone.clear(); + _taskDoneMutex.unlock(); + + std::lock_guard lock(_completedTasksMutex); // move leads to circular dependency in gcc and thus a bug on travis-ci. Therefore, just // use the below hack to fool the compiler into actually copying the vectors // // move to reset completed tasks and return array @@ -78,59 +82,66 @@ namespace tuplex { bool WorkQueue::workTask(Executor& executor, bool nonBlocking) { IExecutorTask *task = nullptr; - if(nonBlocking) { - // @Todo: This should be put into a function "work" on the workQueue... - // dequeue from general working queue - if(_queue.try_dequeue(task)) { - if(!task) - return false; - task->setOwner(&executor); - task->setThreadNumber(executor.threadNumber()); // redundant? + // dequeue from general working queue + // Note: is this TODO: outdated? + // @Todo: This should be put into a function "work" on the workQueue... 
+ if (nonBlocking) { + if(!_queue.try_dequeue(task)) { + return false; + } + } else { + _queue.wait_dequeue(task); + } - //executor.logger().info("started task..."); - // process task - task->execute(); - // save which thread executed this task - task->setID(std::this_thread::get_id()); + if(!task) { + return false; + } + // if reach the top limit already, then don't compute the rest + size_t numTopCompleted; + TRACE_LOCK("rowsDone"); + _rowsDoneMutex.lock(); + size_t frontRowsDone = 0; + for (size_t i = 0; _rowsDone.count(i) != 0; i++) { + frontRowsDone += _rowsDone[i]; + if (frontRowsDone >= _queue.frontRowsLimit()) { + // skip execution _numPendingTasks.fetch_add(-1, std::memory_order_release); - - // add task to done list - TRACE_LOCK("completedTasks"); - _completedTasksMutex.lock(); - _completedTasks.push_back(std::move(task)); - _completedTasksMutex.unlock(); - _numCompletedTasks.fetch_add(1, std::memory_order_release); - TRACE_UNLOCK("completedTasks"); + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); return true; } - } else { - _queue.wait_dequeue(task); + } + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); - if(!task) - return false; + task->setOwner(&executor); + task->setThreadNumber(executor.threadNumber()); // redundant? - task->setOwner(&executor); - task->setThreadNumber(executor.threadNumber()); // redundant? + // executor.logger().info("started task..."); + // process task + task->execute(); + // save which thread executed this task + task->setID(std::this_thread::get_id()); - // process task - task->execute(); - // save which thread executed this task - task->setID(std::this_thread::get_id()); + _numPendingTasks.fetch_add(-1, std::memory_order_release); - // add task to done list - TRACE_LOCK("completedTasks"); - _completedTasksMutex.lock(); - _completedTasks.push_back(std::move(task)); - _completedTasksMutex.unlock(); - _numCompletedTasks.fetch_add(1, std::memory_order_release); - TRACE_UNLOCK("completedTasks"); + // add task to done list + TRACE_LOCK("completedTasks"); + _completedTasksMutex.lock(); + _completedTasks.push_back(std::move(task)); + _completedTasksMutex.unlock(); + _numCompletedTasks.fetch_add(1, std::memory_order_release); + TRACE_UNLOCK("completedTasks"); - _numPendingTasks.fetch_add(-1, std::memory_order_release); - return true; - } - return false; + TRACE_LOCK("rowsDone"); + _rowsDoneMutex.lock(); + _rowsDone[task->getOrder()] += task->getNumOutputRows(); + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); + + return true; } void WorkQueue::workUntilAllTasksFinished(tuplex::Executor &executor, bool flushPeriodicallyToPython) { diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 47da4dc23..5a1311436 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -551,7 +551,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputTopLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); } else { @@ -585,7 +585,7 @@ namespace tuplex { } task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputTopLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); num_parts++; @@ -622,7 +622,7 @@ namespace tuplex { } task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - 
task->setOutputLimit(tstage->outputTopLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); @@ -684,10 +684,10 @@ namespace tuplex { task->setInputExceptions(tstage->inputExceptions()); task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputTopLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); if (tstage->outputBottomLimit()) { - // TODO(march): work here - task->setOutputBottomLimit(tstage->outputBottomLimit()); + // TODO(march): work here (task output limit generation) } tasks.emplace_back(std::move(task)); numInputRows += partition->getNumRows(); @@ -842,8 +842,6 @@ namespace tuplex { } void LocalBackend::executeTransformStage(tuplex::TransformStage *tstage) { - // TODO(march): work here - Timer stageTimer; Timer timer; // for detailed measurements. @@ -943,6 +941,7 @@ namespace tuplex { } } + // TODO(march): work here (transform stage) auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); auto completedTasks = performTasks(tasks); @@ -1519,24 +1518,21 @@ namespace tuplex { WorkQueue& wq = LocalEngine::instance().getQueue(); wq.clear(); - // check if ord is set, if not issue warning & add - bool orderlessTaskFound = false; + // assign the order for all tasks for(int i = 0; i < tasks.size(); ++i) { - if(tasks[i]->getOrder().size() == 0) { - tasks[i]->setOrder(i); - orderlessTaskFound = true; - } + tasks[i]->setOrder(i); } -#ifndef NDEBUG - if(orderlessTaskFound) { - logger().debug("task without order found, please fix in code."); + // add all tasks to queue + // TODO(march): add task stage (to do striping) + for(size_t i = 0; i <= tasks.size() - i - 1; i++) { + const size_t revI = tasks.size()- i - 1 + wq.addTask(&tasks[i]); + if (revI > i) { + wq.addTask(&tasks[revI]); + } } -#endif - // add all tasks to queue - // TODO(march): question here - for(auto& task : tasks) wq.addTask(task); // clear tasks.clear(); diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index ff54c2db3..af58866dc 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -221,6 +221,7 @@ namespace tuplex { // put ALL partitions to result set // TODO(march): handle overlapping case + // TODO(march): maybe do top/bottom limit at the level instead? _rs = std::make_shared(schema, limitedPartitions, generalCase, partitionToExceptionsMap, interpreterRows, outputTopLimit() + outputBottomLimit()); diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index d05e7ce50..a65aa7f11 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -19,10 +19,12 @@ namespace tuplex { // atomic var to count output rows! - static std::atomic_int64_t g_totalOutputRows; + static std::atomic_int64_t g_totalTopOutputRows; + static std::atomic_int64_t g_totalBottomOutputRows; void TransformTask::resetOutputLimitCounter() { - g_totalOutputRows = 0; + g_totalTopOutputRows = 0; + g_totalBottomOutputRows = 0; } } @@ -41,7 +43,8 @@ extern "C" { static int64_t limited_w2mCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { // i.e. check here how many output rows, if already limit reached - jump to goto! 
- if(tuplex::g_totalOutputRows >= task->output_limit()) { + // TODO(march): comment this out + if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); } @@ -49,10 +52,10 @@ extern "C" { assert(dynamic_cast(task)); auto rc = task->writeRowToMemory(buf, bufSize); if(0 == rc) - tuplex::g_totalOutputRows++; + tuplex::g_totalTopOutputRows++; // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalOutputRows >= task->output_limit()) { + if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); } return rc; @@ -513,7 +516,7 @@ namespace tuplex { _outputFilePath = URI::INVALID; _outFile.reset(nullptr); _outPrefix.reset(); - _outLimit = std::numeric_limits::max(); // write all rows + _outTopLimit = std::numeric_limits::max(); // write all rows _outBottomLimit = 0; _outSkipRows = 0; // skip no rows @@ -680,7 +683,7 @@ namespace tuplex { // skip rows? limit rows?? - if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outLimit - _outSkipRows)) { + if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outTopLimit - _outSkipRows)) { if(_outFile->write(buf, bufSize) != VirtualFileSystemStatus::VFS_OK) return ecToI32(ExceptionCode::IOERROR); } diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h index 8434896a7..01f7137f1 100644 --- a/tuplex/utils/include/mt/ITask.h +++ b/tuplex/utils/include/mt/ITask.h @@ -29,7 +29,7 @@ namespace tuplex { std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. //! Per default object is constructed that does not represent a thread - std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages + size_t _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages public: ITask() {}; @@ -51,33 +51,21 @@ namespace tuplex { _id = id; } - void setOrder(size_t order) { _orderNumbers = std::vector{order}; } - -// size_t getOrder(const size_t nth = 0) const { -// return _orderNumbers[nth]; -// } - std::vector getOrder() const { return _orderNumbers; } - - void setOrder(const std::vector& order) { + void setOrder(size_t order) { _orderNumbers = order; } + size_t getOrder() const { + return _orderNumbers; + } + /*! * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones * @param other * @return */ bool compareAscOrder(const ITask& other) const { - // make sure they have the same length - assert(_orderNumbers.size() == other._orderNumbers.size()); - - // this < other? 
- // compare one by one - for(int i = 0; i < other._orderNumbers.size(); ++i) { - if(_orderNumbers[i] >= other._orderNumbers[i]) - return false; - } - return true; + return _orderNumbers[i] < other._orderNumbers[i]; } }; } From 3e1d243c9d1d29b9ea354b0b5c98bd58e59f2d3d Mon Sep 17 00:00:00 2001 From: korlamarch Date: Wed, 9 Mar 2022 13:00:17 -0500 Subject: [PATCH 10/56] Rework LocalBackend and TransformTask to support top and bottom limit --- tuplex/core/include/Executor.h | 6 - tuplex/core/include/Partition.h | 14 -- tuplex/core/include/ee/local/LocalBackend.h | 3 + tuplex/core/include/physical/TransformStage.h | 12 +- tuplex/core/src/Executor.cc | 24 --- tuplex/core/src/ee/local/LocalBackend.cc | 182 ++++++++++++++++-- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 1 - tuplex/core/src/physical/TransformStage.cc | 86 +-------- tuplex/core/src/physical/TransformTask.cc | 54 ++++-- tuplex/utils/include/mt/ITask.h | 85 ++++---- 11 files changed, 270 insertions(+), 199 deletions(-) diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 3631f7e7d..7eaaee244 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -51,12 +51,6 @@ namespace tuplex { std::atomic_int _numPendingTasks{}; std::atomic_int _numCompletedTasks{}; - // mapping from order number -> row count if the task is finished - std::mutex _rowsDoneMutex; - std::map _rowsDone; - - std::atomic_int _frontRowsLimit{}; - std::atomic_int _bottomRowsLimit{}; public: WorkQueue(); diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h index 24b79cc8f..8bf112051 100644 --- a/tuplex/core/include/Partition.h +++ b/tuplex/core/include/Partition.h @@ -69,7 +69,6 @@ namespace tuplex { void loadFromFile(const URI& uri); int64_t _numRows; - int64_t _numSkip; // number of rows to skip, currently only used at the output (Result set) uint64_t _bytesWritten; Schema _schema; //! Schema of the partition. May be optimized away later. @@ -249,19 +248,6 @@ namespace tuplex { _mutex.unlock(); } - size_t getNumSkip() { - size_t res = 0; - _mutex.lock(); - res = num_skip; - _mutex.unlock(); - return res; - } - - void setNumSkip(const size_t numSkip) { - _mutex.lock(); - _numSkip = numSkip; - _mutex.unlock(); - } int64_t getDataSetID() const { return _dataSetID; } diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 77d375aed..0dbfafdc9 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -88,6 +88,9 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); + Partition* newPartitionWithSkipRows(Partition* p_in, int numToSkip, TransformStage* tstage); + // write output (may be already in correct format!) void writeOutput(TransformStage* tstage, std::vector& sortedTasks); diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index e63eaec31..f489f1f6c 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -107,11 +107,13 @@ namespace tuplex { std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } /*! 
- * sets maximum number of rows this pipeline will produce - * @param outputLimit + * sets maximum number of top rows this pipeline will produce + * @param topLimit + * @param bottomLimit */ - void setOutputLimit(size_t outputLimit) { - _outputTopLimit = outputLimit; + inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { + _outputTopLimit = topLimit; + _outputBottomLimit = bottomLimit; // @TODO: move this logic to physical plan! // pushdown limit @@ -481,7 +483,7 @@ namespace tuplex { python::Type _hashOutputBucketType; bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max(); + return _outputTopLimit < std::numeric_limits::max() && _outputBottomLimit != 0; } }; } diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 1cc818010..388199e4d 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -98,24 +98,6 @@ namespace tuplex { return false; } - // if reach the top limit already, then don't compute the rest - size_t numTopCompleted; - TRACE_LOCK("rowsDone"); - _rowsDoneMutex.lock(); - size_t frontRowsDone = 0; - for (size_t i = 0; _rowsDone.count(i) != 0; i++) { - frontRowsDone += _rowsDone[i]; - if (frontRowsDone >= _queue.frontRowsLimit()) { - // skip execution - _numPendingTasks.fetch_add(-1, std::memory_order_release); - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - return true; - } - } - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - task->setOwner(&executor); task->setThreadNumber(executor.threadNumber()); // redundant? @@ -135,12 +117,6 @@ namespace tuplex { _numCompletedTasks.fetch_add(1, std::memory_order_release); TRACE_UNLOCK("completedTasks"); - TRACE_LOCK("rowsDone"); - _rowsDoneMutex.lock(); - _rowsDone[task->getOrder()] += task->getNumOutputRows(); - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - return true; } diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 5a1311436..dbceaa1b9 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -486,7 +486,6 @@ namespace tuplex { // check what type of input the pipeline has (memory or files) if(tstage->fileInputMode()) { - // TODO(march): deal with file input // files // input is multiple files, use split file strategy here. // and issue tasks to executor workqueue! 
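Context for the scheduling change ahead: when both a top and a bottom limit are set, the next hunk interleaves tasks from the two ends of the input so the partitions that can satisfy the limits run first; with a bottom limit only, the task list is simply reversed. A standalone sketch of that striping order (illustrative names, not the patch's exact code):

    // Sketch only: reorder [t0, t1, ..., tn-1] into [t0, tn-1, t1, tn-2, ...].
    #include <cstddef>
    #include <vector>

    std::vector<int> stripe(const std::vector<int>& tasks) {
        std::vector<int> out;
        out.reserve(tasks.size());
        for (size_t i = 0, j = tasks.size(); i < j; ++i, --j) {
            out.push_back(tasks[i]);                    // next task from the front
            if (i + 1 < j) out.push_back(tasks[j - 1]); // next task from the back
        }
        return out;
    }

For five tasks this produces the order 0, 4, 1, 3, 2.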
@@ -686,9 +685,6 @@ namespace tuplex { task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); task->setOutputBottomLimit(tstage->outputBottomLimit()); - if (tstage->outputBottomLimit()) { - // TODO(march): work here (task output limit generation) - } tasks.emplace_back(std::move(task)); numInputRows += partition->getNumRows(); @@ -698,6 +694,31 @@ namespace tuplex { } } + // assign the order for all tasks + for(size_t i = 0; i < tasks.size(); ++i) { + tasks[i]->setOrder(i); + } + + if (tstage->hasOutputLimit()) { + if (tstage->outputTopLimit() > 0 && tstage->outputBottomLimit() > 0) { + // do task striping for output limit on both ends + vector newTasks; + for(size_t i = 0; i < tasks.size() - i; i++) { + const size_t rev_i = tasks.size() - 1 - i; + newTasks.push_back(tasks[i]); + if (i < rev_i) { + newTasks.push_back(tasks[rev_i]); + } + } + assert(tasks.size() == newTasks.size()); + tasks.swap(newTasks); + } else if (tstage->outputBottomLimit() > 0) { + // bottom limit only, just reverse the task order + std::reverse(tasks.begin(), tasks.end()); + } + // if top limit only, do nothing since the order is already good + } + return tasks; } @@ -941,8 +962,8 @@ namespace tuplex { } } - // TODO(march): work here (transform stage) auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); + auto completedTasks = performTasks(tasks); // Note: this doesn't work yet because of the globals. @@ -1175,6 +1196,10 @@ namespace tuplex { rowDelta += taskNonConformingRows.size(); } + if (tstage->hasOutputLimit()) { + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit()); + } + tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); break; } @@ -1518,21 +1543,29 @@ namespace tuplex { WorkQueue& wq = LocalEngine::instance().getQueue(); wq.clear(); - // assign the order for all tasks + // check if ord is set, if not issue warning & add + bool orderlessTaskFound = false; for(int i = 0; i < tasks.size(); ++i) { - tasks[i]->setOrder(i); + if(tasks[i]->getOrder().size() == 0) { + tasks[i]->setOrder(i); + orderlessTaskFound = true; + } } - // add all tasks to queue - // TODO(march): add task stage (to do striping) - for(size_t i = 0; i <= tasks.size() - i - 1; i++) { - const size_t revI = tasks.size()- i - 1 - wq.addTask(&tasks[i]); - if (revI > i) { - wq.addTask(&tasks[revI]); - } +#ifndef NDEBUG + if(orderlessTaskFound) { + logger().debug("task without order found, please fix in code."); + } +#endif + + for (int i = 0; i < tasks.size(); i++) { + // take limit only work with uniform order + assert(task.getOrder(0) == i); } + // add all tasks to queue + for(auto& task : tasks) wq.addTask(task); + // clear tasks.clear(); @@ -2083,4 +2116,123 @@ namespace tuplex { Logger::instance().defaultLogger().info("writing output took " + std::to_string(timer.time()) + "s"); tstage->setFileResult(ecounts); } + + void LocalBackend::trimPartitionsToLimit(std::vector &partitions, + size_t topLimit, + size_t bottomLimit, + TransformStage* tstage) { + std::vector limitedPartitions, limitedTailPartitions; + + // check top output limit, adjust partitions if necessary + size_t numTopOutputRows = 0; + Partition* lastTopPart = nullptr; + size_t clippedTop = 0; + for (auto partition : partitions) { + numTopOutputRows += partition->getNumRows(); + lastTopPart = partition; + if (numTopOutputRows >= topLimit) { + // clip last partition & leave loop + clippedTop = topLimit - (numTopOutputRows - 
partition->getNumRows());
+                assert(clippedTop <= partition->getNumRows());
+                break;
+            } else if (partition == partitions.back()) {
+                // last partition: keep all rows, but don't add it to the output set yet to avoid adding it twice
+                clippedTop = partition->getNumRows();
+                break;
+            } else {
+                // put full partition to output set
+                limitedPartitions.push_back(partition);
+            }
+        }
+
+        // check the bottom output limit, adjust partitions if necessary
+        size_t numBottomOutputRows = 0;
+        size_t clippedBottom = 0;
+        for (auto it = partitions.rbegin(); it != partitions.rend(); it++) {
+            auto partition = *it;
+            numBottomOutputRows += partition->getNumRows();
+
+            if (partition == lastTopPart) {
+                // the bottom and the top partitions are overlapping
+                clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows());
+                if (clippedTop + clippedBottom >= partition->getNumRows()) {
+                    // if top and bottom range intersect, use full partitions
+                    clippedTop = partition->getNumRows();
+                    clippedBottom = 0;
+                }
+                break;
+            } else if (numBottomOutputRows >= bottomLimit) {
+                // clip last partition & leave loop
+                auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows());
+                assert(clipped <= partition->getNumRows());
+                Partition newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage);
+                partition->invalidate();
+                parition = newPart;
+                assert(partition->getNumRows() == clipped);
+                if (clipped > 0)
+                    limitedTailPartitions.push_back(partition);
+                break;
+            } else {
+                // put full partition to output set
+                limitedTailPartitions.push_back(partition);
+            }
+        }
+
+        // push the middle partition
+        if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) {
+            assert(clippedTop + clippedBottom <= lastTopPart->getNumRows());
+
+            // split into two partitions when both the top and the bottom ranges land in the same partition
+            Partition* lastBottomPart = nullptr;
+
+            if (clippedBottom != 0) {
+                lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage);
+            }
+
+            lastTopPart->setNumRows(clippedTop);
+
+            limitedPartitions.push_back(lastTopPart);
+
+            if (lastBottomPart != nullptr) {
+                limitedPartitions.push_back(lastBottomPart);
+            }
+        }
+
+        // merge the head and tail partitions
+        partitions.clear()
+        partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end());
+        partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend());
+    }
+
+    Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, int numToSkip, TransformStage* tstage) {
+        if(!numToSkip)
+            return nullptr;
+
+        auto ptr = p_in->lockRaw();
+        auto num_rows = *((int64_t*) ptr);
+        assert(numToSkip < num_rows);
+
+        Partition *p_out = _driver->allocWritablePartition(num_rows - numToSkip + sizeof(int64_t),
+                                                           tstage->outputSchema(), tstage->outputDataSetID(),
+                                                           tstage->context().id());
+
+        ptr += sizeof(int64_t);
+        size_t numBytesToSkip = 0;
+
+        for(unsigned i = 0; i < numToSkip; ++i) {
+            Rows r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip);
+            ptr += r.serializedLength();
+            numBytesToSkip += r.serializedLength();
+        }
+
+        auto ptr_out = p_out->lockRaw();
+        *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip;
+        ptr_out += sizeof(int64_t);
+        memcpy(ptr_out, ptr, p_in->size() - numBytesToSkip);
+        p_out->unlock();
+
+        p_in->unlock();
+
+        return p_out;
+    }
 } // namespace tuplex
\ No newline at end of file
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc
b/tuplex/core/src/physical/PhysicalPlan.cc
index 9c22837ad..ff67e4add 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -240,7 +240,7 @@ namespace tuplex {
         // user wants to merge exceptions in order.
         bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER();
 
-        // create trafostage via builder pattern
+        // create transform stage via builder pattern
         auto builder = codegen::StageBuilder(_num_stages++,
                                              isRootStage,
                                              _context.getOptions().UNDEFINED_BEHAVIOR_FOR_OPERATORS(),
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc
index 5e15867f7..e31e78cec 100644
--- a/tuplex/core/src/physical/ResultSet.cc
+++ b/tuplex/core/src/physical/ResultSet.cc
@@ -183,7 +183,6 @@ namespace tuplex {
     }
 
     Row ResultSet::getNextRow() {
-        // TODO(march): logic in skip row count here
         // merge rows from objects
         if(!_pyobjects.empty()) {
            auto row_number = std::get<0>(_pyobjects.front());
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index af58866dc..6e216ac5c 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -130,99 +130,23 @@ namespace tuplex {
         if (partitions.empty() && interpreterRows.empty() && generalCase.empty())
             _rs = emptyResultSet();
         else {
-            std::vector limitedPartitions, limitedTailPartitions;
             auto schema = Schema::UNKNOWN;
 
             if(!partitions.empty()) {
+                size_t totalRowsCount = 0;
                 schema = partitions.front()->schema();
                 for (auto partition : partitions) {
                     assert(schema == partition->schema());
+                    totalRowsCount += partition->getNumRows();
                 }
 
-                // check top output limit, adjust partitions if necessary
-                size_t numTopOutputRows = 0;
-                Partition* lastTopPart = nullptr;
-                size_t clippedTop = 0;
-                for (auto partition : partitions) {
-                    numTopOutputRows += partition->getNumRows();
-                    lastTopPart = partition;
-                    if (numTopOutputRows >= outputTopLimit()) {
-                        // clip last partition & leave loop
-                        clippedTop = outputTopLimit() - (numTopOutputRows - partition->getNumRows());
-                        assert(clippedTop <= partition->getNumRows());
-                        break;
-                    } else if (partition == *partitions.end()) {
-                        // last partition, mark full row, but don't put to output set yet to avoid double put
-                        clippedTop = partition->getNumRows();
-                        break;
-                    } else {
-                        // put full
partition to output set - limitedTailPartitions.push_back(partition); - } - } - - // push the middle partition - if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { - assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); - - // TODO(march): to work on this (split into two partitions) - // split into two partitions with both top and bottom are in the same partition - Partition* lastBottomPart = nullptr; - if (clippedBottom != 0) { - lastBottomPart = new Partition(lastTopPart); - lastBottomPart->setNumSkip(lastBottomPart->getNumRows() - clippedBottom); - lastBottomPart->setNumRows(clippedBottom); - } - - lastTopPart->setNumRows(clippedTop); - - limitedPartitions.push_back(lastTopPart); - - if (lastBottomPart != nullptr) { - limitedPartitions.push_back(lastBottomPart); - } + if (hasOutputLimit()) { + assert(totalRowsCount == _outputTopLimit + _outputBottomLimit); } - - // merge the head and tail partitions - std::reverse(limitedTailPartitions.begin(), limitedTailPartitions.end()); - limitedPartitions.insert(limitedPartitions.end(), limitedTailPartitions.begin(), limitedTailPartitions.end()); } // put ALL partitions to result set - // TODO(march): handle overlapping case - // TODO(march): maybe do top/bottom limit at the level instead? - _rs = std::make_shared(schema, limitedPartitions, + _rs = std::make_shared(schema, partitions, generalCase, partitionToExceptionsMap, interpreterRows, outputTopLimit() + outputBottomLimit()); } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index a65aa7f11..49d104bcc 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -22,9 +22,14 @@ namespace tuplex { static std::atomic_int64_t g_totalTopOutputRows; static std::atomic_int64_t g_totalBottomOutputRows; + // mapping from order number -> row count if the task is finished + static std::mutex g_rowsDoneMutex; + static std::map g_rowsDone; + void TransformTask::resetOutputLimitCounter() { g_totalTopOutputRows = 0; g_totalBottomOutputRows = 0; + g_rowsDone.clear(); } } @@ -42,23 +47,9 @@ extern "C" { } static int64_t limited_w2mCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { - // i.e. check here how many output rows, if already limit reached - jump to goto! - // TODO(march): comment this out - if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { - return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); - } - assert(task); assert(dynamic_cast(task)); - auto rc = task->writeRowToMemory(buf, bufSize); - if(0 == rc) - tuplex::g_totalTopOutputRows++; - - // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { - return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); - } - return rc; + return task->writeRowToMemory(buf, bufSize); } static int64_t limited_w2fCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { @@ -623,9 +614,36 @@ namespace tuplex { auto functor = reinterpret_cast(_functor); - // TODO(march): question here? // go over all input partitions. 
for(const auto &inputPartition : _inputPartitions) {
+            size_t numTopCompleted = 0;
+            size_t numBottomCompleted = 0;
+            bool isTopLimitReached = false;
+            bool isBottomLimitReached = false;
+
+            tuplex::g_rowsDoneMutex.lock();
+            for (size_t i = 0; tuplex::g_rowsDone.count(i) != 0; i++) {
+                numTopCompleted += tuplex::g_rowsDone[i];
+                if (numTopCompleted >= _outTopLimit) {
+                    isTopLimitReached = true;
+                    break;
+                }
+            }
+            // TODO: what is the max task number here
+            for (size_t i = 100; tuplex::g_rowsDone.count(i) != 0; i--) {
+                numBottomCompleted += tuplex::g_rowsDone[i];
+                if (numBottomCompleted >= _outBottomLimit) {
+                    isBottomLimitReached = true;
+                    break;
+                }
+            }
+            tuplex::g_rowsDoneMutex.unlock();
+
+            if (isTopLimitReached && isBottomLimitReached) {
+                // skip the execution, enough is done
+                break;
+            }
+
             // lock ptr, extract number of rows ==> store them
             // lock raw & call functor!
             int64_t inSize = inputPartition->size();
@@ -647,6 +665,10 @@
             // delete partition if desired...
             if(_invalidateSourceAfterUse)
                 inputPartition->invalidate();
+
+            tuplex::g_rowsDoneMutex.lock();
+            tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows();
+            tuplex::g_rowsDoneMutex.unlock();
         }
 
 #ifndef NDEBUG
diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h
index 01f7137f1..a5ca4058f 100644
--- a/tuplex/utils/include/mt/ITask.h
+++ b/tuplex/utils/include/mt/ITask.h
@@ -21,52 +21,65 @@
 namespace tuplex {
 
+/*!
+ * interface for defining tasks that can be run via a threadpool
+ */
+class ITask {
+private:
+    std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread.
+//! Per default object is constructed that does not represent a thread
+
+    std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages
+
+public:
+    ITask() {};
+    ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {}
+    virtual ~ITask() = default;
+    ITask(ITask&& other) = default;
+    ITask& operator = (ITask&& other) = default;
+
+    /*!
+ * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones + * @param other + * @return + */ + bool compareAscOrder(const ITask& other) const { + // make sure they have the same length + assert(_orderNumbers.size() == other._orderNumbers.size()); - /*! - * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones - * @param other - * @return - */ - bool compareAscOrder(const ITask& other) const { - return _orderNumbers[i] < other._orderNumbers[i]; + // this < other? + // compare one by one + for(int i = 0; i < other._orderNumbers.size(); ++i) { + if(_orderNumbers[i] >= other._orderNumbers[i]) + return false; } - }; + return true; + } +}; } #endif //TUPLEX_ITASK_H \ No newline at end of file From 3bf283fb003dfa54bc82396bc750a65464969c55 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:16:11 -0400 Subject: [PATCH 11/56] Address Review Comments --- tuplex/core/include/DataSet.h | 5 +- tuplex/core/include/EmptyDataset.h | 4 +- tuplex/core/include/ErrorDataSet.h | 4 +- tuplex/core/include/Executor.h | 10 +- tuplex/core/include/ee/local/LocalBackend.h | 2 +- tuplex/core/include/logical/TakeOperator.h | 12 +-- tuplex/core/include/physical/ResultSet.h | 2 - tuplex/core/include/physical/StageBuilder.h | 5 +- tuplex/core/include/physical/TransformStage.h | 8 +- tuplex/core/src/DataSet.cc | 12 +-- tuplex/core/src/EmptyDataset.cc | 6 +- tuplex/core/src/ErrorDataSet.cc | 6 +- tuplex/core/src/Executor.cc | 4 - tuplex/core/src/ee/local/LocalBackend.cc | 17 ++-- tuplex/core/src/logical/TakeOperator.cc | 6 +- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 2 +- tuplex/core/src/physical/StageBuilder.cc | 2 +- tuplex/python/src/PythonDataSet.cc | 2 +- tuplex/python/tuplex/dataset.py | 15 +-- tuplex/utils/include/mt/ITask.h | 94 +++++++++---------- 21 files changed, 102 insertions(+), 118 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 65a766a87..f6bb97f2c 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -263,13 +263,12 @@ namespace tuplex { // these are actions that cause execution virtual std::shared_ptr collect(std::ostream &os = std::cout); - virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream &os = std::cout); + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); virtual std::vector collectAsVector(std::ostream &os = std::cout); - virtual std::vector takeAsVector(int64_t numElements, std::ostream &os = std::cout); + virtual std::vector takeAsVector(size_t numElements, std::ostream &os = std::cout); - /*! * saves dataset to file. There are multiple options to control the behavior * ==> 1.) files can be split across multiple ones. 
Specify number of files to split rows to diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 0f8a1f52c..6fc3219a4 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -70,13 +70,13 @@ namespace tuplex { virtual std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; virtual std::vector collectAsVector(std::ostream& os) override; // take / collect will print out the error only - virtual std::vector takeAsVector(int64_t numElements, std::ostream& os) override; + virtual std::vector takeAsVector(size_t numElements, std::ostream& os) override; DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized) override { return *this; diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h index 34fc60685..cf283ebd1 100644 --- a/tuplex/core/include/ErrorDataSet.h +++ b/tuplex/core/include/ErrorDataSet.h @@ -90,13 +90,13 @@ namespace tuplex { std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; + std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; std::vector collectAsVector(std::ostream& os) override; // take / collect will print out the error only - std::vector takeAsVector(int64_t numElements, std::ostream& os) override; + std::vector takeAsVector(size_t numElements, std::ostream& os) override; }; } diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 7eaaee244..b6b7edac1 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -44,7 +44,7 @@ namespace tuplex { */ class WorkQueue { private: - std::atomic_bool _done{}; // protects against data races + std::atomic_bool _done; // protects against data races ExecutorTaskQueueType _queue; std::mutex _completedTasksMutex; std::vector _completedTasks; @@ -75,14 +75,6 @@ namespace tuplex { size_t numCompletedTasks() const { return _numCompletedTasks; } - size_t frontRowsLimit() const { - return _frontRowsLimit; - }; - - size_t bottomRowsLimit() const { - return _bottomRowsLimit; - }; - /*! * stop working on this queue & dump all tasks */ diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 0dbfafdc9..d7a5ec25b 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -89,7 +89,7 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); - Partition* newPartitionWithSkipRows(Partition* p_in, int numToSkip, TransformStage* tstage); + Partition* newPartitionWithSkipRows(Partition* p_in, size_t numToSkip, TransformStage* tstage); // write output (may be already in correct format!) 
void writeOutput(TransformStage* tstage, std::vector& sortedTasks);
diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h
index b5dd5db6e..b7c4892dc 100644
--- a/tuplex/core/include/logical/TakeOperator.h
+++ b/tuplex/core/include/logical/TakeOperator.h
@@ -17,16 +17,16 @@ namespace tuplex {
     class TakeOperator : public LogicalOperator {
     private:
-        int64_t _limitTop;
-        int64_t _limitBottom;
+        size_t _topLimit;
+        size_t _bottomLimit;
     public:
         LogicalOperator *clone() override;
     public:
-        TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom);
+        TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit);
 
         std::string name() override {
-            if(_limitTop < 0 || std::numeric_limits::max() == _limitTop)
+            if(_topLimit == 0 && _bottomLimit == 0)
                 return "collect";
             return "take";
         }
@@ -38,9 +38,9 @@ namespace tuplex {
 
         bool good() const override;
 
-        int64_t limit() { return _limitTop; }
+        size_t topLimit() const { return _topLimit; }
 
-        int64_t limitBottom() { return _limitBottom; }
+        size_t bottomLimit() const { return _bottomLimit; }
 
         std::vector getSample(const size_t num) const override;
 
diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h
index 5e69fef3a..e94b8f1ae 100644
--- a/tuplex/core/include/physical/ResultSet.h
+++ b/tuplex/core/include/physical/ResultSet.h
@@ -36,8 +36,6 @@ namespace tuplex {
         size_t _rowsRetrieved;
         size_t _totalRowCounter; // used for merging in rows!
         size_t _maxRows;
-        size_t _maxRowsTop;
-        size_t _maxRowsBottom;
         Schema _schema;
 
         void removeFirstPartition();
diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h
index e678ead3d..83e63208a 100644
--- a/tuplex/core/include/physical/StageBuilder.h
+++ b/tuplex/core/include/physical/StageBuilder.h
@@ -76,7 +76,7 @@ namespace tuplex {
             void addFileInput(FileInputOperator* csvop);
             void addFileOutput(FileOutputOperator* fop);
 
-            inline void setOutputLimit(size_t topLimit, size_t bottomLimit) {
+            inline void setOutputLimit(size_t topLimit, size_t bottomLimit = 0) {
                 _outputTopLimit = topLimit;
                 _outputBottomLimit = bottomLimit;
             }
@@ -158,8 +158,9 @@ namespace tuplex {
             size_t number() const { return _stageNumber; }
             int64_t outputDataSetID() const;
 
+            // default case: both _outputTopLimit and _outputBottomLimit are zero = take everything
             inline bool hasOutputLimit() const {
-                return _outputTopLimit < std::numeric_limits::max() || _outputBottomLimit > 0;
+                return _outputTopLimit != 0 || _outputBottomLimit != 0;
             }
 
             inline char csvOutputDelimiter() const {
diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h
index f489f1f6c..f4efeebeb 100644
--- a/tuplex/core/include/physical/TransformStage.h
+++ b/tuplex/core/include/physical/TransformStage.h
@@ -393,6 +393,10 @@ namespace tuplex {
          */
         void setDataAggregationMode(const AggregateType& t) { _aggMode = t; }
 
+        // default case: both _outputTopLimit and _outputBottomLimit are zero = take everything
+        bool hasOutputLimit() const {
+            return _outputTopLimit > 0 || _outputBottomLimit > 0;
+        }
     private:
 
         /*!
* creates a new TransformStage with generated code @@ -481,10 +485,6 @@ namespace tuplex { // for hash output, the key and bucket type python::Type _hashOutputKeyType; python::Type _hashOutputBucketType; - - bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max() && _outputBottomLimit != 0; - } }; } #endif //TUPLEX_TRANSFORMSTAGE_H \ No newline at end of file diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 3de903d1c..c11482f86 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -38,21 +38,17 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(-1, false, os); + return take(0, 0, os); } - std::shared_ptr DataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr DataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { // error dataset? if (isError()) throw std::runtime_error("is error dataset!"); - // negative numbers mean get all elements! - if (numTop < 0) - numTop = std::numeric_limits::max(); - // create a take node assert(_context); - LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numTop, numBottom)); + LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, topLimit, bottomLimit)); DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); dsptr->_operator = op; op->setDataSet(dsptr); @@ -70,7 +66,7 @@ namespace tuplex { } // -1 means to retrieve all elements - std::vector DataSet::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { auto rs = take(numElements, false, os); Timer timer; diff --git a/tuplex/core/src/EmptyDataset.cc b/tuplex/core/src/EmptyDataset.cc index 7504e8499..3664a591a 100644 --- a/tuplex/core/src/EmptyDataset.cc +++ b/tuplex/core/src/EmptyDataset.cc @@ -11,16 +11,16 @@ #include namespace tuplex { - std::shared_ptr EmptyDataset::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr EmptyDataset::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { return std::make_shared(); } - std::vector EmptyDataset::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector EmptyDataset::takeAsVector(size_t numElements, std::ostream &os) { return std::vector{}; } std::shared_ptr EmptyDataset::collect(std::ostream &os) { - return take(0, false, os); + return take(0, 0, os); } std::vector EmptyDataset::collectAsVector(std::ostream &os) { diff --git a/tuplex/core/src/ErrorDataSet.cc b/tuplex/core/src/ErrorDataSet.cc index 9d19594f2..c87999e5f 100644 --- a/tuplex/core/src/ErrorDataSet.cc +++ b/tuplex/core/src/ErrorDataSet.cc @@ -12,7 +12,7 @@ namespace tuplex { - std::vector ErrorDataSet::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector ErrorDataSet::takeAsVector(size_t numElements, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -23,7 +23,7 @@ namespace tuplex { return takeAsVector(0, os); } - std::shared_ptr ErrorDataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr ErrorDataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -31,7 +31,7 @@ namespace tuplex { } std::shared_ptr ErrorDataSet::collect(std::ostream &os) { - return take(0, false, os); + return take(0, 0, os); } void diff --git 
a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc
index 388199e4d..acfdd0aa6 100644
--- a/tuplex/core/src/Executor.cc
+++ b/tuplex/core/src/Executor.cc
@@ -33,10 +33,6 @@ namespace tuplex {
 
     std::vector WorkQueue::popCompletedTasks() {
         TRACE_LOCK("workQueue");
-        _taskDoneMutex.lock();
-        _taskDone.clear();
-        _taskDoneMutex.unlock();
-
        std::lock_guard lock(_completedTasksMutex);
        // move leads to circular dependency in gcc and thus a bug on travis-ci. Therefore, just
        // use the below hack to fool the compiler into actually copying the vectors
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc
index dbceaa1b9..e477b653b 100644
--- a/tuplex/core/src/ee/local/LocalBackend.cc
+++ b/tuplex/core/src/ee/local/LocalBackend.cc
@@ -551,6 +551,7 @@ namespace tuplex {
                     task->sinkExceptionsToMemory(inputSchema);
                     task->setStageID(tstage->getID());
                     task->setOutputTopLimit(tstage->outputTopLimit());
+                    task->setOutputBottomLimit(tstage->outputBottomLimit());
                     // add to tasks
                     tasks.emplace_back(std::move(task));
                 } else {
@@ -1197,7 +1198,7 @@
                 }
 
                 if (tstage->hasOutputLimit()) {
-                    trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit());
+                    trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage);
                 }
 
                 tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts);
                 break;
             }
@@ -1560,7 +1561,7 @@
 
         for (int i = 0; i < tasks.size(); i++) {
             // take limit only work with uniform order
-            assert(task.getOrder(0) == i);
+            assert(tasks[i]->getOrder(0) == i);
         }
 
         // add all tasks to queue
@@ -2165,9 +2166,9 @@
                 // clip last partition & leave loop
                 auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows());
                 assert(clipped <= partition->getNumRows());
-                Partition newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage);
-                partition->invalidate();
-                parition = newPart;
+                Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage);
+                partition->invalidate();
+                partition = newPart;
                 assert(partition->getNumRows() == clipped);
                 if (clipped > 0)
                     limitedTailPartitions.push_back(partition);
@@ -2199,12 +2200,12 @@
         }
 
         // merge the head and tail partitions
-        partitions.clear()
+        partitions.clear();
         partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end());
         partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend());
     }
 
-    Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, int numToSkip, TransformStage* tstage) {
+    Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) {
         if(!numToSkip)
             return nullptr;
 
@@ -2220,7 +2221,7 @@
         size_t numBytesToSkip = 0;
 
         for(unsigned i = 0; i < numToSkip; ++i) {
-            Rows r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip);
+            Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip);
             ptr += r.serializedLength();
             numBytesToSkip += r.serializedLength();
         }
@@ -2228,7 +2229,7 @@
         auto ptr_out = p_out->lockRaw();
         *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip;
         ptr_out += sizeof(int64_t);
-        memcpy(ptr_out, ptr, p_in->size() - numBytesToSkip);
+        memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip);
         p_out->unlock();
 
         p_in->unlock();
diff --git a/tuplex/core/src/logical/TakeOperator.cc
b/tuplex/core/src/logical/TakeOperator.cc index e588b5e97..49a4452b4 100644 --- a/tuplex/core/src/logical/TakeOperator.cc +++ b/tuplex/core/src/logical/TakeOperator.cc @@ -12,13 +12,13 @@ #include namespace tuplex { - TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom) : LogicalOperator::LogicalOperator(parent), _limitTop(numTop), _limitBottom(numBottom) { + TakeOperator::TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit) : LogicalOperator::LogicalOperator(parent), _topLimit(topLimit), _bottomLimit(bottomLimit) { // take schema from parent node setSchema(this->parent()->getOutputSchema()); } bool TakeOperator::good() const { - return _limitTop >= -1 && _limitBottom >= -1; + return _topLimit >= 0 && _bottomLimit >= 0; } std::vector TakeOperator::getSample(const size_t num) const { @@ -33,7 +33,7 @@ namespace tuplex { LogicalOperator *TakeOperator::clone() { // create clone of this operator - auto copy = new TakeOperator(parent()->clone(), _limitTop, _limitBottom); + auto copy = new TakeOperator(parent()->clone(), _topLimit, _bottomLimit); copy->setDataSet(getDataSet()); // weak ptr to old dataset... copy->copyMembers(this); diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index ff67e4add..f289064d5 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -382,7 +382,7 @@ namespace tuplex { // set limit if output node has a limit (currently only TakeOperator) if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); - builder.setOutputLimit(top->limit(), top->limitBottom()); + builder.setOutputLimit(top->topLimit(), top->bottomLimit()); } // @TODO: add slowPip builder to this process... diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index e31e78cec..0f7bf7319 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -98,7 +98,7 @@ namespace tuplex { Partition *first = _partitions.front(); assert(_schema == first->schema()); - auto numRows = first->getNumRows() - first->getNumSkip(); + auto numRows = first->getNumRows(); _rowsRetrieved += numRows; _partitions.pop_front(); diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index bc814182b..78bc8dea4 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -458,7 +458,7 @@ namespace tuplex { } case LogicalOperatorType::TAKE: { auto takeOp = dynamic_cast(node); - opt_ops.push_back(new TakeOperator(lastParent, takeOp->limit(), takeOp->limitBottom())); + opt_ops.push_back(new TakeOperator(lastParent, takeOp->topLimit(), takeOp->bottomLimit())); opt_ops.back()->setID(node->getID()); break; } diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 853b910db..66f94e33f 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -130,7 +130,7 @@ namespace tuplex { std::shared_ptr rs; std::string err_message = ""; try { - rs = _dataset->take(numRows, ss); + rs = _dataset->take(numTop, numBottom, ss); if(!rs) throw std::runtime_error("invalid result set"); // if there are more than 1 million (100k in debug mode) elements print message... 
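A minimal usage sketch of the two-sided take that these call-site changes wire up: it assumes a tuplex::Context built from test options as in TakeTest.cc, and the header path and function name are illustrative, not part of the tree.

// --- editorial sketch, not part of the patch -------------------------------
// take(top, bottom) keeps the first `top` and the last `bottom` rows in
// their original order; at this point in the series, collect() routes to
// take(0, 0), which is treated as "no limit".
#include "Context.h"  // assumed project header providing tuplex::Context, Row

void takeSemanticsExample(tuplex::Context &context) {
    using tuplex::Row;
    // yields rows 1, 2 (top) and 6 (bottom), cf. takeBothTest further down
    auto rs = context.parallelize({Row(1), Row(2), Row(3),
                                   Row(4), Row(5), Row(6)}).take(2, 1);
    // yields all six rows
    auto all = context.parallelize({Row(1), Row(2), Row(3),
                                    Row(4), Row(5), Row(6)}).collect();
}
// ----------------------------------------------------------------------------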
diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 1046505f2..850b4ed83 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -191,24 +191,25 @@ def collect(self): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' return self._dataSet.collect() - def take(self, nrows=5, nbottom=0): + def take(self, limitTop=5, limitBottom=0): """ action that generates a physical plan, processes data and collects the top results then as list of tuples. Args: - nrows (int): number of rows to collect. Per default ``5``. + limitTop (int): number of top rows to collect. Per default ``5``. + limitBottom (int): number of bottom rows to collect. Per default ``0``. Returns: (list): A list of tuples """ - assert isinstance(nrows, int), 'num rows must be an integer' - assert nrows > 0, 'please specify a number greater than zero' - assert isinstance(nbottom, int), 'num bottom last must be an integer' - assert nbottom >= 0, 'please specify a number greater or equal to zero' + assert isinstance(limitTop, int), 'num rows must be an integer' + assert limitTop > 0, 'please specify a number greater than zero' + assert isinstance(limitBottom, int), 'num bottom last must be an integer' + assert limitBottom >= 0, 'please specify a number greater or equal to zero' assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - return self._dataSet.take(nrows, nbottom) + return self._dataSet.take(limitTop, limitBottom) def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h index a5ca4058f..6c85d2d36 100644 --- a/tuplex/utils/include/mt/ITask.h +++ b/tuplex/utils/include/mt/ITask.h @@ -21,65 +21,65 @@ namespace tuplex { -/*! - * interface for defining tasks that can be run via a threadpool - */ -class ITask { -private: - std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. + /*! + * interface for defining tasks that can be run via a threadpool + */ + class ITask { + private: + std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. //! Per default object is constructed that does not represent a thread - std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages + std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages -public: - ITask() {}; - ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {} - virtual ~ITask() = default; - ITask(ITask&& other) = default; - ITask& operator = (ITask&& other) = default; + public: + ITask() {}; + ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {} + virtual ~ITask() = default; + ITask(ITask&& other) = default; + ITask& operator = (ITask&& other) = default; - /*! - * interface to run a task - */ - virtual void execute() = 0; + /*! 
+ * interface to run a task + */ + virtual void execute() = 0; - std::thread::id getID() { - return _id; - } + std::thread::id getID() { + return _id; + } - void setID(const std::thread::id& id) { - _id = id; - } + void setID(const std::thread::id& id) { + _id = id; + } - void setOrder(size_t order) { _orderNumbers = std::vector{order}; } + void setOrder(size_t order) { _orderNumbers = std::vector{order}; } - size_t getOrder(const size_t nth) const { - return _orderNumbers[nth]; - } + size_t getOrder(size_t nth) const { + return _orderNumbers[nth]; + } - std::vector getOrder() const { return _orderNumbers; } + std::vector getOrder() const { return _orderNumbers; } - void setOrder(const std::vector& order) { - _orderNumbers = order; - } + void setOrder(const std::vector& order) { + _orderNumbers = order; + } - /*! - * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones - * @param other - * @return - */ - bool compareAscOrder(const ITask& other) const { - // make sure they have the same length - assert(_orderNumbers.size() == other._orderNumbers.size()); + /*! + * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones + * @param other + * @return + */ + bool compareAscOrder(const ITask& other) const { + // make sure they have the same length + assert(_orderNumbers.size() == other._orderNumbers.size()); - // this < other? - // compare one by one - for(int i = 0; i < other._orderNumbers.size(); ++i) { - if(_orderNumbers[i] >= other._orderNumbers[i]) - return false; + // this < other? + // compare one by one + for(int i = 0; i < other._orderNumbers.size(); ++i) { + if(_orderNumbers[i] >= other._orderNumbers[i]) + return false; + } + return true; } - return true; - } -}; + }; } #endif //TUPLEX_ITASK_H \ No newline at end of file From fb90aefd563e1469f97b3bce6c75204df3aff861 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:23:41 -0400 Subject: [PATCH 12/56] Address Review Comments (2) --- tuplex/core/include/logical/TakeOperator.h | 2 +- tuplex/core/include/physical/TransformStage.h | 2 +- tuplex/core/src/ee/local/LocalBackend.cc | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index b7c4892dc..fe5f1b0f2 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -26,7 +26,7 @@ namespace tuplex { TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit); std::string name() override { - if(_topLimit < 0 || std::numeric_limits::max() == _topLimit) + if(_topLimit == 0 && _bottomLimit == 0) return "collect"; return "take"; } diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index f4efeebeb..05c7df448 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -395,7 +395,7 @@ namespace tuplex { // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything bool hasOutputLimit() const { - return _outputTopLimit > 0 || _outputBottomLimit > 0; + return _outputTopLimit != 0 || _outputBottomLimit != 0; } private: /*! 
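The ITask block above is only re-indented, so it is easy to miss what compareAscOrder actually computes. A standalone restatement as a free function (illustrative only, not code in the tree):

// --- editorial sketch, not part of the patch -------------------------------
// Mirrors ITask::compareAscOrder: every component must be strictly smaller,
// so this is NOT a lexicographic order; a pair can compare false both ways.
#include <cassert>
#include <cstddef>
#include <vector>

bool compareAscOrder(const std::vector<size_t> &a, const std::vector<size_t> &b) {
    assert(a.size() == b.size());
    for (size_t i = 0; i < b.size(); ++i)
        if (a[i] >= b[i])
            return false;  // any tie or inversion rejects the pair
    return true;
}

// compareAscOrder({0, 1}, {1, 2}) -> true
// compareAscOrder({0, 3}, {1, 2}) -> false, and so does the reversed call
// ----------------------------------------------------------------------------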
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index e477b653b..022d5a036 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -586,6 +586,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); num_parts++; @@ -623,6 +624,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); From cb4031325e7c8a3ad70c3a720cca545abf7c1e4d Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:52:36 -0400 Subject: [PATCH 13/56] Debugging Tests --- tuplex/core/src/ee/local/LocalBackend.cc | 7 ++++++- tuplex/core/src/physical/ResultSet.cc | 3 +-- tuplex/test/core/TakeTest.cc | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 022d5a036..d51ef4523 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -884,7 +884,12 @@ namespace tuplex { // special case: skip stage, i.e. empty code and mem2mem if(tstage->code().empty() && !tstage->fileInputMode() && !tstage->fileOutputMode()) { auto pyObjects = inputExceptionsToPythonObjects(tstage->inputExceptions(), tstage->normalCaseInputSchema()); - tstage->setMemoryResult(tstage->inputPartitions(), std::vector{}, std::unordered_map(), pyObjects); + + auto output_par = tstage->inputPartitions(); + if (tstage->hasOutputLimit()) { + trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + } + tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects); pyObjects.clear(); // skip stage Logger::instance().defaultLogger().info("[Transform Stage] skipped stage " + std::to_string(tstage->number()) + " because there is nothing todo here."); diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0f7bf7319..bfd656dc8 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -138,8 +138,7 @@ namespace tuplex { auto num_rows = first->getNumRows(); // how many left to retrieve? auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); - if(num_to_retrieve_from_partition <= 0) - break; + assert(num_to_retrieve_from_partition >= 0); // make sure partition schema matches stored schema assert(_schema == first->schema()); diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 08b648f34..2d8f81f2f 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -122,4 +122,6 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v3[2].getString(0), "!"); EXPECT_EQ(v3[3].getString(0), "! :)"); EXPECT_EQ(v3[4].getString(0), "!"); -} \ No newline at end of file +} + +// TODO(march): test empty code when reusing partitions. 
This might not work if user reused dataset \ No newline at end of file From 517f2fcb3730662a1fa2c5abd181539ba87053e7 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 6 Apr 2022 23:55:59 -0400 Subject: [PATCH 14/56] Change definition of take all --- tuplex/core/include/DataSet.h | 2 +- tuplex/core/include/logical/TakeOperator.h | 2 +- tuplex/core/include/physical/StageBuilder.h | 3 +-- tuplex/core/include/physical/TransformStage.h | 2 +- tuplex/core/include/physical/TransformTask.h | 2 -- tuplex/core/src/DataSet.cc | 14 +++++++------- tuplex/core/src/physical/TransformTask.cc | 3 +-- tuplex/python/include/PythonDataSet.h | 2 +- tuplex/python/src/PythonDataSet.cc | 17 ++++++++++++++--- 9 files changed, 27 insertions(+), 20 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index f6bb97f2c..86ca23b6a 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -128,7 +128,7 @@ namespace tuplex { * @param numRows how many rows to print, i.e. top numRows are printed.xs * @param os ostream where to print table to */ - virtual void show(const int64_t numRows = -1, std::ostream &os = std::cout); + virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); // named dataset management functions /*! diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index fe5f1b0f2..f3841236b 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -26,7 +26,7 @@ namespace tuplex { TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit); std::string name() override { - if(_topLimit == 0 && _bottomLimit == 0) + if(_topLimit == std::numeric_limits::max() || _bottomLimit == std::numeric_limits::max()) return "collect"; return "take"; } diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index 83e63208a..1c322b9a6 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -158,9 +158,8 @@ namespace tuplex { size_t number() const { return _stageNumber; } int64_t outputDataSetID() const; - // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything inline bool hasOutputLimit() const { - return _outputTopLimit != 0 || _outputBottomLimit != 0; + return _outputTopLimit != std::numeric_limits::max() && _outputBottomLimit != std::numeric_limits::max(); } inline char csvOutputDelimiter() const { diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index 05c7df448..e1e45c97b 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -395,7 +395,7 @@ namespace tuplex { // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything bool hasOutputLimit() const { - return _outputTopLimit != 0 || _outputBottomLimit != 0; + return _outputTopLimit != std::numeric_limits::max() && _outputBottomLimit != std::numeric_limits::max(); } private: /*! 
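These hunks flip the sentinel convention: on the C++ side "take everything" is now SIZE_MAX instead of 0, while the Python bindings keep negative numbers for it. Condensed into a hedged sketch (toInternalLimit is an illustrative helper; in the patch the conversion is inlined in PythonDataSet::take):

// --- editorial sketch, not part of the patch -------------------------------
#include <cstddef>
#include <cstdint>
#include <limits>

// Negative Python-side limits mean "unbounded" and map to SIZE_MAX.
size_t toInternalLimit(int64_t pythonLimit) {
    return pythonLimit < 0 ? std::numeric_limits<size_t>::max()
                           : static_cast<size_t>(pythonLimit);
}

// Mirrors TransformStage::hasOutputLimit() as redefined above: trimming only
// applies when *both* ends are bounded; an unbounded end behaves like collect.
bool hasOutputLimit(size_t top, size_t bottom) {
    return top != std::numeric_limits<size_t>::max()
        && bottom != std::numeric_limits<size_t>::max();
}
// ----------------------------------------------------------------------------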
diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index d065e86d3..e2b8bc5b6 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -184,7 +184,6 @@ namespace tuplex { void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } - void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; } void execute() override; bool hasFileSink() const { return _outputFilePath != URI::INVALID; } @@ -281,7 +280,6 @@ namespace tuplex { size_t _outTopLimit; // limits how many rows to write at max size_t _outBottomLimit; // limits how many last rows to write at max - size_t _outSkipRows; // how many rows at start to skip // memory source variables std::vector _inputPartitions; diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index c11482f86..d54edb567 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -38,7 +38,7 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(0, 0, os); + return take(std::numeric_limits::max(), 0, os); } std::shared_ptr DataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { @@ -62,18 +62,14 @@ namespace tuplex { // collect functions std::vector DataSet::collectAsVector(std::ostream &os) { - return takeAsVector(-1, os); + return takeAsVector(std::numeric_limits::max(), os); } - // -1 means to retrieve all elements std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { auto rs = take(numElements, false, os); Timer timer; #warning "limiting should make this hack irrelevant..." - if (numElements < 0) - numElements = std::numeric_limits::max(); - // std::vector v; // while (rs->hasNextRow() && v.size() < numElements) { // v.push_back(rs->getNextRow()); @@ -730,10 +726,14 @@ namespace tuplex { } - void DataSet::show(const int64_t numRows, std::ostream &os) { + void DataSet::show(int64_t numRows, std::ostream &os) { assert(_context); // get rows + if (numRows < 0) { + numRows = std::numeric_limits::max(); + } + auto rows = takeAsVector(numRows, os); if (rows.empty()) { return; diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 49d104bcc..377385deb 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -509,7 +509,6 @@ namespace tuplex { _outPrefix.reset(); _outTopLimit = std::numeric_limits::max(); // write all rows _outBottomLimit = 0; - _outSkipRows = 0; // skip no rows // reset memory sink _output.reset(); @@ -705,7 +704,7 @@ namespace tuplex { // skip rows? limit rows?? 
- if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outTopLimit - _outSkipRows)) { + if(_numOutputRowsWritten < _outTopLimit) { if(_outFile->write(buf, bufSize) != VirtualFileSystemStatus::VFS_OK) return ecToI32(ExceptionCode::IOERROR); } diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 23b09314d..ede482d9c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -77,7 +77,7 @@ namespace tuplex { PythonDataSet resolve(const int64_t exceptionCode, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); py::object collect(); - py::object take(const int64_t numTop, const int64_t numBottom); + py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); // DataFrame like operations diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 66f94e33f..f6079a143 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -107,7 +107,7 @@ namespace tuplex { } } - py::object PythonDataSet::take(const int64_t numTop, const int64_t numBottom) { + py::object PythonDataSet::take(const int64_t topLimit, const int64_t bottomLimit) { // make sure a dataset is wrapped assert(this->_dataset); @@ -129,8 +129,19 @@ namespace tuplex { std::shared_ptr rs; std::string err_message = ""; + + size_t castedTopLimit = 0; + if (topLimit < 0) { + castedTopLimit = std::numeric_limits::max(); + } + + size_t castedBottomLimit = 0; + if (bottomLimit < 0) { + castedBottomLimit = std::numeric_limits::max(); + } + try { - rs = _dataset->take(numTop, numBottom, ss); + rs = _dataset->take(castedTopLimit, castedBottomLimit, ss); if(!rs) throw std::runtime_error("invalid result set"); // if there are more than 1 million (100k in debug mode) elements print message... 
@@ -162,7 +173,7 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; // build python list object from resultset - auto listObj = resultSetToCPython(rs.get(), numTop); + auto listObj = resultSetToCPython(rs.get(), castedTopLimit); Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); // Logger::instance().flushAll(); From c33fc23bd1b4a2790b2e49b1985ec37f2d1ce8f6 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 6 Apr 2022 23:56:59 -0400 Subject: [PATCH 15/56] Random take test with some debugging --- tuplex/core/src/ee/local/LocalBackend.cc | 35 ++++----- tuplex/core/src/physical/TransformStage.cc | 6 -- tuplex/test/core/TakeTest.cc | 87 +++++++++++++++++++++- 3 files changed, 101 insertions(+), 27 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index d51ef4523..0b8157ecc 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -2143,7 +2143,7 @@ namespace tuplex { clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); assert(clippedTop <= partition->getNumRows()); break; - } else if (partition == *partitions.end()) { + } else if (partition == partitions.back()) { // last partition, mark full row, but don't put to output set yet to avoid double put clippedTop = partition->getNumRows(); break; @@ -2171,14 +2171,14 @@ namespace tuplex { break; } else if (numBottomOutputRows >= bottomLimit) { // clip last partition & leave loop - auto clipped = bottomLimit - (numTopOutputRows - partition->getNumRows()); + auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); - Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + if (clipped > 0) { + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + assert(newPart->getNumRows() == clipped); + limitedTailPartitions.push_back(newPart); + } partition->invalidate(); - partition = newPart; - assert(partition->getNumRows() == clipped); - if (clipped > 0) - limitedTailPartitions.push_back(partition); break; } else { // put full partition to output set @@ -2197,9 +2197,12 @@ namespace tuplex { lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage); } - lastTopPart->setNumRows(clippedTop); - - limitedPartitions.push_back(lastTopPart); + if (clippedTop != 0) { + lastTopPart->setNumRows(clippedTop); + limitedPartitions.push_back(lastTopPart); + } else { + lastTopPart->invalidate(); + } if (lastBottomPart != nullptr) { limitedPartitions.push_back(lastBottomPart); @@ -2213,17 +2216,10 @@ namespace tuplex { } Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { - if(!numToSkip) - return nullptr; - auto ptr = p_in->lockRaw(); auto num_rows = *((int64_t*) ptr); assert(numToSkip < num_rows); - Partition *p_out = _driver->allocWritablePartition(num_rows - numToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); - ptr += sizeof(int64_t); size_t numBytesToSkip = 0; @@ -2233,6 +2229,11 @@ namespace tuplex { numBytesToSkip += r.serializedLength(); } + Partition *p_out = _driver->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); 
+ assert(p_out->capacity() >= p_in->size() - numBytesToSkip); + auto ptr_out = p_out->lockRaw(); *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 6e216ac5c..060365697 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -133,15 +133,9 @@ namespace tuplex { auto schema = Schema::UNKNOWN; if(!partitions.empty()) { - size_t totalRowsCount = 0; schema = partitions.front()->schema(); for (auto partition : partitions) { assert(schema == partition->schema()); - totalRowsCount += partition->getNumRows(); - } - - if (hasOutputLimit()) { - assert(totalRowsCount == _outputTopLimit + _outputBottomLimit); } } diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 2d8f81f2f..3990fcd07 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -8,13 +8,51 @@ // License: Apache 2.0 // //--------------------------------------------------------------------------------------------------------------------// +#include + #include #include "TestUtils.h" +using namespace tuplex; +using namespace std; + class TakeTest : public PyTest {}; +/** + * Randomly generate a vector of rows for testing + * @param N the size of vector + * @return a vector of size N, containing the random data + */ +vector generateTestData(size_t N, uint64_t seed) { + mt19937 gen(seed); //Standard mersenne_twister_engine seeded with rd() + uniform_int_distribution<> distrib(1, 100000000); + + vector data; + data.reserve(N); + + for (int i = 0; i < N; i++) { + data.emplace_back(distrib(gen), distrib(gen), distrib(gen)); + } + + return data; +} + +vector generateReferenceData(const vector& input, size_t topLimit, size_t bottomLimit) { + vector output; + for(size_t i = 0; i < topLimit && i < input.size(); i++) { + output.push_back(input[i]); + } + size_t start_bottom = input.size() >= bottomLimit ? input.size() - bottomLimit : 0; + start_bottom = max(topLimit, start_bottom); + + for(size_t i = start_bottom; i < input.size(); i++) { + output.push_back(input[i]); + } + + return output; +} + TEST_F(TakeTest, takeTopTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -51,7 +89,6 @@ TEST_F(TakeTest, takeTopTest) { } TEST_F(TakeTest, takeBottomTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -88,7 +125,6 @@ TEST_F(TakeTest, takeBottomTest) { } TEST_F(TakeTest, takeBothTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -124,4 +160,47 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v3[4].getString(0), "!"); } -// TODO(march): test empty code when reusing partitions. 
This might not work if user reused dataset \ No newline at end of file +TEST_F(TakeTest, takeBigTest) { + mt19937 data_seed_gen(4242); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + for(auto& part_size : partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for(auto data_size : test_size) { + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto data = generateTestData(data_size, data_seed_gen()); + auto ref_data = generateReferenceData(data, top_limit, bottom_limit); + + auto res = context.parallelize(data).take(top_limit, bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} + +// TODO(march): with map, filter function +//TEST_F(TakeTest, takeMapFilterTest) { +// srand(4242); +//} + +// TODO(march): with file input +// context.csv("../resources/"); + +// TODO(march): collect operator \ No newline at end of file From 38d9ca971363eb028a02f86bc0920110aa6172fb Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 7 Apr 2022 23:28:54 -0400 Subject: [PATCH 16/56] Polish the python interface --- tuplex/core/include/DataSet.h | 8 ++ tuplex/core/src/DataSet.cc | 109 +++++++++++++++++++++++++ tuplex/python/include/PythonDataSet.h | 1 + tuplex/python/src/PythonBindings.cc | 1 + tuplex/python/src/PythonDataSet.cc | 49 +++++++++++ tuplex/python/tuplex/dataset.py | 112 +++++++------------------- 6 files changed, 196 insertions(+), 84 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 86ca23b6a..9510427e2 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -130,6 +130,14 @@ namespace tuplex { */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); + /*! + * Displays a formatted HTML table of a small portion of the data + * @param topLimit how many top rows to print + * @param bottomLimit how many bottom rows to print + * @param os ostream where to print table to + */ + virtual void showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); + // named dataset management functions /*! * map Column using a UDF diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index d54edb567..210b3ec60 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -756,6 +756,115 @@ namespace tuplex { printTable(os, headers, rows); } + void printHTMLRow(std::ostream &os, size_t ind, const Row& r) { + os << " \n"; + os << fmt::format(" {}\n", ind); + for (auto& s : r.getAsStrings()) { + os << fmt::format(" {}\n", s); + } + os << " \n"; + } + + void DataSet::showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os) { + std::string HTML_TEMPLATE = + "
\n" + "\n" + "\n" + " \n" + " \n" + "{}" + " \n" + " \n" + " \n" + "{}" + " \n" + "
\n" + "

{} columns

\n" + "
"; + + assert(_context); + + auto rows = take(topLimit, bottomLimit); + + if (rows->rowCount() == 0) { + os << fmt::format(HTML_TEMPLATE, "\n", "\n", 0); + return; + } + + std::stringstream headers_stream, body_stream; + size_t numColumns = 0; + assert(rows->rowCount() <= topLimit + bottomLimit); + + // construct tables + if (rows->rowCount() < topLimit + bottomLimit) { + // the data is small so we get everything (no need to render ...) + for (size_t i = 0; rows->hasNextRow(); i++) { + Row r = rows->getNextRow(); + if (i == 0) { + // we set num columns based on the first row + numColumns = r.getNumColumns(); + } + + printHTMLRow(body_stream, i, r); + } + } else { + // some data is not processed because of limiting + size_t i; + for (i = 0; rows->hasNextRow() && i < topLimit; i++) { + Row r = rows->getNextRow(); + if (i == 0) { + // we set num columns based on the first row + numColumns = r.getNumColumns(); + } + + printHTMLRow(body_stream, i, r); + } + + // add the ... + body_stream << " \n"; + body_stream << " ...\n"; + for(int j = 0; j < numColumns; j++) { + body_stream << " ...\n"; + body_stream << " \n"; + } + + while (rows->hasNextRow()) { + Row r = rows->getNextRow(); + printHTMLRow(body_stream, i, r); + } + } + + assert(numColumns != 0); + + // construct headers + std::vector headers(numColumns); + if (!_columnNames.empty()) { + assert(numColumns == _columnNames.size()); + for (auto &c_name: _columnNames) { + headers_stream << fmt::format(" {}\n", c_name); + } + } else { + // default to generic name if column name doesn't exist + for (int i = 0; i < numColumns; ++i) { + headers_stream << fmt::format(" Column {}\n", i); + } + } + + os << fmt::format(HTML_TEMPLATE, headers_stream.str(), body_stream.str(), numColumns); + } + Schema DataSet::schema() const { if(!_operator) return Schema::UNKNOWN; diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index ede482d9c..4761ac7f0 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -79,6 +79,7 @@ namespace tuplex { py::object collect(); py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); + std::string showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit); // DataFrame like operations PythonDataSet mapColumn(const std::string& column, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 6b3683853..ab239a1a2 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -41,6 +41,7 @@ PYMODULE { py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) + .def("showHTMLPreview", &tuplex::PythonDataSet::showHTMLPreview) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) .def("map", &tuplex::PythonDataSet::map) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index f6079a143..1f543e5d2 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -880,6 +880,55 @@ namespace tuplex { } } + std::string PythonDataSet::showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit) { + // make sure a dataset is wrapped + assert(this->_dataset); + + // is callee error dataset? 
if so return list with error string + if (this->_dataset->isError()) { + auto errset = dynamic_cast(this->_dataset); + assert(errset); + return "Error: " + errset->getError(); + } else { + // release GIL & hand over everything to Tuplex + assert(PyGILState_Check()); // make sure this thread holds the GIL! + python::unlockGIL(); + + std::stringstream ss; + std::string err_message; + + size_t castedTopLimit = 0; + if (topLimit < 0) { + castedTopLimit = std::numeric_limits::max(); + } + + size_t castedBottomLimit = 0; + if (bottomLimit < 0) { + castedBottomLimit = std::numeric_limits::max(); + } + + try { + this->_dataset->showHTMLPreview(castedTopLimit, castedBottomLimit, ss); + } catch (const std::exception &e) { + err_message = e.what(); + Logger::instance().defaultLogger().error(err_message); + } catch (...) { + err_message = "unknown C++ exception occurred, please change type."; + Logger::instance().defaultLogger().error(err_message); + } + + // reacquire GIL + python::lockGIL(); + Logger::instance().flushToPython(); + + if (!ss.str().empty() && err_message.empty()) { + return ss.str(); + } else { + return "Error occurred: " + err_message; + } + } + } + PyObject* PythonDataSet::anyToCPythonWithPyObjects(ResultSet* rs, size_t maxRowCount) { assert(rs); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 850b4ed83..4d02cf4d4 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -28,87 +28,8 @@ class DataSet: def __init__(self): self._dataSet = None - def getColumnSize(self): - data = self.collect() - if len(data) == 0: - return 0, 0 - else: - return len(data[0]) - - def revTake(self, nRows = 5): - return self.collect()[-nRows:] - def _repr_html_(self): - rows_list = self.take() - total_col_cnt = self.getColumnSize() - print('rowlist') - print(rows_list) - if len(rows_list) == 0: - header = '\n' - rows = '\n' - else: - header = '\n' - - if self.columns != None: - for x in self.columns: - header += f' {x}\n' - else: - for i in range(len(rows_list[0])): - header += f' column {i + 1}\n' - - rows = '' - for i, r in enumerate(rows_list): - rows += ' \n' - rows += f' {i}\n' - for data in r: - rows += f' {data}\n' - rows += ' \n' - - # add the ... - rows += ' \n' - rows += ' ...\n' - for i in range(total_col_cnt): - rows += ' ...\n' - rows += ' \n' - - lastData = self.revTake() - for i, r in enumerate(lastData): - rows += ' \n' - rows += f' {0 - len(lastData) + i}\n' - for data in r: - rows += f' {data}\n' - rows += ' \n' - - html_template = ( - '
<div>\n' - '<table border="1" class="dataframe">\n' - ' <thead>\n' - ' <tr style="text-align: right;">\n' - f'{header}' - ' </tr>\n' - ' </thead>\n' - ' <tbody>\n' - f'{rows}' - ' </tbody>\n' - '</table>\n' - f'<footer><div>{total_col_cnt} columns</div></footer>\n' - '</div>
' - ) - - return html_template + return self._dataSet.showHTMLPreview() def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. @@ -201,11 +122,14 @@ def take(self, limitTop=5, limitBottom=0): (list): A list of tuples """ + assert limitTop is None or isinstance(limitTop, int), 'num rows must be an integer or None' + assert limitBottom is None or isinstance(limitBottom, int), 'num bottom last must be an integer or None' - assert isinstance(limitTop, int), 'num rows must be an integer' - assert limitTop > 0, 'please specify a number greater than zero' - assert isinstance(limitBottom, int), 'num bottom last must be an integer' - assert limitBottom >= 0, 'please specify a number greater or equal to zero' + if limitTop is None or limitTop < 0: + limitTop = -1 + + if limitBottom is None or limitBottom < 0: + limitBottom = -1 assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' @@ -227,6 +151,26 @@ def show(self, nrows=None): self._dataSet.show(nrows) + def showHTMLPreview(self, topLimit=5, bottomLimit=5): + """ action that generates a physical plan, processes data and return a subset of results as nicely formatted + HTML table to stdout. + + Args: + topLimit (int): number of top rows to collect. If ``None`` all rows will be collected + bottomLimit (int): number of bottom rows to collect. If ``None`` all rows will be collected + + Returns: + string: an HTML table showing a preview of the data + """ + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + if topLimit is None or topLimit < 0: + topLimit = -1 + if bottomLimit is None or bottomLimit < 0: + bottomLimit = -1 + + return self._dataSet.showHTMLPreview(topLimit, bottomLimit) + def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. From 1f5ff5934a6faa5c3ac963fee4f8c22c3d16acc2 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 8 Apr 2022 01:24:03 -0400 Subject: [PATCH 17/56] Address PR comments --- tuplex/core/include/ee/local/LocalBackend.h | 24 +++++- tuplex/core/include/physical/TransformStage.h | 6 +- tuplex/core/include/physical/TransformTask.h | 20 ++++- tuplex/core/src/ee/local/LocalBackend.cc | 69 ++++++++++------ tuplex/core/src/physical/TransformTask.cc | 81 +++++++++++-------- tuplex/python/tuplex/dataset.py | 23 +++++- tuplex/test/core/TakeTest.cc | 4 +- 7 files changed, 156 insertions(+), 71 deletions(-) diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index d7a5ec25b..712f0ae43 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -88,9 +88,6 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } - void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); - Partition* newPartitionWithSkipRows(Partition* p_in, size_t numToSkip, TransformStage* tstage); - // write output (may be already in correct format!) void writeOutput(TransformStage* tstage, std::vector& sortedTasks); @@ -187,6 +184,27 @@ namespace tuplex { * @return */ extern URI outputURI(const UDF& udf, const URI& baseURI, int64_t partNo, FileFormat fmt); + + /*! 
+ * Trim list of partitions so that it includes up to the first n rows and the last m rows + * @param partitions [in,out] the list of partitions to trim + * @param topLimit n, the number of top rows to include + * @param bottomLimit m, the number of bottom rows to include + * @param tstage pointer to transform stage, might be used to generate new partition + * @param exec pointer to executor, might be used to allocate new partition + */ + extern void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, + TransformStage *tstage, Executor *exec); + + /*! + * Create a new partition with the same data as the specified partition, but with the first n rows removed + * @param p_in the input partition + * @param numToSkip number of rows to remove from the new partition + * @param tstage pointer to transform stage, used to generate new partition + * @param exec pointer to executor, used to allocate new partition + * @return the new partition + */ + extern Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec); } #endif //TUPLEX_LOCALBACKEND_H \ No newline at end of file diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index e1e45c97b..ff4ece1dd 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -107,9 +107,9 @@ namespace tuplex { std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } /*! - * sets maximum number of top rows this pipeline will produce - * @param topLimit - * @param bottomLimit + * sets maximum number of rows this pipeline will produce + * @param topLimit number of top rows to produce, 0 means none, and size_t::max means everything + * @param bottomLimit number of bottom rows to produce, 0 means none, and size_t::max means everything */ inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { _outputTopLimit = topLimit; diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index e2b8bc5b6..d966b69ee 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -182,8 +182,19 @@ namespace tuplex { void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! 
- void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } - void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } + void setOutputTopLimit(size_t limit) { + _outTopLimit = limit; + } + + void setOutputBottomLimit(size_t limit) { + _outBottomLimit = limit; + } + + // maxOrder of infinity means disregarding the bottomLimit short circuit + static void setMaxOrderAndResetLimits(size_t maxOrder = std::numeric_limits::max()) { + resetLimits(maxOrder); + } + void execute() override; bool hasFileSink() const { return _outputFilePath != URI::INVALID; } @@ -207,7 +218,7 @@ namespace tuplex { static codegen::i64_hash_row_f writeInt64HashTableAggregateCallback(); static codegen::write_row_f aggCombineCallback(); - static void resetOutputLimitCounter(); + static void resetLimits(size_t maxOrder); // most be public because of C++ issues -.- int64_t writeRowToMemory(uint8_t* buf, int64_t bufSize); @@ -314,6 +325,9 @@ namespace tuplex { _exceptions.unlock(); } + bool limitReached() const; + void updateLimits(); + void processMemorySourceWithExp(); void processMemorySource(); void processFileSource(); diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 0b8157ecc..9530e9d04 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -651,7 +651,6 @@ namespace tuplex { // --> issue for each memory partition a transform task and put it into local workqueue assert(tstage->inputMode() == EndPointMode::MEMORY); - // restrict after input limit size_t numInputRows = 0; auto inputPartitions = tstage->inputPartitions(); @@ -697,14 +696,24 @@ namespace tuplex { } } + // TODO(march): we can avoid setting order here by pre init g_rowsDone + // assign the order for all tasks for(size_t i = 0; i < tasks.size(); ++i) { tasks[i]->setOrder(i); } + TransformTask::setMaxOrderAndResetLimits(tasks.size() - 1); + if (tstage->hasOutputLimit()) { + // There are 3 possible cases here: + // 1. both top and bottom limit + // 2. only top limit + // 3. only bottom limit if (tstage->outputTopLimit() > 0 && tstage->outputBottomLimit() > 0) { - // do task striping for output limit on both ends + // case 1: do task striping for output limit on both ends + // We are executing in the striping order instead of ascending or descending order + // This is an optimization in the case where we have small limits to avoid executing all partitions vector newTasks; for(size_t i = 0; i < tasks.size() - i; i++) { const size_t rev_i = tasks.size() - 1 - i; @@ -716,10 +725,13 @@ namespace tuplex { assert(tasks.size() == newTasks.size()); tasks.swap(newTasks); } else if (tstage->outputBottomLimit() > 0) { - // bottom limit only, just reverse the task order + // case 3: bottom limit only, just reverse the task order + // We are executing the last partitions first, since we don't need the top rows. 
+ // Thus speeding up the execution time std::reverse(tasks.begin(), tasks.end()); } - // if top limit only, do nothing since the order is already good + // case 3: if top limit only, do nothing since the order is already good + // (the tasks is generated in ascending order) } return tasks; @@ -887,7 +899,7 @@ namespace tuplex { auto output_par = tstage->inputPartitions(); if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, _driver); } tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects); pyObjects.clear(); @@ -972,6 +984,13 @@ namespace tuplex { auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); + if (tstage->hasOutputLimit()) { + for (int i = 0; i < tasks.size(); i++) { + // take limit only work with uniform order + assert(tasks[i]->getOrder(0) == i); + } + } + auto completedTasks = performTasks(tasks); // Note: this doesn't work yet because of the globals. @@ -1205,7 +1224,8 @@ namespace tuplex { } if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + // the function expect the output to be sorted in ascending order (guaranteed by sortTasks()) + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, _driver); } tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); @@ -1565,12 +1585,6 @@ namespace tuplex { logger().debug("task without order found, please fix in code."); } #endif - - for (int i = 0; i < tasks.size(); i++) { - // take limit only work with uniform order - assert(tasks[i]->getOrder(0) == i); - } - // add all tasks to queue for(auto& task : tasks) wq.addTask(task); @@ -2125,17 +2139,18 @@ namespace tuplex { tstage->setFileResult(ecounts); } - void LocalBackend::trimPartitionsToLimit(std::vector &partitions, + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage* tstage) { + TransformStage* tstage, + Executor *exec) { std::vector limitedPartitions, limitedTailPartitions; // check top output limit, adjust partitions if necessary size_t numTopOutputRows = 0; - Partition* lastTopPart = nullptr; + Partition *lastTopPart = nullptr; size_t clippedTop = 0; - for (auto partition : partitions) { + for (auto partition: partitions) { numTopOutputRows += partition->getNumRows(); lastTopPart = partition; if (numTopOutputRows >= topLimit) { @@ -2174,7 +2189,8 @@ namespace tuplex { auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); if (clipped > 0) { - Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, + exec); assert(newPart->getNumRows() == clipped); limitedTailPartitions.push_back(newPart); } @@ -2191,10 +2207,11 @@ namespace tuplex { assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); // split into two partitions with both top and bottom are in the same partition - Partition* lastBottomPart = nullptr; + Partition *lastBottomPart = nullptr; if (clippedBottom != 0) { - lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage); + lastBottomPart = 
newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, + tstage, exec); } if (clippedTop != 0) { @@ -2215,27 +2232,27 @@ namespace tuplex { partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); } - Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { + Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { auto ptr = p_in->lockRaw(); - auto num_rows = *((int64_t*) ptr); + auto num_rows = *((int64_t *) ptr); assert(numToSkip < num_rows); ptr += sizeof(int64_t); size_t numBytesToSkip = 0; - for(unsigned i = 0; i < numToSkip; ++i) { + for (unsigned i = 0; i < numToSkip; ++i) { Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); ptr += r.serializedLength(); numBytesToSkip += r.serializedLength(); } - Partition *p_out = _driver->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); + Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); assert(p_out->capacity() >= p_in->size() - numBytesToSkip); auto ptr_out = p_out->lockRaw(); - *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; + *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); p_out->unlock(); diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 377385deb..8ebe18a7b 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -18,18 +18,14 @@ #include namespace tuplex { - // atomic var to count output rows! - static std::atomic_int64_t g_totalTopOutputRows; - static std::atomic_int64_t g_totalBottomOutputRows; - // mapping from order number -> row count if the task is finished static std::mutex g_rowsDoneMutex; - static std::map g_rowsDone; + static std::unordered_map g_rowsDone; + static std::atomic_size_t g_maxOrder; - void TransformTask::resetOutputLimitCounter() { - g_totalTopOutputRows = 0; - g_totalBottomOutputRows = 0; + void TransformTask::resetLimits(size_t maxOrder) { g_rowsDone.clear(); + g_maxOrder = maxOrder; } } @@ -602,25 +598,16 @@ namespace tuplex { #endif } - void TransformTask::processMemorySource() { - assert(!_inputPartitions.empty()); - assert(_functor); - - _numInputRowsRead = 0; - _numOutputRowsWritten = 0; - - int64_t num_normal_rows = 0, num_bad_rows = 0; - - auto functor = reinterpret_cast(_functor); - - // go over all input partitions. 
- for(const auto &inputPartition : _inputPartitions) { - size_t numTopCompleted = 0; - size_t numBottomCompleted = 0; - bool isTopLimitReached = false; - bool isBottomLimitReached = false; + bool TransformTask::limitReached() const { + size_t numTopCompleted = 0; + size_t numBottomCompleted = 0; + bool isTopLimitReached = false; + bool isBottomLimitReached = false; - tuplex::g_rowsDoneMutex.lock(); + tuplex::g_rowsDoneMutex.lock(); + if (_outTopLimit == 0) { + isTopLimitReached = true; + } else { for (size_t i = 0; tuplex::g_rowsDone.count(i) != 0; i++) { numTopCompleted += tuplex::g_rowsDone[i]; if (numTopCompleted >= _outTopLimit) { @@ -628,17 +615,45 @@ namespace tuplex { break; } } - // TODO: what is the max task number here - for (size_t i = 100; tuplex::g_rowsDone.count(i) != 0; i--) { + } + + // TODO: what is the max task number here + if (_outBottomLimit == 0) { + isBottomLimitReached = true; + } else { + for (size_t i = tuplex::g_maxOrder; tuplex::g_rowsDone.count(i) != 0; i--) { numBottomCompleted += tuplex::g_rowsDone[i]; - if (numBottomCompleted >= _outTopLimit) { + if (numBottomCompleted >= _outBottomLimit) { isBottomLimitReached = true; break; } } - tuplex::g_rowsDoneMutex.unlock(); + } + tuplex::g_rowsDoneMutex.unlock(); + + return isTopLimitReached && isBottomLimitReached; + } + + void TransformTask::updateLimits() { + tuplex::g_rowsDoneMutex.lock(); + tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows(); + tuplex::g_rowsDoneMutex.unlock(); + } + + void TransformTask::processMemorySource() { + assert(!_inputPartitions.empty()); + assert(_functor); - if (isTopLimitReached && isBottomLimitReached) { + _numInputRowsRead = 0; + _numOutputRowsWritten = 0; + + int64_t num_normal_rows = 0, num_bad_rows = 0; + + auto functor = reinterpret_cast(_functor); + + // go over all input partitions. + for(const auto &inputPartition : _inputPartitions) { + if (limitReached()) { // skip the execution, enough is done break; } @@ -665,9 +680,7 @@ namespace tuplex { if(_invalidateSourceAfterUse) inputPartition->invalidate(); - tuplex::g_rowsDoneMutex.lock(); - tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows(); - tuplex::g_rowsDoneMutex.unlock(); + updateLimits(); } #ifndef NDEBUG diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 4d02cf4d4..376134934 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -113,7 +113,8 @@ def collect(self): return self._dataSet.collect() def take(self, limitTop=5, limitBottom=0): - """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + """ action that generates a physical plan, processes data and collects the top and bottom results + then as list of tuples. Args: limitTop (int): number of top rows to collect. Per default ``5``. @@ -135,6 +136,26 @@ def take(self, limitTop=5, limitBottom=0): return self._dataSet.take(limitTop, limitBottom) + def head(self, nrows): + """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + + Args: + nrows (int): number of rows to collect. + Returns: + (list): A list of tuples + """ + return self.take(nrows, 0) + + def tail(self, nrows): + """ action that generates a physical plan, processes data and collects the bottom results then as list of tuples. + + Args: + nrows (int): number of rows to collect. 
+ Returns: + (list): A list of tuples + """ + return self.take(0, nrows) + def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted ASCII table to stdout. diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 3990fcd07..8c4ed5fe5 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -203,4 +203,6 @@ TEST_F(TakeTest, takeBigTest) { // TODO(march): with file input // context.csv("../resources/"); -// TODO(march): collect operator \ No newline at end of file +// TODO(march): collect operator + +// TODO(march): write test for trimPartitionsToLimit \ No newline at end of file From ac4c600cc43628d89ea08a1558f8fcee93bc89c1 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 8 Apr 2022 02:01:12 -0400 Subject: [PATCH 18/56] Add two more testcases --- tuplex/core/src/ee/local/LocalBackend.cc | 8 -- tuplex/test/core/TakeTest.cc | 119 +++++++++++++++++++---- 2 files changed, 99 insertions(+), 28 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 9530e9d04..5616488e7 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -983,14 +983,6 @@ namespace tuplex { } auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); - - if (tstage->hasOutputLimit()) { - for (int i = 0; i < tasks.size(); i++) { - // take limit only work with uniform order - assert(tasks[i]->getOrder(0) == i); - } - } - auto completedTasks = performTasks(tasks); // Note: this doesn't work yet because of the globals. diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 8c4ed5fe5..98edecb41 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -16,7 +16,8 @@ using namespace tuplex; using namespace std; -class TakeTest : public PyTest {}; +class TakeTest : public PyTest { +}; /** * Randomly generate a vector of rows for testing @@ -37,15 +38,15 @@ vector generateTestData(size_t N, uint64_t seed) { return data; } -vector generateReferenceData(const vector& input, size_t topLimit, size_t bottomLimit) { +vector generateReferenceData(const vector &input, size_t topLimit, size_t bottomLimit) { vector output; - for(size_t i = 0; i < topLimit && i < input.size(); i++) { + for (size_t i = 0; i < topLimit && i < input.size(); i++) { output.push_back(input[i]); } size_t start_bottom = input.size() >= bottomLimit ? input.size() - bottomLimit : 0; start_bottom = max(topLimit, start_bottom); - for(size_t i = start_bottom; i < input.size(); i++) { + for (size_t i = start_bottom; i < input.size(); i++) { output.push_back(input[i]); } @@ -57,7 +58,7 @@ TEST_F(TakeTest, takeTopTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); ASSERT_EQ(rs->rowCount(), 1); auto v = rs->getRows(1); @@ -65,7 +66,7 @@ TEST_F(TakeTest, takeTopTest) { EXPECT_EQ(v[0].getInt(0), 1); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -75,7 +76,8 @@ TEST_F(TakeTest, takeTopTest) { EXPECT_EQ(v2[2].getInt(0), 3); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! 
:)"), Row("!")}).take(5, 0); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), + Row("!")}).take(5, 0); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -93,7 +95,7 @@ TEST_F(TakeTest, takeBottomTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); ASSERT_EQ(rs->rowCount(), 1); auto v = rs->getRows(1); @@ -101,7 +103,7 @@ TEST_F(TakeTest, takeBottomTest) { EXPECT_EQ(v[0].getInt(0), 6); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -111,7 +113,8 @@ TEST_F(TakeTest, takeBottomTest) { EXPECT_EQ(v2[2].getInt(0), 6); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(0, 5); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), + Row("!")}).take(0, 5); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -129,7 +132,7 @@ TEST_F(TakeTest, takeBothTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); ASSERT_EQ(rs->rowCount(), 2); auto v = rs->getRows(2); @@ -138,7 +141,7 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v[1].getInt(0), 6); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -148,7 +151,8 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v2[2].getInt(0), 6); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(2, 3); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! 
:)"), + Row("!")}).take(2, 3); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -167,12 +171,12 @@ TEST_F(TakeTest, takeBigTest) { const std::vector limit_values{0, 1, 5, 11, 600, 10000}; const std::vector partition_sizes{"256B", "512KB", "1MB"}; - for(auto& part_size : partition_sizes) { + for (auto &part_size: partition_sizes) { auto opt = testOptions(); opt.set("tuplex.partitionSize", part_size); Context context(opt); - for(auto data_size : test_size) { + for (auto data_size: test_size) { for (auto top_limit: limit_values) { for (auto bottom_limit: limit_values) { std::cout << "testing with partition size:" << part_size << " data size:" @@ -195,14 +199,89 @@ TEST_F(TakeTest, takeBigTest) { } } -// TODO(march): with map, filter function -//TEST_F(TakeTest, takeMapFilterTest) { -// srand(4242); -//} +vector generateMapFilterReferenceData(const vector &input, size_t topLimit, size_t bottomLimit) { + if (input.empty()) { + return {}; + } + + assert(input[0].getNumColumns() == 3); + vector intermedate; + for (const Row &r: input) { + int64_t new_a = r.getInt(0) + r.getInt(1); + + if (new_a % 2 == 0) { + intermedate.emplace_back(new_a, r.getInt(2)); + } + } + + return generateReferenceData(intermedate, topLimit, bottomLimit); +} + +TEST_F(TakeTest, takeMapFilterTest) { + mt19937 data_seed_gen(56120); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + UDF map_udf("lambda a, b, c: ((a + b), c)"); + UDF filter_udf("lambda a, b: a % 2 == 0"); + + for (auto &part_size: partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (auto data_size: test_size) { + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto data = generateTestData(data_size, data_seed_gen()); + auto ref_data = generateMapFilterReferenceData(data, top_limit, bottom_limit); + + auto ds = context.parallelize(data).map(map_udf).filter(filter_udf); + auto res = ds.take(top_limit, bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} // TODO(march): with file input // context.csv("../resources/"); -// TODO(march): collect operator +TEST_F(TakeTest, collectIdentityTest) { + mt19937 data_seed_gen(123454); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + for (auto &part_size: partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (auto data_size: test_size) { + auto data = generateTestData(data_size, data_seed_gen()); + auto res = context.parallelize(data).collect(); + ASSERT_EQ(data.size(), res->rowCount()); + for (Row &r: data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } +} // TODO(march): write test for trimPartitionsToLimit \ No newline at end of file From 56131a7843274a35b37d13ed0a70d0be98d2a155 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 15 Apr 2022 01:06:39 -0400 Subject: [PATCH 19/56] Address PR feedbacks --- tuplex/core/include/DataSet.h | 35 ++++++- 
tuplex/core/include/EmptyDataset.h | 14 +-- tuplex/core/include/LocalEngine.h | 59 +++++++---- tuplex/core/include/ee/IBackend.h | 15 +-- tuplex/core/include/ee/local/LocalBackend.h | 14 +-- tuplex/core/include/physical/TransformTask.h | 31 ++++-- tuplex/core/src/DataSet.cc | 1 - tuplex/core/src/LocalEngine.cc | 38 +++++-- tuplex/core/src/ee/local/LocalBackend.cc | 17 +-- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 12 ++- tuplex/core/src/physical/TransformStage.cc | 9 +- tuplex/core/src/physical/TransformTask.cc | 11 +- tuplex/python/tuplex/dataset.py | 105 ++++++++++++++++++- tuplex/test/core/ContextBasics.cc | 56 +++++++++- tuplex/test/core/ResultSetTest.cc | 5 +- tuplex/test/core/TakeTest.cc | 72 +++++++++++-- 17 files changed, 413 insertions(+), 83 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 9510427e2..3a5f450ac 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -125,7 +125,7 @@ namespace tuplex { /*! * action that displays tuples as nicely formatted table - * @param numRows how many rows to print, i.e. top numRows are printed.xs + * @param numRows how many rows to print, i.e. top numRows are printed.xs, -1 means print all rows * @param os ostream where to print table to */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); @@ -260,21 +260,48 @@ namespace tuplex { * @param memoryLayout * @return */ - virtual DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized); - DataSet& cache(bool storeSpecialized=true) { return cache(Schema::MemoryLayout::ROW, storeSpecialized); } + virtual DataSet &cache(const Schema::MemoryLayout &memoryLayout, bool storeSpecialized); + + DataSet &cache(bool storeSpecialized = true) { return cache(Schema::MemoryLayout::ROW, storeSpecialized); } /*! * helper setter without checks, to update internal column names. */ void setColumns(const std::vector &columnNames) { _columnNames = columnNames; } - // these are actions that cause execution + /*! + * Execute the pipeline and return all outputs + * @param os the logging output + * @return the output of the execution + */ virtual std::shared_ptr collect(std::ostream &os = std::cout); + /*! + * Execute the pipeline and take a subset of the output from the top and bottom rows. + * If both top and bottom rows limit exist, then the top and bottom rows will be concatenated. + * In the case where topLimit + bottomLimit exceeds the output size, all rows will be taken. + * To take all rows, pass in either topLimit=size_t::max(), bottomLimit=size_t::max(), or both. + * @param topLimit number of top rows to take. size_t::max() means taking all rows + * @param bottomLimit number of bottom rows to take. size_t::max() means taking all rows + * @param os the logging output + * @return result of the execution, trim to the size of top and bottom limit. + */ virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); + /*! + * Execute the pipeline and return all outputs as vector + * @param os the logging output + * @return the output of the execution in vector + */ virtual std::vector collectAsVector(std::ostream &os = std::cout); + /*! + * Execute the pipeline and take a subset of the output from the top rows, return as vector + * In the case where numElements exceeds the output size, all rows will be taken. + * @param numElements number of top rows to take. 
size_t::max() means taking all rows + * @param os the logging output + * @return result of the execution in vector, trim to the size of numElements + */ virtual std::vector takeAsVector(size_t numElements, std::ostream &os = std::cout); /*! diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 6fc3219a4..585b70881 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -67,18 +67,20 @@ namespace tuplex { virtual DataSet& aggregateByKey(const UDF& aggCombine, const UDF& aggUDF, const Row& aggInitial, const std::vector &keyColumns) override { return *this; } //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; - virtual std::shared_ptr collect(std::ostream& os) override; + virtual std::shared_ptr collect(std::ostream &os) override; // take / collect will print out the error only - virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; - virtual std::vector collectAsVector(std::ostream& os) override; + virtual std::vector collectAsVector(std::ostream &os) override; - // take / collect will print out the error only - virtual std::vector takeAsVector(size_t numElements, std::ostream& os) override; + /*! + * take / collect will print out the error only, return empty rows + */ + virtual std::vector takeAsVector(size_t numElements, std::ostream &os) override; - DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized) override { + DataSet &cache(const Schema::MemoryLayout &memoryLayout, bool storeSpecialized) override { return *this; } }; diff --git a/tuplex/core/include/LocalEngine.h b/tuplex/core/include/LocalEngine.h index 66ed3a1e8..740a40b4d 100644 --- a/tuplex/core/include/LocalEngine.h +++ b/tuplex/core/include/LocalEngine.h @@ -16,7 +16,28 @@ #include #include "RESTInterface.h" + namespace tuplex { + struct ExecutorConfig { + size_t _size; // size in bytes that each executor should have + size_t _blockSize; // size of individual blocks used (can be used for coarse or fine grained parallelism) + size_t _runTimeMemory; + size_t _runTimeMemoryDefaultBlockSize; + URI _cache_path; + + bool operator==(const ExecutorConfig &rhs) const { + return _size == rhs._size && + _blockSize == rhs._blockSize && + _runTimeMemory == rhs._runTimeMemory && + _runTimeMemoryDefaultBlockSize == rhs._runTimeMemoryDefaultBlockSize && + _cache_path == rhs._cache_path; + } + + bool operator!=(const ExecutorConfig &rhs) const { + return !(rhs == *this); + } + }; + /*! * local execution engine. Provides local executors for a context * THIS IS NOT THREADSAFE. Should be only accessed by driver thread. @@ -25,16 +46,18 @@ namespace tuplex { private: // non-detached executor that serves as the driver - std::unique_ptr _driver; + std::shared_ptr _driver; + ExecutorConfig _driver_cfg; std::vector> _executors; - std::map _refCounts; //! reference counts for each executor + std::map _refCounts; //! 
reference counts for each executor + + LocalEngine(const LocalEngine &); - LocalEngine(const LocalEngine&); - void operator = (const LocalEngine&); + void operator=(const LocalEngine &); // The local task queue - WorkQueue _queue; + WorkQueue _queue; protected: LocalEngine(); @@ -63,25 +86,25 @@ namespace tuplex { * @param cache_path directory where subfolders will be created for all executors to be started * @return array of executor references */ - std::vector getExecutors(const size_t num, - const size_t size, - const size_t blockSize, - const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, - const URI& cache_path); + std::vector getExecutors(const size_t num, + const size_t size, + const size_t blockSize, + const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, + const URI &cache_path); /*! * releases executors (invoked by context) * @param executors * @param ctx */ - void freeExecutors(const std::vector& executors, const Context* ctx=nullptr); + void freeExecutors(const std::vector &executors, const Context *ctx = nullptr); - Executor* getDriver(const size_t size, - const size_t blockSize, - const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, - const URI& cache_path); + std::shared_ptr getDriver(const size_t size, + const size_t blockSize, + const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, + const URI &cache_path); void release(); @@ -89,7 +112,7 @@ namespace tuplex { * retrieves the global work queue for local executors * @return */ - WorkQueue& getQueue() { return _queue; } + WorkQueue &getQueue() { return _queue; } }; } #endif //TUPLEX_LOCALENGINE_H \ No newline at end of file diff --git a/tuplex/core/include/ee/IBackend.h b/tuplex/core/include/ee/IBackend.h index e7a80e5bb..1a543df8f 100644 --- a/tuplex/core/include/ee/IBackend.h +++ b/tuplex/core/include/ee/IBackend.h @@ -29,19 +29,22 @@ namespace tuplex { class IBackend { public: IBackend() = delete; - IBackend(const IBackend& other) = delete; - IBackend(const Context& context) : _context(context) {} + + IBackend(const IBackend &other) = delete; + + IBackend(const Context &context) : _context(context) {} // driver, i.e. where to store local data. - virtual Executor* driver() = 0; - virtual void execute(PhysicalStage* stage) = 0; + virtual Executor *driver() = 0; + + virtual void execute(PhysicalStage *stage) = 0; virtual ~IBackend() {} // virtual destructor needed b.c. of smart pointers - virtual const Context& context() const { return _context; } + virtual const Context &context() const { return _context; } private: - const Context& _context; + const Context &_context; }; inline std::unordered_map, size_t> merge_ecounts(std::unordered_map, size_t> lhs, diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 712f0ae43..3d73a5d9f 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -40,14 +40,15 @@ namespace tuplex { * constructor for convenience * @param context */ - explicit LocalBackend(const Context& context); + explicit LocalBackend(const Context &context); - Executor* driver() override; // for local execution + Executor *driver() override; // for local execution + + void execute(PhysicalStage *stage) override; - void execute(PhysicalStage* stage) override; private: - Executor *_driver; //! driver from local backend... - std::vector _executors; //! drivers to be used + std::shared_ptr _driver; //! driver from local backend... 
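+        // (held as a shared_ptr because LocalEngine may swap in a new driver when the
+        //  executor config changes; the backend keeps its own reference alive)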
+ std::vector _executors; //! drivers to be used std::unique_ptr _compiler; HistoryServerConnection _historyConn; @@ -187,6 +188,7 @@ namespace tuplex { /*! * Trim list of partitions so that it includes up to the first n rows and the last m rows + * if n + m > number of rows in input partitions, the partitions will remain unchanged * @param partitions [in,out] the list of partitions to trim * @param topLimit n, the number of top rows to include * @param bottomLimit m, the number of bottom rows to include @@ -197,7 +199,7 @@ namespace tuplex { TransformStage *tstage, Executor *exec); /*! - * Create a new partition with the same data as the specified partition, but with the first n rows removed + * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed * @param p_in the input partition * @param numToSkip number of rows to remove from the new partition * @param tstage pointer to transform stage, used to generate new partition diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index d966b69ee..8ac5ba6df 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -180,6 +180,7 @@ namespace tuplex { void setOutputPrefix(const char* buf, size_t bufSize); // extra prefix to write first to output. void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); + HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! void setOutputTopLimit(size_t limit) { @@ -190,16 +191,21 @@ namespace tuplex { _outBottomLimit = limit; } - // maxOrder of infinity means disregarding the bottomLimit short circuit - static void setMaxOrderAndResetLimits(size_t maxOrder = std::numeric_limits::max()) { - resetLimits(maxOrder); - } + /*! + * Set the maximum task order number that the current stage execute and reset the row counter. + * This is used to detect and stop the execution when we have reached the rows limit + * @param maxOrder maximum task order number in the pipeline, infinity means disregarding the bottomLimit short circuit + */ + static void setMaxOrderAndResetLimits(size_t maxOrder = std::numeric_limits::max()); void execute() override; bool hasFileSink() const { return _outputFilePath != URI::INVALID; } + bool hasFileSource() const { return _inputFilePath != URI::INVALID; } + bool hasMemorySink() const { return _outputSchema != Schema::UNKNOWN; } + bool hasMemorySource() const { return !_inputPartitions.empty(); } bool hasHashTableSink() const { return _htableFormat != HashTableFormat::UNKNOWN; } HashTableFormat hashTableFormat() const { return _htableFormat; } @@ -218,8 +224,6 @@ namespace tuplex { static codegen::i64_hash_row_f writeInt64HashTableAggregateCallback(); static codegen::write_row_f aggCombineCallback(); - static void resetLimits(size_t maxOrder); - // most be public because of C++ issues -.- int64_t writeRowToMemory(uint8_t* buf, int64_t bufSize); int64_t writeRowToFile(uint8_t* buf, int64_t bufSize); @@ -321,15 +325,26 @@ namespace tuplex { inline int64_t contextID() const { return _contextID; } inline void unlockAllMemorySinks() { // output partition existing? if so unlock - _output.unlock(); - _exceptions.unlock(); + _output.unlock(); + _exceptions.unlock(); } + /*! 
+ * check whether the stage reached both top and bottom limit, to use this one must call + * setMaxOrderAndResetLimits before execution and set both top and bottom limit + * @return true if limit is reached + */ bool limitReached() const; + + /*! + * Update the global stage limit counter, should only be called once, at the end of task + */ void updateLimits(); void processMemorySourceWithExp(); + void processMemorySource(); + void processFileSource(); // exceptions diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 210b3ec60..b62946ae4 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -849,7 +849,6 @@ namespace tuplex { assert(numColumns != 0); // construct headers - std::vector headers(numColumns); if (!_columnNames.empty()) { assert(numColumns == _columnNames.size()); for (auto &c_name: _columnNames) { diff --git a/tuplex/core/src/LocalEngine.cc b/tuplex/core/src/LocalEngine.cc index 02c060a90..c9c6d506b 100644 --- a/tuplex/core/src/LocalEngine.cc +++ b/tuplex/core/src/LocalEngine.cc @@ -98,7 +98,8 @@ namespace tuplex { exec->processQueue(true); std::stringstream ss; - ss<<"started local executor "<name()<<" ("<name() << " (" << sizeToMemString(size) << ", " + << sizeToMemString(blockSize) << " default partition size)"; logger.info(ss.str()); } @@ -107,23 +108,44 @@ namespace tuplex { return execs; } - Executor* LocalEngine::getDriver(const size_t size, const size_t blockSize, const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, const tuplex::URI &cache_path) { - // lazy start driver - if(!_driver) { + std::shared_ptr + LocalEngine::getDriver(const size_t size, const size_t blockSize, const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, const tuplex::URI &cache_path) { + ExecutorConfig new_cfg = ExecutorConfig{ + ._size = size, + ._blockSize = blockSize, + ._runTimeMemory = runTimeMemory, + ._runTimeMemoryDefaultBlockSize = runTimeMemoryDefaultBlockSize, + ._cache_path = cache_path + }; + + if (!_driver || _driver_cfg != new_cfg) { + if (_driver) { + Logger::instance().logger("local execution engine").info( + "driver already exist, starting new driver with updated config"); + _driver->release(); // TODO(march): test whether we need this + } + + // lazy start driver URI uri = URI(cache_path.toString() + "/" + "driver"); - _driver = std::make_unique(size, blockSize, runTimeMemory, runTimeMemoryDefaultBlockSize, uri, "driver"); + _driver = std::make_shared(size, blockSize, runTimeMemory, runTimeMemoryDefaultBlockSize, uri, + "driver"); + _driver_cfg = new_cfg; + // TODO(march): this could be a problem, if multiple driver with number = 0 + // TODO(march): write a test for two drivers existing together (thread number 0) + // TODO(march): make a comment about potential issue here // driver always has thread number 0! 
_driver->setThreadNumber(0); std::stringstream ss; - ss<<"started driver ("< & executors, const Context* ctx) { diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 5616488e7..351d55b88 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -116,14 +116,14 @@ namespace tuplex { } Executor *LocalBackend::driver() { - assert(_driver); - return _driver; + assert(_driver); + return _driver.get(); } void LocalBackend::execute(tuplex::PhysicalStage *stage) { assert(stage); - if(!stage) + if (!stage) return; // history server connection should be established @@ -696,8 +696,6 @@ namespace tuplex { } } - // TODO(march): we can avoid setting order here by pre init g_rowsDone - // assign the order for all tasks for(size_t i = 0; i < tasks.size(); ++i) { tasks[i]->setOrder(i); @@ -899,7 +897,8 @@ namespace tuplex { auto output_par = tstage->inputPartitions(); if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, _driver); + trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, + _driver.get()); } tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects); pyObjects.clear(); @@ -1217,7 +1216,8 @@ namespace tuplex { if (tstage->hasOutputLimit()) { // the function expect the output to be sorted in ascending order (guaranteed by sortTasks()) - trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, _driver); + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, + _driver.get()); } tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); @@ -2232,8 +2232,9 @@ namespace tuplex { ptr += sizeof(int64_t); size_t numBytesToSkip = 0; + Deserializer ds(tstage->outputSchema()); for (unsigned i = 0; i < numToSkip; ++i) { - Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); + Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); ptr += r.serializedLength(); numBytesToSkip += r.serializedLength(); } diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index f289064d5..e88189447 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -240,7 +240,7 @@ namespace tuplex { // user wants to merge exceptions in order. bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER(); - // create transfrom stage via builder pattern + // create transform stage via builder pattern auto builder = codegen::StageBuilder(_num_stages++, isRootStage, _context.getOptions().UNDEFINED_BEHAVIOR_FOR_OPERATORS(), diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index bfd656dc8..0eb6d95ad 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -127,15 +127,19 @@ namespace tuplex { return vector{}; Deserializer ds(_schema); - for(int i = 0; i < limit;) { + for (size_t i = 0; i < limit;) { // all exhausted - if(_partitions.empty()) + if (_partitions.empty()) break; // get number of rows in first partition Partition *first = _partitions.front(); auto num_rows = first->getNumRows(); + + assert(num_rows >= _curRowCounter); + assert(limit >= i); + // how many left to retrieve? 
auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); assert(num_to_retrieve_from_partition >= 0); @@ -145,8 +149,8 @@ namespace tuplex { // thread safe version (slow) // get next element of partition - const uint8_t* ptr = first->lock(); - for(int j = 0; j < num_to_retrieve_from_partition; ++j) { + const uint8_t *ptr = first->lock(); + for (size_t j = 0; j < num_to_retrieve_from_partition; ++j) { auto row = Row::fromMemory(ds, ptr + _byteCounter, first->capacity() - _byteCounter); _byteCounter += row.serializedLength(); _curRowCounter++; diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 060365697..b0e2e70ab 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -139,10 +139,17 @@ namespace tuplex { } } + size_t maxRows; + if (hasOutputLimit()) { + maxRows = outputTopLimit() + outputBottomLimit(); + } else { + maxRows = std::numeric_limits::max(); + } + // put ALL partitions to result set _rs = std::make_shared(schema, partitions, generalCase, partitionToExceptionsMap, interpreterRows, - outputTopLimit() + outputBottomLimit()); + maxRows); } } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 8ebe18a7b..2de71e4fe 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -18,12 +18,20 @@ #include namespace tuplex { + // this is a logic to stop the execution once it has reached the topLimit and bottomLimit + // here, we assume that task order starts with zero and count up by 1, e.g. 0, 1, 2, ..., n + // To implement limit, we maintain a mapping from the task order to the number of rows done in that task + // (rows done are either 0 or #output rows after processing) + // we can then find out how many top rows are done by looking at g_rowsDone[0], g_rowsDone[1], ... + // until we reach some segment that's 0 + // likewise, we can find the bottom rows done by looking at g_rowsDone[g_maxOrder], g_rowsDone[g_maxOrder - 1], ... + // mapping from order number -> row count if the task is finished static std::mutex g_rowsDoneMutex; static std::unordered_map g_rowsDone; static std::atomic_size_t g_maxOrder; - void TransformTask::resetLimits(size_t maxOrder) { + void TransformTask::setMaxOrderAndResetLimits(size_t maxOrder) { g_rowsDone.clear(); g_maxOrder = maxOrder; } @@ -617,7 +625,6 @@ namespace tuplex { } } - // TODO: what is the max task number here if (_outBottomLimit == 0) { isBottomLimitReached = true; } else { diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 376134934..7eda223a1 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -172,6 +172,15 @@ def show(self, nrows=None): self._dataSet.show(nrows) + def _getHTMLRow(self, ind, row): + row_str = "" + row_str += " \n" + row_str += " {}\n".format(ind) + for col in row: + row_str += " {}\n".format(col) + row_str += " \n" + return row_str + def showHTMLPreview(self, topLimit=5, bottomLimit=5): """ action that generates a physical plan, processes data and return a subset of results as nicely formatted HTML table to stdout. @@ -183,14 +192,108 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): Returns: string: an HTML table showing a preview of the data """ + HTML_TEMPLATE = ( + "
\n" + "\n" + "\n" + " \n" + " \n" + "{}" + " \n" + " \n" + " \n" + "{}" + " \n" + "
\n" + "

{} columns

\n" + "
") + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + # TODO(march): edit this top/bottom limit if topLimit is None or topLimit < 0: topLimit = -1 if bottomLimit is None or bottomLimit < 0: bottomLimit = -1 - return self._dataSet.showHTMLPreview(topLimit, bottomLimit) + rows = self.take(topLimit, bottomLimit) + + if len(rows) == 0: + return HTML_TEMPLATE.format("\n", "\n") + + assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit + + headers_str = "" + body = "" + num_columns = None + + # construct tables + if len(rows) < topLimit + bottomLimit: + # the data is small so we get everything (no need to render ...) + i = 0 + for r in rows: + if i == 0: + # we set num columns based on the first row + num_columns = r.getNumColumns() + body += self._getHTMLRow(i, r) + i += 1 + else: + # some data is not processed because of limiting + i = 0 + for r in rows: + if i >= topLimit: + break + if i == 0: + # we set num columns based on the first row + num_columns = r.getNumColumns() + + body += self._getHTMLRow(i, r) + i += 1 + + # add the ... + body += " \n" + body += " ...\n" + for _ in range(num_columns): + body += " ...\n" + body += " \n" + + for j in range(i, len(rows)): + body += self._getHTMLRow(i, rows[j]) + + assert num_columns is not None + + # construct headers + column_names = self._dataSet.columns() + if column_names is not None: + assert (num_columns == column_names.size()) + for c_name in column_names: + headers_str += " {}\n".format(c_name) + else: + # default to generic name if column name doesn't exist + for i in range(num_columns): + headers_str += " Column {}\n".format(i) + + return HTML_TEMPLATE.format(headers_str, body, num_columns) + + def _getConsoleRow(self, ind, row): + # TODO(march): (work on this) + pass + + def showConsolePreview(self, topLimit=5, bottomLimit=5): + # TODO(march): (work on this) + pass def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. diff --git a/tuplex/test/core/ContextBasics.cc b/tuplex/test/core/ContextBasics.cc index fdbdd8d50..0be3c6030 100644 --- a/tuplex/test/core/ContextBasics.cc +++ b/tuplex/test/core/ContextBasics.cc @@ -136,4 +136,58 @@ TEST_F(ContextBasicsTest, JSON) { auto str = ContextOptions::defaults().asJSON(); EXPECT_GT(str.length(), 2); -} \ No newline at end of file +} + +TEST_F(ContextBasicsTest, twoContextTest) { + using namespace tuplex; + + python::initInterpreter(); + python::unlockGIL(); + + ContextOptions co = testOptions(); + co.set("tuplex.partitionSize", "100B"); + co.set("tuplex.executorMemory", "1MB"); + co.set("tuplex.scratchDir", scratchDir + "/context1"); + + // second context with different executor config, should cause the driver to split up + ContextOptions co2 = testOptions(); + co.set("tuplex.partitionSize", "100B"); + co2.set("tuplex.executorMemory", "2MB"); + co2.set("tuplex.scratchDir", scratchDir + "/context2"); + + Context c1(co); + Context c2(co2); + Row row1(Tuple(0), Tuple("hello")); + Row row2(Tuple(1), Tuple("this")); + Row row3(Tuple(2), Tuple("is")); + Row row4(Tuple(3), Tuple("a")); + Row row5(Tuple(4), Tuple("test")); + + for (int t = 0; t < 10; t++) { + auto ds1 = c1.parallelize({row1, row2, row3, row4, row5}) + .map(UDF("lambda x: x[1][0]")); // new code: string index operator! first to raise an exception! 
+ + auto ds2 = c2.parallelize({row1, row2, row3, row4, row5}) + .map(UDF("lambda x: x[1][0]")); // new code: string index operator! first to raise an exception! + + auto v1 = ds1.collectAsVector(); + auto v2 = ds2.collectAsVector(); + + std::vector ref{"hello", "this", "is", "a", "test"}; + + EXPECT_EQ(v1.size(), 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(v1[i].getString(0), ref[i]); + } + + EXPECT_EQ(v2.size(), 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(v2[i].getString(0), ref[i]); + } + } + + python::lockGIL(); + python::closeInterpreter(); +} + +// TODO(march): multiple context test \ No newline at end of file diff --git a/tuplex/test/core/ResultSetTest.cc b/tuplex/test/core/ResultSetTest.cc index 4acd38921..2ea273062 100644 --- a/tuplex/test/core/ResultSetTest.cc +++ b/tuplex/test/core/ResultSetTest.cc @@ -14,7 +14,7 @@ class ResultSetTest : public PyTest { protected: - tuplex::Executor *driver; + std::shared_ptr driver; tuplex::ContextOptions options; public: // init function @@ -45,7 +45,8 @@ class ResultSetTest : public PyTest { EXPECT_EQ(r.getRowType(), first_type); // now write via partition writer - tuplex::PartitionWriter pw(driver, Schema(Schema::MemoryLayout::ROW, first_type), 0, 0, options.PARTITION_SIZE()); + tuplex::PartitionWriter pw(driver.get(), Schema(Schema::MemoryLayout::ROW, first_type), 0, 0, + options.PARTITION_SIZE()); for(const auto& r : rows) pw.writeRow(r); return pw.getOutputPartitions(); diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 98edecb41..86173e40b 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -205,16 +205,16 @@ vector generateMapFilterReferenceData(const vector &input, size_t topL } assert(input[0].getNumColumns() == 3); - vector intermedate; + vector intermediate; for (const Row &r: input) { int64_t new_a = r.getInt(0) + r.getInt(1); if (new_a % 2 == 0) { - intermedate.emplace_back(new_a, r.getInt(2)); + intermediate.emplace_back(new_a, r.getInt(2)); } } - return generateReferenceData(intermedate, topLimit, bottomLimit); + return generateReferenceData(intermediate, topLimit, bottomLimit); } TEST_F(TakeTest, takeMapFilterTest) { @@ -256,9 +256,6 @@ TEST_F(TakeTest, takeMapFilterTest) { } } -// TODO(march): with file input -// context.csv("../resources/"); - TEST_F(TakeTest, collectIdentityTest) { mt19937 data_seed_gen(123454); @@ -284,4 +281,67 @@ TEST_F(TakeTest, collectIdentityTest) { } } +TEST_F(TakeTest, fileInputTest) { + const std::vector test_size{1, 10, 100, 1001, 50001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + std::vector> expected_outputs; + + if (!boost::filesystem::exists(scratchDir)) { + boost::filesystem::create_directory(scratchDir); + } + + std::vector fileInputNames; + for (unsigned long N: test_size) { + std::vector ref_output; + // write temp file + auto fName = fmt::format("{}/{}-{}.csv", scratchDir, testName, N); + + FILE *fp = fopen(fName.c_str(), "w"); + ASSERT_TRUE(fp); + fprintf(fp, "colA,colStr,colB\n"); + for (int i = 0; i < N; ++i) { + fprintf(fp, "%d,\"hello%d\",%d\n", i, (i * 3) % 7, i % 15); + ref_output.emplace_back(i, fmt::format("hello{}", (i * 3) % 7), (i % 15) * (i % 15)); + } + fclose(fp); + + expected_outputs.push_back(std::move(ref_output)); + fileInputNames.push_back(fName); + } + + ASSERT_TRUE(expected_outputs.size() == test_size.size()); + ASSERT_TRUE(fileInputNames.size() == test_size.size()); + + for (auto &part_size: partition_sizes) { + auto opt = 
microTestOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (int t = 0; t < test_size.size(); t++) { + const size_t data_size = test_size[t]; + + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "file testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto ref_output = generateReferenceData(expected_outputs[t], top_limit, bottom_limit); + auto res = context.csv(testName + ".csv") + .mapColumn("colB", UDF("lambda x: x * x")) + .take(top_limit, bottom_limit); + + ASSERT_EQ(ref_output.size(), res->rowCount()); + for (Row &r: ref_output) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} + // TODO(march): write test for trimPartitionsToLimit \ No newline at end of file From 2005458822a3e8f03c9eb5cea95c04f13178d6b3 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 15 Apr 2022 02:50:20 -0400 Subject: [PATCH 20/56] Add file testcases --- tuplex/core/src/LocalEngine.cc | 6 ++---- tuplex/test/core/ContextBasics.cc | 4 +--- tuplex/test/core/TakeTest.cc | 22 ++++++++++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tuplex/core/src/LocalEngine.cc b/tuplex/core/src/LocalEngine.cc index c9c6d506b..91892d44d 100644 --- a/tuplex/core/src/LocalEngine.cc +++ b/tuplex/core/src/LocalEngine.cc @@ -123,7 +123,6 @@ namespace tuplex { if (_driver) { Logger::instance().logger("local execution engine").info( "driver already exist, starting new driver with updated config"); - _driver->release(); // TODO(march): test whether we need this } // lazy start driver @@ -132,10 +131,9 @@ namespace tuplex { "driver"); _driver_cfg = new_cfg; - // TODO(march): this could be a problem, if multiple driver with number = 0 - // TODO(march): write a test for two drivers existing together (thread number 0) - // TODO(march): make a comment about potential issue here // driver always has thread number 0! + // Note: this could be a potential issue if the config change and the old driver is still running + // due to external reference. 
Then there could be two executors with the same number _driver->setThreadNumber(0); std::stringstream ss; diff --git a/tuplex/test/core/ContextBasics.cc b/tuplex/test/core/ContextBasics.cc index 0be3c6030..e85107b40 100644 --- a/tuplex/test/core/ContextBasics.cc +++ b/tuplex/test/core/ContextBasics.cc @@ -188,6 +188,4 @@ TEST_F(ContextBasicsTest, twoContextTest) { python::lockGIL(); python::closeInterpreter(); -} - -// TODO(march): multiple context test \ No newline at end of file +} \ No newline at end of file diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 86173e40b..eda609518 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -282,9 +282,9 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 10, 100, 1001, 50001}; - const std::vector limit_values{0, 1, 5, 11, 600, 10000}; - const std::vector partition_sizes{"256B", "512KB", "1MB"}; + const std::vector test_size{1, 10, 1001, 50001}; + const std::vector limit_values{0, 1, 6, 600, 10000}; + const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; if (!boost::filesystem::exists(scratchDir)) { @@ -327,21 +327,23 @@ TEST_F(TakeTest, fileInputTest) { << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; auto ref_output = generateReferenceData(expected_outputs[t], top_limit, bottom_limit); - auto res = context.csv(testName + ".csv") + auto res = context.csv(fileInputNames[t]) .mapColumn("colB", UDF("lambda x: x * x")) .take(top_limit, bottom_limit); ASSERT_EQ(ref_output.size(), res->rowCount()); for (Row &r: ref_output) { Row res_row = res->getNextRow(); - if (!(res_row == r)) { - ASSERT_EQ(res_row, r); - } + ASSERT_EQ(res_row.getInt(0), r.getInt(0)); + ASSERT_EQ(res_row.getString(1), r.getString(1)); + ASSERT_EQ(res_row.getInt(2), r.getInt(2)); + // TODO(march): this doesn't work because schema are different (for some reason infer as opt[int]?) 
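+                        // (whole-row equality fails here since the CSV reader infers the int
+                        //  columns as opt[int], while the reference rows carry plain ints)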
+ // if (!(res_row == r)) { + // ASSERT_EQ(res_row, r); + // } } } } } } -} - -// TODO(march): write test for trimPartitionsToLimit \ No newline at end of file +} \ No newline at end of file From 41b04a75e945865bbaca5f230ccfd65fc14b5629 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Tue, 19 Apr 2022 23:45:01 -0400 Subject: [PATCH 21/56] Python Dataset Debug --- tuplex/python/src/PythonDataSet.cc | 4 + tuplex/python/tuplex/dataset.py | 128 +++++++++++++++------ tuplex/python/tuplex/utils/table_format.py | 80 +++++++++++++ tuplex/test/core/TakeTest.cc | 8 +- 4 files changed, 178 insertions(+), 42 deletions(-) create mode 100644 tuplex/python/tuplex/utils/table_format.py diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 1f543e5d2..5382ad24d 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -133,11 +133,15 @@ namespace tuplex { size_t castedTopLimit = 0; if (topLimit < 0) { castedTopLimit = std::numeric_limits::max(); + } else { + castedTopLimit = topLimit; } size_t castedBottomLimit = 0; if (bottomLimit < 0) { castedBottomLimit = std::numeric_limits::max(); + } else { + castedBottomLimit = bottomLimit; } try { diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 7eda223a1..c0b9ef4d0 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -19,6 +19,7 @@ from tuplex.utils.framework import UDFCodeExtractionError from tuplex.utils.source_vault import SourceVault from .exceptions import classToExceptionCode +import tuplex.utils.table_format as table_format # signed 64bit limit max_rows = 9223372036854775807 @@ -29,7 +30,10 @@ def __init__(self): self._dataSet = None def _repr_html_(self): - return self._dataSet.showHTMLPreview() + return self.showHTMLPreview() + + def __repr__(self): + return self.showStrPreview() def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. @@ -172,15 +176,6 @@ def show(self, nrows=None): self._dataSet.show(nrows) - def _getHTMLRow(self, ind, row): - row_str = "" - row_str += " \n" - row_str += " {}\n".format(ind) - for col in row: - row_str += " {}\n".format(col) - row_str += " \n" - return row_str - def showHTMLPreview(self, topLimit=5, bottomLimit=5): """ action that generates a physical plan, processes data and return a subset of results as nicely formatted HTML table to stdout. @@ -195,17 +190,17 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): HTML_TEMPLATE = ( "
\n" "\n" "\n" " \n" @@ -222,16 +217,10 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - # TODO(march): edit this top/bottom limit - if topLimit is None or topLimit < 0: - topLimit = -1 - if bottomLimit is None or bottomLimit < 0: - bottomLimit = -1 - rows = self.take(topLimit, bottomLimit) if len(rows) == 0: - return HTML_TEMPLATE.format("\n", "\n") + return HTML_TEMPLATE.format("\n", "\n", 0) assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit @@ -246,8 +235,8 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): for r in rows: if i == 0: # we set num columns based on the first row - num_columns = r.getNumColumns() - body += self._getHTMLRow(i, r) + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + body += table_format.getHTMLRow(i, r) i += 1 else: # some data is not processed because of limiting @@ -257,9 +246,9 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): break if i == 0: # we set num columns based on the first row - num_columns = r.getNumColumns() + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 - body += self._getHTMLRow(i, r) + body += table_format.getHTMLRow(i, r) i += 1 # add the ... @@ -270,14 +259,15 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): body += " \n" for j in range(i, len(rows)): - body += self._getHTMLRow(i, rows[j]) + body += table_format.getHTMLRow(len(rows) - j, rows[j]) assert num_columns is not None # construct headers column_names = self._dataSet.columns() - if column_names is not None: - assert (num_columns == column_names.size()) + headers_str += " \n" + if len(column_names) > 0: + assert (num_columns == len(column_names)) for c_name in column_names: headers_str += " \n".format(c_name) else: @@ -287,13 +277,79 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): return HTML_TEMPLATE.format(headers_str, body, num_columns) - def _getConsoleRow(self, ind, row): - # TODO(march): (work on this) - pass + def showStrPreview(self, topLimit=5, bottomLimit=5): + """ action that generates a physical plan, processes data and return a subset of results as nicely formatted + ASCII table to stdout. + + Args: + topLimit (int): number of top rows to collect. If ``None`` all rows will be collected + bottomLimit (int): number of bottom rows to collect. If ``None`` all rows will be collected + + Returns: + string: an HTML table showing a preview of the data + """ + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + rows = self.take(topLimit, bottomLimit) + + if len(rows) == 0: + return ( + "---\n" + "| |\n" + "---\n" + "0 columns\n") + + assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit + + str_table = [] + num_columns = None + + # construct tables + if len(rows) < topLimit + bottomLimit: + # the data is small so we get everything (no need to render ...) 
+ i = 0 + for r in rows: + if i == 0: + # we set num columns based on the first row + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + str_table.append(table_format.getStrTableRow(i, r)) + i += 1 + else: + # some data is not processed because of limiting + i = 0 + for r in rows: + if i >= topLimit: + break + if i == 0: + # we set num columns based on the first row + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + + str_table.append(table_format.getStrTableRow(i, r)) + i += 1 + + # add the ... + str_table.append(["..."] * (num_columns + 1)) + + for j in range(i, len(rows)): + str_table.append(table_format.getStrTableRow(len(rows) - j, rows[j])) + + assert num_columns is not None + + # construct headers + column_names = self._dataSet.columns() + headers_list = [""] + if len(column_names) > 0: + assert (num_columns == len(column_names)) + for c_name in column_names: + headers_list.append("{}".format(c_name)) + else: + # default to generic name if column name doesn't exist + for i in range(num_columns): + headers_list.append("Column {}".format(i)) + + str_table = [headers_list] + str_table - def showConsolePreview(self, topLimit=5, bottomLimit=5): - # TODO(march): (work on this) - pass + return table_format.generateStrTable(num_columns + 1, str_table) def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py new file mode 100644 index 000000000..bb83118b4 --- /dev/null +++ b/tuplex/python/tuplex/utils/table_format.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# ----------------------------------------------------------------------------------------------------------------------# +# # +# Tuplex: Blazing Fast Python Data Science # +# # +# # +# (c) 2017 - 2021, Tuplex team # +# Created by Leonhard Spiegelberg first on 4/19/2022 # +# License: Apache 2.0 # +# ----------------------------------------------------------------------------------------------------------------------# + +def getHTMLRow(ind, row): + """ + Given a row, converts all the contents to an HTML row and return + :param ind: the index of that row + :param row: a row output from dataset + :return: an HTML row, representative of the row + """ + row_str = "" + row_str += " \n" + row_str += " \n".format(ind) + if isinstance(row, list) or isinstance(row, tuple): + for col in row: + row_str += " \n".format(col) + else: + row_str += " \n".format(row) + row_str += " \n" + return row_str + + +def getStrTableRow(ind, row): + """ + Given a row, converts all the contents to string and return + :param ind: the index of that row + :param row: a row output from dataset + :return: a list of string, representative of the row + """ + row_str_list = ["{}".format(ind)] + if isinstance(row, list) or isinstance(row, tuple): + for col in row: + row_str_list.append("{}".format(col)) + else: + row_str_list.append("{}".format(row)) + return row_str_list + + +def _getLineDivider(col_width): + out = "" + for w in col_width: + out += "+" + ("-" * (w + 2)) + out += "+\n" + + return out + +def generateStrTable(numCols, strTable): + """ + Given a 2-dimensional list of strings, print a nicely formatted table of the contents in the list + :param numCols: number of columns in the table + :param strTable: 2-dimensional list of strings, as list of list + :return: a nicely formatted table in string + """ + 
max_col_width = [0] * numCols + + for r in strTable: + for i in range(0, len(r)): + assert (isinstance(r[i], str)) + if len(r[i]) > max_col_width[i]: + max_col_width[i] = len(r[i]) + + output_str = "" + + for r in strTable: + output_str += _getLineDivider(max_col_width) + for i in range(0, len(r)): + output_str += "| {:<{width}} ".format(r[i], width=max_col_width[i]) + output_str += "|\n" + + output_str += _getLineDivider(max_col_width) + "{} columns\n".format(numCols) + + return output_str diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index eda609518..4e4a70f53 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -282,8 +282,8 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 10, 1001, 50001}; - const std::vector limit_values{0, 1, 6, 600, 10000}; + const std::vector test_size{1, 1001, 50001}; + const std::vector limit_values{0, 1, 600, 10000}; const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; @@ -337,10 +337,6 @@ TEST_F(TakeTest, fileInputTest) { ASSERT_EQ(res_row.getInt(0), r.getInt(0)); ASSERT_EQ(res_row.getString(1), r.getString(1)); ASSERT_EQ(res_row.getInt(2), r.getInt(2)); - // TODO(march): this doesn't work because schema are different (for some reason infer as opt[int]?) - // if (!(res_row == r)) { - // ASSERT_EQ(res_row, r); - // } } } } From fc751f190b4c6f97af2b60f46b3fa2c25675ae7f Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Tue, 19 Apr 2022 23:47:17 -0400 Subject: [PATCH 22/56] Remove showHTMLPreview from Dataset in C++ --- tuplex/core/include/DataSet.h | 8 -- tuplex/core/src/DataSet.cc | 108 -------------------------- tuplex/python/include/PythonDataSet.h | 1 - tuplex/python/src/PythonBindings.cc | 1 - tuplex/python/src/PythonDataSet.cc | 49 ------------ 5 files changed, 167 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 3a5f450ac..1b11c1f75 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -130,14 +130,6 @@ namespace tuplex { */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); - /*! - * Displays a formatted HTML table of a small portion of the data - * @param topLimit how many top rows to print - * @param bottomLimit how many bottom rows to print - * @param os ostream where to print table to - */ - virtual void showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); - // named dataset management functions /*! * map Column using a UDF diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index b62946ae4..d54edb567 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -756,114 +756,6 @@ namespace tuplex { printTable(os, headers, rows); } - void printHTMLRow(std::ostream &os, size_t ind, const Row& r) { - os << " \n"; - os << fmt::format(" \n", ind); - for (auto& s : r.getAsStrings()) { - os << fmt::format(" \n", s); - } - os << " \n"; - } - - void DataSet::showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os) { - std::string HTML_TEMPLATE = - "
\n" - "\n" - "
{}
{}{}{}
{}{}
\n" - " \n" - " \n" - "{}" - " \n" - " \n" - " \n" - "{}" - " \n" - "
\n" - "

{} columns

\n" - "
"; - - assert(_context); - - auto rows = take(topLimit, bottomLimit); - - if (rows->rowCount() == 0) { - os << fmt::format(HTML_TEMPLATE, "\n", "\n", 0); - return; - } - - std::stringstream headers_stream, body_stream; - size_t numColumns = 0; - assert(rows->rowCount() <= topLimit + bottomLimit); - - // construct tables - if (rows->rowCount() < topLimit + bottomLimit) { - // the data is small so we get everything (no need to render ...) - for (size_t i = 0; rows->hasNextRow(); i++) { - Row r = rows->getNextRow(); - if (i == 0) { - // we set num columns based on the first row - numColumns = r.getNumColumns(); - } - - printHTMLRow(body_stream, i, r); - } - } else { - // some data is not processed because of limiting - size_t i; - for (i = 0; rows->hasNextRow() && i < topLimit; i++) { - Row r = rows->getNextRow(); - if (i == 0) { - // we set num columns based on the first row - numColumns = r.getNumColumns(); - } - - printHTMLRow(body_stream, i, r); - } - - // add the ... - body_stream << " \n"; - body_stream << " ...\n"; - for(int j = 0; j < numColumns; j++) { - body_stream << " ...\n"; - body_stream << " \n"; - } - - while (rows->hasNextRow()) { - Row r = rows->getNextRow(); - printHTMLRow(body_stream, i, r); - } - } - - assert(numColumns != 0); - - // construct headers - if (!_columnNames.empty()) { - assert(numColumns == _columnNames.size()); - for (auto &c_name: _columnNames) { - headers_stream << fmt::format(" {}\n", c_name); - } - } else { - // default to generic name if column name doesn't exist - for (int i = 0; i < numColumns; ++i) { - headers_stream << fmt::format(" Column {}\n", i); - } - } - - os << fmt::format(HTML_TEMPLATE, headers_stream.str(), body_stream.str(), numColumns); - } - Schema DataSet::schema() const { if(!_operator) return Schema::UNKNOWN; diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 4761ac7f0..ede482d9c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -79,7 +79,6 @@ namespace tuplex { py::object collect(); py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); - std::string showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit); // DataFrame like operations PythonDataSet mapColumn(const std::string& column, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index ab239a1a2..6b3683853 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -41,7 +41,6 @@ PYMODULE { py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) - .def("showHTMLPreview", &tuplex::PythonDataSet::showHTMLPreview) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) .def("map", &tuplex::PythonDataSet::map) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 5382ad24d..ec972a899 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -884,55 +884,6 @@ namespace tuplex { } } - std::string PythonDataSet::showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit) { - // make sure a dataset is wrapped - assert(this->_dataset); - - // is callee error dataset? 
if so return list with error string - if (this->_dataset->isError()) { - auto errset = dynamic_cast(this->_dataset); - assert(errset); - return "Error: " + errset->getError(); - } else { - // release GIL & hand over everything to Tuplex - assert(PyGILState_Check()); // make sure this thread holds the GIL! - python::unlockGIL(); - - std::stringstream ss; - std::string err_message; - - size_t castedTopLimit = 0; - if (topLimit < 0) { - castedTopLimit = std::numeric_limits::max(); - } - - size_t castedBottomLimit = 0; - if (bottomLimit < 0) { - castedBottomLimit = std::numeric_limits::max(); - } - - try { - this->_dataset->showHTMLPreview(castedTopLimit, castedBottomLimit, ss); - } catch (const std::exception &e) { - err_message = e.what(); - Logger::instance().defaultLogger().error(err_message); - } catch (...) { - err_message = "unknown C++ exception occurred, please change type."; - Logger::instance().defaultLogger().error(err_message); - } - - // reacquire GIL - python::lockGIL(); - Logger::instance().flushToPython(); - - if (!ss.str().empty() && err_message.empty()) { - return ss.str(); - } else { - return "Error occurred: " + err_message; - } - } - } - PyObject* PythonDataSet::anyToCPythonWithPyObjects(ResultSet* rs, size_t maxRowCount) { assert(rs); From 6b5c692e353582b9a012d26d967ea939f8236c84 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 20 Apr 2022 00:15:03 -0400 Subject: [PATCH 23/56] Separate out partition utils --- tuplex/core/include/PartitionUtils.h | 46 +++++++ tuplex/core/include/ee/local/LocalBackend.h | 22 ---- tuplex/core/src/PartitionUtils.cc | 138 ++++++++++++++++++++ tuplex/core/src/ee/local/LocalBackend.cc | 125 +----------------- tuplex/python/tuplex/utils/table_format.py | 2 +- tuplex/test/core/TakeTest.cc | 2 +- 6 files changed, 187 insertions(+), 148 deletions(-) create mode 100644 tuplex/core/include/PartitionUtils.h create mode 100644 tuplex/core/src/PartitionUtils.cc diff --git a/tuplex/core/include/PartitionUtils.h b/tuplex/core/include/PartitionUtils.h new file mode 100644 index 000000000..d247edcfc --- /dev/null +++ b/tuplex/core/include/PartitionUtils.h @@ -0,0 +1,46 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by March Boonyapaluk first on 4/19/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#ifndef TUPLEX_PARTITIONUTILS_H +#define TUPLEX_PARTITIONUTILS_H + +#include +#include +#include + +namespace tuplex { + /*! + * Trim list of partitions so that it includes up to the first n rows and the last m rows + * if n + m > number of rows in input partitions, the partitions will remain unchanged + * @param partitions [in,out] the list of partitions to trim + * @param topLimit n, the number of top rows to include + * @param bottomLimit m, the number of bottom rows to include + * @param tstage pointer to transform stage, might be used to generate new partition + * @param exec pointer to executor, might be used to allocate new partition + */ + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, + TransformStage *tstage, Executor *exec); + + /*! 
+ * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed + * @param p_in the input partition + * @param numToSkip number of rows to remove from the new partition + * @param tstage pointer to transform stage, used to generate new partition + * @param exec pointer to executor, used to allocate new partition + * @return the new partition + */ + Partition *newPartitionWithSkipRows(Partition *p_in, + size_t numToSkip, + TransformStage *tstage, + Executor *exec); + +} + +#endif //TUPLEX_PARTITIONUTILS_H diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 3d73a5d9f..7f42ff1cb 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -185,28 +185,6 @@ namespace tuplex { * @return */ extern URI outputURI(const UDF& udf, const URI& baseURI, int64_t partNo, FileFormat fmt); - - /*! - * Trim list of partitions so that it includes up to the first n rows and the last m rows - * if n + m > number of rows in input partitions, the partitions will remain unchanged - * @param partitions [in,out] the list of partitions to trim - * @param topLimit n, the number of top rows to include - * @param bottomLimit m, the number of bottom rows to include - * @param tstage pointer to transform stage, might be used to generate new partition - * @param exec pointer to executor, might be used to allocate new partition - */ - extern void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage *tstage, Executor *exec); - - /*! - * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed - * @param p_in the input partition - * @param numToSkip number of rows to remove from the new partition - * @param tstage pointer to transform stage, used to generate new partition - * @param exec pointer to executor, used to allocate new partition - * @return the new partition - */ - extern Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec); } #endif //TUPLEX_LOCALBACKEND_H \ No newline at end of file diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc new file mode 100644 index 000000000..745332c93 --- /dev/null +++ b/tuplex/core/src/PartitionUtils.cc @@ -0,0 +1,138 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by March Boonyapaluk first on 4/19/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include "PartitionUtils.h" + +namespace tuplex { + + void trimPartitionsToLimit(std::vector &partitions, + size_t topLimit, + size_t bottomLimit, + TransformStage* tstage, + Executor *exec) { + std::vector limitedPartitions, limitedTailPartitions; + + // check top output limit, adjust partitions if necessary + size_t numTopOutputRows = 0; + Partition *lastTopPart = nullptr; + size_t clippedTop = 0; + for (auto partition: partitions) { + numTopOutputRows += partition->getNumRows(); + lastTopPart = partition; + if (numTopOutputRows >= topLimit) { + // clip last partition & leave loop + clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); + assert(clippedTop <= 
partition->getNumRows()); + break; + } else if (partition == partitions.back()) { + // last partition, mark full row, but don't put to output set yet to avoid double put + clippedTop = partition->getNumRows(); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } + } + + // check the bottom output limit, adjust partitions if necessary + size_t numBottomOutputRows = 0; + size_t clippedBottom = 0; + for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { + auto partition = *it; + numBottomOutputRows += partition->getNumRows(); + + if (partition == lastTopPart) { + // the bottom and the top partitions are overlapping + clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows()); + if (clippedTop + clippedBottom >= partition->getNumRows()) { + // if top and bottom range intersect, use full partitions + clippedTop = partition->getNumRows(); + clippedBottom = 0; + } + break; + } else if (numBottomOutputRows >= bottomLimit) { + // clip last partition & leave loop + auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + if (clipped > 0) { + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, + exec); + assert(newPart->getNumRows() == clipped); + limitedTailPartitions.push_back(newPart); + } + partition->invalidate(); + break; + } else { + // put full partition to output set + limitedTailPartitions.push_back(partition); + } + } + + // push the middle partition + if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { + assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); + + // split into two partitions with both top and bottom are in the same partition + Partition *lastBottomPart = nullptr; + + if (clippedBottom != 0) { + lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, + tstage, exec); + } + + if (clippedTop != 0) { + lastTopPart->setNumRows(clippedTop); + limitedPartitions.push_back(lastTopPart); + } else { + lastTopPart->invalidate(); + } + + if (lastBottomPart != nullptr) { + limitedPartitions.push_back(lastBottomPart); + } + } + + // merge the head and tail partitions + partitions.clear(); + partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); + partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); + } + + Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { + auto ptr = p_in->lockRaw(); + auto num_rows = *((int64_t *) ptr); + assert(numToSkip < num_rows); + + ptr += sizeof(int64_t); + size_t numBytesToSkip = 0; + + Deserializer ds(tstage->outputSchema()); + for (unsigned i = 0; i < numToSkip; ++i) { + Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); + ptr += r.serializedLength(); + numBytesToSkip += r.serializedLength(); + } + + Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); + assert(p_out->capacity() >= p_in->size() - numBytesToSkip); + + auto ptr_out = p_out->lockRaw(); + *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; + ptr_out += sizeof(int64_t); + memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); + p_out->unlock(); + + p_in->unlock(); + + return p_out; + } +} // namespace tuplex \ No newline at end of file 
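The row accounting above is easiest to sanity-check against a plain list model. A minimal Python sketch of the intended semantics of trimPartitionsToLimit (hypothetical helper name; partitions modeled as lists of rows, not Tuplex Partition objects):

    def trim_to_limit(parts, top, bottom):
        # Keep the first `top` and last `bottom` rows overall; when
        # top + bottom covers every row, return the input unchanged,
        # mirroring the contract in the header's doc comment.
        total = sum(len(p) for p in parts)
        if top + bottom >= total:
            return parts
        rows = [r for p in parts for r in p]
        out = [rows[:top]] if top else []
        if bottom:
            out.append(rows[total - bottom:])
        return out

    assert trim_to_limit([[1, 2, 3], [4, 5, 6]], 2, 1) == [[1, 2], [6]]
    assert trim_to_limit([[1, 2], [3]], 2, 2) == [[1, 2], [3]]

Unlike the C++ version, this model ignores partition boundaries and allocation; it only pins down which rows survive.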
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 351d55b88..676a4e3b3 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include "PartitionUtils.h" namespace tuplex { @@ -2131,127 +2131,4 @@ namespace tuplex { tstage->setFileResult(ecounts); } - void trimPartitionsToLimit(std::vector &partitions, - size_t topLimit, - size_t bottomLimit, - TransformStage* tstage, - Executor *exec) { - std::vector limitedPartitions, limitedTailPartitions; - - // check top output limit, adjust partitions if necessary - size_t numTopOutputRows = 0; - Partition *lastTopPart = nullptr; - size_t clippedTop = 0; - for (auto partition: partitions) { - numTopOutputRows += partition->getNumRows(); - lastTopPart = partition; - if (numTopOutputRows >= topLimit) { - // clip last partition & leave loop - clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); - assert(clippedTop <= partition->getNumRows()); - break; - } else if (partition == partitions.back()) { - // last partition, mark full row, but don't put to output set yet to avoid double put - clippedTop = partition->getNumRows(); - break; - } else { - // put full partition to output set - limitedPartitions.push_back(partition); - } - } - - // check the bottom output limit, adjust partitions if necessary - size_t numBottomOutputRows = 0; - size_t clippedBottom = 0; - for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { - auto partition = *it; - numBottomOutputRows += partition->getNumRows(); - - if (partition == lastTopPart) { - // the bottom and the top partitions are overlapping - clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows()); - if (clippedTop + clippedBottom >= partition->getNumRows()) { - // if top and bottom range intersect, use full partitions - clippedTop = partition->getNumRows(); - clippedBottom = 0; - } - break; - } else if (numBottomOutputRows >= bottomLimit) { - // clip last partition & leave loop - auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - if (clipped > 0) { - Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, - exec); - assert(newPart->getNumRows() == clipped); - limitedTailPartitions.push_back(newPart); - } - partition->invalidate(); - break; - } else { - // put full partition to output set - limitedTailPartitions.push_back(partition); - } - } - - // push the middle partition - if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { - assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); - - // split into two partitions with both top and bottom are in the same partition - Partition *lastBottomPart = nullptr; - - if (clippedBottom != 0) { - lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, - tstage, exec); - } - - if (clippedTop != 0) { - lastTopPart->setNumRows(clippedTop); - limitedPartitions.push_back(lastTopPart); - } else { - lastTopPart->invalidate(); - } - - if (lastBottomPart != nullptr) { - limitedPartitions.push_back(lastBottomPart); - } - } - - // merge the head and tail partitions - partitions.clear(); - partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); - partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); - } - - Partition 
*newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { - auto ptr = p_in->lockRaw(); - auto num_rows = *((int64_t *) ptr); - assert(numToSkip < num_rows); - - ptr += sizeof(int64_t); - size_t numBytesToSkip = 0; - - Deserializer ds(tstage->outputSchema()); - for (unsigned i = 0; i < numToSkip; ++i) { - Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); - ptr += r.serializedLength(); - numBytesToSkip += r.serializedLength(); - } - - Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); - assert(p_out->capacity() >= p_in->size() - numBytesToSkip); - - auto ptr_out = p_out->lockRaw(); - *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; - ptr_out += sizeof(int64_t); - memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); - p_out->unlock(); - - p_in->unlock(); - - return p_out; - } } // namespace tuplex \ No newline at end of file diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py index bb83118b4..ecd333f5a 100644 --- a/tuplex/python/tuplex/utils/table_format.py +++ b/tuplex/python/tuplex/utils/table_format.py @@ -5,7 +5,7 @@ # # # # # (c) 2017 - 2021, Tuplex team # -# Created by Leonhard Spiegelberg first on 4/19/2022 # +# Created by March Boonyapaluk first on 4/19/2022 # # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 4e4a70f53..40b624ca8 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -4,7 +4,7 @@ // // // // // (c) 2017 - 2021, Tuplex team // -// Created by Leonhard Spiegelberg first on 1/1/2021 // +// Created by March Boonyapaluk first on 4/19/2021 // // License: Apache 2.0 // //--------------------------------------------------------------------------------------------------------------------// From a072e405956b2b09e0fa3e89e60830ca6fb5612a Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 20 Apr 2022 11:45:34 -0400 Subject: [PATCH 24/56] Fix Azure pipeline failing --- tuplex/python/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tuplex/python/CMakeLists.txt b/tuplex/python/CMakeLists.txt index b0b0e54c5..7ccb7057c 100644 --- a/tuplex/python/CMakeLists.txt +++ b/tuplex/python/CMakeLists.txt @@ -104,6 +104,7 @@ FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/__init__.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/tracebacks.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/version.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/globs.py + ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/table_format.py DESTINATION ${PYTHON_DIST_DIR}/tuplex/utils) FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests/test_tuples.py From 5a1a3429ecdecdd88ea14ac2b246ce53ad3224a3 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Sat, 22 Jan 2022 17:37:43 -0500 Subject: [PATCH 25/56] Modify dataset --- tuplex/python/tuplex/dataset.py | 82 +++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index a2b8c0b33..aa5b1ca12 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -28,6 +28,88 @@ class DataSet: def __init__(self): self._dataSet = None + def getDataLen(self): + data = self.collect() + if len(data) == 0: + return 0, 0 + else: + return 
len(data), len(data[0]) + + def revTake(self, nRows = 5): + return self.collect()[-nRows:] + + def _repr_html_(self): + rows_list = self.take() + total_row_cnt, total_col_cnt = self.getDataLen() + print('rowlist') + print(rows_list) + if len(rows_list) == 0: + header = '<th></th>\n' + rows = '<tr></tr>\n' + else: + header = '<th></th>\n' + + if self.columns is not None: + for x in self.columns: + header += f'<th>{x}</th>\n' + else: + for i in range(len(rows_list[0])): + header += f'<th>column {i + 1}</th>\n' + + rows = '' + for i, r in enumerate(rows_list): + rows += '<tr>\n' + rows += f'<th>{i}</th>\n' + for data in r: + rows += f'<td>{data}</td>\n' + rows += '</tr>\n' + + # add the ... + rows += '<tr>\n' + rows += '<th>...</th>\n' + for i in range(total_col_cnt): + rows += '<td>...</td>\n' + rows += '</tr>\n' + + lastData = self.revTake() + for i, r in enumerate(lastData): + rows += '<tr>\n' + rows += f'<th>{total_row_cnt - len(lastData) + i}</th>\n' + for data in r: + rows += f'<td>{data}</td>\n' + rows += '</tr>\n' + + html_template = ( + '<div>\n' + '<table>\n' + '<thead>\n' + '<tr>\n' + f'{header}' + '</tr>\n' + '</thead>\n' + '<tbody>\n' + f'{rows}' + '</tbody>\n' + '</table>\n' + f'<p>{total_row_cnt} rows × {total_col_cnt} columns</p>\n' + '</div>
' + ) + + return html_template + def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. Returns: From b68b4a10ed68598a3f0f318f3e8008c4d99e8a60 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 27 Jan 2022 22:12:09 -0500 Subject: [PATCH 26/56] Add in takeLast operator --- tuplex/core/include/DataSet.h | 2 + .../include/logical/LogicalOperatorType.h | 1 + .../core/include/logical/TakeLastOperator.h | 51 ++++++++++++++ tuplex/core/src/DataSet.cc | 24 +++++++ tuplex/core/src/logical/TaskLastOperator.cc | 43 ++++++++++++ tuplex/core/src/physical/PhysicalPlan.cc | 7 +- tuplex/python/include/PythonDataSet.h | 1 + tuplex/python/src/PythonBindings.cc | 1 + tuplex/python/src/PythonDataSet.cc | 69 +++++++++++++++++++ tuplex/python/tuplex/dataset.py | 17 +++++ 10 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 tuplex/core/include/logical/TakeLastOperator.h create mode 100644 tuplex/core/src/logical/TaskLastOperator.cc diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 899032723..429d8c6a7 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -269,6 +269,8 @@ namespace tuplex { virtual std::vector takeAsVector(int64_t numElements, std::ostream &os = std::cout); + virtual std::shared_ptr takeLast(int64_t numElements, std::ostream &os = std::cout); + /*! * saves dataset to file. There are multiple options to control the behavior * ==> 1.) files can be split across multiple ones. Specify number of files to split rows to diff --git a/tuplex/core/include/logical/LogicalOperatorType.h b/tuplex/core/include/logical/LogicalOperatorType.h index 594252820..b6a1c788b 100644 --- a/tuplex/core/include/logical/LogicalOperatorType.h +++ b/tuplex/core/include/logical/LogicalOperatorType.h @@ -17,6 +17,7 @@ namespace tuplex { MAP, FILTER, TAKE, // i.e. output to python / in memory + TAKELAST, PARALLELIZE, // i.e. 
input from python FILEINPUT, RESOLVE, diff --git a/tuplex/core/include/logical/TakeLastOperator.h b/tuplex/core/include/logical/TakeLastOperator.h new file mode 100644 index 000000000..28896e513 --- /dev/null +++ b/tuplex/core/include/logical/TakeLastOperator.h @@ -0,0 +1,51 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#ifndef TUPLEX_TAKELASTOPERATOR_H +#define TUPLEX_TAKELASTOPERATOR_H + + +#include "LogicalOperator.h" + +namespace tuplex { + class TakeLastOperator : public LogicalOperator { + private: + int64_t _limit; + public: + LogicalOperator *clone() override; + + public: + TakeLastOperator(LogicalOperator *parent, const int64_t numElements); + + std::string name() override { + if(_limit < 0 || std::numeric_limits::max() == _limit) + return "collect"; + return "take"; + } + LogicalOperatorType type() const override { return LogicalOperatorType::TAKELAST; } + + bool isActionable() override { return true; } + + bool isDataSource() override { return false; } + + bool good() const override; + + int64_t limit() { return _limit; } + + + std::vector getSample(const size_t num) const override; + + Schema getInputSchema() const override { return getOutputSchema(); } + + std::vector columns() const override; + }; +} + +#endif //TUPLEX_TAKELASTOPERATOR_H \ No newline at end of file diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index a53a14094..66a6a548c 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,29 @@ namespace tuplex { return v; } + std::shared_ptr DataSet::takeLast(int64_t numElements, std::ostream &os) { + // error dataset? + if (isError()) + throw std::runtime_error("is error dataset!"); + + // negative numbers mean get all elements! + if (numElements < 0) + numElements = std::numeric_limits::max(); + + // create a take node + assert(_context); + LogicalOperator *op = _context->addOperator(new TakeLastOperator(this->_operator, numElements)); + DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); + dsptr->_operator = op; + op->setDataSet(dsptr); + + // perform action. 
+ assert(this->_context); + auto rs = op->compute(*this->_context); + + return rs; + } + void DataSet::tofile(tuplex::FileFormat fmt, const tuplex::URI &uri, const tuplex::UDF &udf, size_t fileCount, size_t shardSize, const std::unordered_map &outputOptions, size_t limit, diff --git a/tuplex/core/src/logical/TaskLastOperator.cc b/tuplex/core/src/logical/TaskLastOperator.cc new file mode 100644 index 000000000..92295efb3 --- /dev/null +++ b/tuplex/core/src/logical/TaskLastOperator.cc @@ -0,0 +1,43 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include +#include + +namespace tuplex { + TakeLastOperator::TakeLastOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) { + // take schema from parent node + setSchema(this->parent()->getOutputSchema()); + } + + bool TakeLastOperator::good() const { + return _limit >= -1; + } + + std::vector TakeLastOperator::getSample(const size_t num) const { + // take sample from parent + return parent()->getSample(num); + } + + std::vector TakeLastOperator::columns() const { + assert(parent()); + return parent()->columns(); + } + + LogicalOperator *TakeLastOperator::clone() { + // create clone of this operator + auto copy = new TakeLastOperator(parent()->clone(), _limit); + + copy->setDataSet(getDataSet()); // weak ptr to old dataset... + copy->copyMembers(this); + assert(getID() == copy->getID()); + return copy; + } +} \ No newline at end of file diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 2399edf6f..87a73a712 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -208,7 +208,9 @@ namespace tuplex { if(ops.back()->isActionable()) { if(ops.back()->type() == LogicalOperatorType::FILEOUTPUT) outputMode = EndPointMode::FILE; - else if(ops.back()->type() == LogicalOperatorType::TAKE || ops.back()->type() == LogicalOperatorType::CACHE) { + else if(ops.back()->type() == LogicalOperatorType::TAKE || + ops.back()->type() == LogicalOperatorType::TAKELAST || + ops.back()->type() == LogicalOperatorType::CACHE) { // memory? outputMode = EndPointMode::MEMORY; } else @@ -382,6 +384,9 @@ namespace tuplex { if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); builder.setOutputLimit(top->limit()); + } else if (outputNode->type() == LogicalOperatorType::TAKELAST) { + auto top = static_cast(outputNode); + builder.setOutputLimit(top->limit()); } // @TODO: add slowPip builder to this process... 
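The plumbing above follows one pattern: the sink operator carries the row limit, and the physical plan copies it into the stage builder for both TAKE and TAKELAST. A compressed, runnable Python analogue of that hand-off (all names hypothetical, not the Tuplex API):

    class SinkNode:
        def __init__(self, node_type, limit):
            self.type, self.limit = node_type, limit

    class StageBuilder:
        def __init__(self):
            self.output_limit = None

        def with_sink(self, node):
            # both take-style sinks cap the rows a stage may emit
            if node.type in ("TAKE", "TAKELAST"):
                self.output_limit = node.limit
            return self

    assert StageBuilder().with_sink(SinkNode("TAKELAST", 5)).output_limit == 5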
diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 665d68856..58827ea33 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -78,6 +78,7 @@ namespace tuplex { py::object collect(); py::object take(const int64_t numRows); + boost::python::object takeLast(const int64_t numRows); void show(const int64_t numRows=-1); // DataFrame like operations diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 6b3683853..4d0b1f4e9 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -43,6 +43,7 @@ PYMODULE { .def("show", &tuplex::PythonDataSet::show) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) + .def("takeLast", &tuplex::PythonDataSet::takeLast) .def("map", &tuplex::PythonDataSet::map) .def("resolve", &tuplex::PythonDataSet::resolve) .def("ignore", &tuplex::PythonDataSet::ignore) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 36f9a392b..2e54deec5 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -176,6 +176,75 @@ namespace tuplex { } } + boost::python::object PythonDataSet::takeLast(const int64_t numRows) { + // make sure a dataset is wrapped + assert(this->_dataset); + + // is callee error dataset? if so return list with error string + if (this->_dataset->isError()) { + ErrorDataSet *eds = static_cast<ErrorDataSet*>(this->_dataset); + boost::python::list L; + L.append(eds->getError()); + // Logger::instance().flushAll(); + Logger::instance().flushToPython(); + return L; + } else { + std::stringstream ss; + + // release GIL & hand over everything to Tuplex + assert(PyGILState_Check()); // make sure this thread holds the GIL! + python::unlockGIL(); + + std::shared_ptr<ResultSet> rs; + std::string err_message = ""; + try { + rs = _dataset->takeLast(numRows, ss); + if(!rs) + throw std::runtime_error("invalid result set"); + // if there are more than 1 million (100k in debug mode) elements print message... + if (rs->rowCount() > LARGE_RESULT_SIZE) + Logger::instance().logger("python").info("transferring " + + std::to_string(rs->rowCount()) + + " elements back to Python. This might take a while..."); + } catch(const std::exception& e) { + err_message = e.what(); + Logger::instance().defaultLogger().error(err_message); + } catch(...) { + err_message = "unknown C++ exception occurred, please change type."; + Logger::instance().defaultLogger().error(err_message); + } + + // reacquire GIL + python::lockGIL(); + + // error?
then return list of error string + if(!rs || !err_message.empty()) { + // Logger::instance().flushAll(); + Logger::instance().flushToPython(); + auto listObj = PyList_New(1); + PyList_SetItem(listObj, 0, python::PyString_FromString(err_message.c_str())); + auto list = boost::python::object(boost::python::borrowed<>(listObj)); + return list; + } + + // collect results & transfer them back to python + // new version, directly interact with the interpreter + Timer timer; + // build python list object from resultset + auto listObj = resultSetToCPython(rs.get(), numRows); + Logger::instance().logger("python").info("Data transfer back to python took " + + std::to_string(timer.time()) + " seconds"); + // Logger::instance().flushAll(); + Logger::instance().flushToPython(); + + // print errors + if (ss.str().length() > 0) + PySys_FormatStdout("%s", ss.str().c_str()); + + return boost::python::object(boost::python::handle<>(listObj)); + } + } + PythonDataSet PythonDataSet::map(const std::string &lambda_code, const std::string &pickled_code, const py::object& closure) { auto& logger = Logger::instance().logger("python"); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index aa5b1ca12..a1d838526 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -208,6 +208,23 @@ def take(self, nrows=5): return self._dataSet.take(nrows) + def takeLast(self, nrows=5): + """ action that generates a physical plan, processes data and collects the last results then as list of tuples. + + Args: + nrows (int): number of rows to collect. Per default ``5``. + Returns: + (list): A list of tuples + + """ + + assert isinstance(nrows, int), 'num rows must be an integer' + assert nrows > 0, 'please specify a number greater than zero' + + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + return self._dataSet.takeLast(nrows) + def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted ASCII table to stdout. 
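From the notebook side the new action mirrors take, just anchored at the tail of the dataset. A usage sketch (assumes a local build of this branch; single-column rows come back as plain values):

    from tuplex import Context

    c = Context()
    ds = c.parallelize([1, 2, 3, 4, 5, 6])
    print(ds.take(3))      # first rows: [1, 2, 3]
    print(ds.takeLast(2))  # last rows:  [5, 6]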
From 02b51aabb30541dcb14e9cc5d2c0aad3d421f1f8 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 27 Jan 2022 22:25:39 -0500 Subject: [PATCH 27/56] (wip) add reverse limit in partition --- tuplex/core/include/Partition.h | 15 +++++ tuplex/core/src/physical/TransformStage.cc | 51 ++++++++++++++++------ 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h index 9bc7fc54c..5a66023fd 100644 --- a/tuplex/core/include/Partition.h +++ b/tuplex/core/include/Partition.h @@ -248,6 +248,21 @@ namespace tuplex { _mutex.unlock(); } + void setNumLastRows(const size_t numRows) { + // TODO: set another value instead + _mutex.lock(); + + _numRows = numRows; + + // save to memptr + if(_arena) { + *((int64_t*)_arena) = numRows; + } + + _mutex.unlock(); + } + + int64_t getDataSetID() const { return _dataSetID; } diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index b61f9cbe2..9cd15694a 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -139,21 +139,46 @@ namespace tuplex { } // check output limit, adjust partitions if necessary - size_t numOutputRows = 0; - for (auto partition : partitions) { - numOutputRows += partition->getNumRows(); - if (numOutputRows >= outputLimit()) { - // clip last partition & leave loop - auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - partition->setNumRows(clipped); - if (clipped > 0) + // TODO: add reverse outputLimit condition here + if (true) { + size_t numOutputRows = 0; + for (auto partition : partitions) { + numOutputRows += partition->getNumRows(); + if (numOutputRows >= outputLimit()) { + // clip last partition & leave loop + auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + partition->setNumRows(clipped); + if (clipped > 0) limitedPartitions.push_back(partition); - break; - } else { - // put full partition to output set - limitedPartitions.push_back(partition); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } + } + } else { + size_t numOutputRows = 0; + for (auto partitionIt = partitions.rbegin(); + partitionIt != partitions.rend(); partitionIt++) { + auto partition = *partitionIt; + numOutputRows += partition->getNumRows(); + if (numOutputRows >= outputLimit()) { + // clip last partition & leave loop + auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + + + // TODO: do backward clip here instead + partition->setNumRows(clipped); + if (clipped > 0) + limitedPartitions.push_back(partition); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } } + + std::reverse(limitedPartitions.begin(), limitedPartitions.end()); } } From a072e405956b2b09e0fa3e89e60830ca6fb5612a Mon Sep 17 00:00:00 2001 From: korlamarch Date: Fri, 11 Feb 2022 09:20:06 -0500 Subject: [PATCH 28/56] Remove row count --- tuplex/python/tuplex/dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index a1d838526..976a751f4 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -28,19 +28,19 @@ class DataSet: def __init__(self): self._dataSet = None - def getDataLen(self): + def getColumnSize(self): data =
self.collect() + if len(data) == 0: + return 0 + else: + return len(data[0]) + + def revTake(self, nRows = 5): + return self.collect()[-nRows:] + + def _repr_html_(self): + rows_list = self.take() - total_row_cnt, total_col_cnt = self.getDataLen() + total_col_cnt = self.getColumnSize() print('rowlist') print(rows_list) if len(rows_list) == 0: @@ -74,7 +74,7 @@ def _repr_html_(self): lastData = self.revTake() for i, r in enumerate(lastData): rows += '<tr>\n' - rows += f'<th>{total_row_cnt - len(lastData) + i}</th>\n' + rows += f'<th>{0 - len(lastData) + i}</th>\n' for data in r: rows += f'<td>{data}</td>\n' rows += '</tr>\n' @@ -104,7 +104,7 @@ def _repr_html_(self): f'{rows}' '</tbody>\n' '</table>\n' - f'<p>{total_row_cnt} rows × {total_col_cnt} columns</p>\n' + f'<p>{total_col_cnt} columns</p>
\n' '' ) From 6955392a5b098709a1f4b7e8fa1cc487130b93c0 Mon Sep 17 00:00:00 2001 From: korlamarch Date: Tue, 15 Feb 2022 23:30:47 -0500 Subject: [PATCH 29/56] refactor TakeOperator --- tuplex/core/include/DataSet.h | 3 +- tuplex/core/include/EmptyDataset.h | 2 +- tuplex/core/include/ErrorDataSet.h | 2 +- .../include/logical/LogicalOperatorType.h | 1 - .../core/include/logical/TakeLastOperator.h | 51 ------------- tuplex/core/include/logical/TakeOperator.h | 10 ++- tuplex/core/src/DataSet.cc | 36 ++------- tuplex/core/src/EmptyDataset.cc | 4 +- tuplex/core/src/ErrorDataSet.cc | 4 +- tuplex/core/src/logical/TakeOperator.cc | 6 +- tuplex/core/src/logical/TaskLastOperator.cc | 43 ----------- tuplex/core/src/physical/PhysicalPlan.cc | 4 - tuplex/core/src/physical/StageBuilder.cc | 3 +- tuplex/core/src/physical/TransformStage.cc | 51 ++++--------- tuplex/python/include/PythonDataSet.h | 3 +- tuplex/python/src/PythonBindings.cc | 1 - tuplex/python/src/PythonDataSet.cc | 73 +------------------ tuplex/python/tuplex/dataset.py | 23 +----- 18 files changed, 44 insertions(+), 276 deletions(-) delete mode 100644 tuplex/core/include/logical/TakeLastOperator.h delete mode 100644 tuplex/core/src/logical/TaskLastOperator.cc diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 429d8c6a7..65a766a87 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -263,13 +263,12 @@ namespace tuplex { // these are actions that cause execution virtual std::shared_ptr collect(std::ostream &os = std::cout); - virtual std::shared_ptr take(int64_t numElements, std::ostream &os = std::cout); + virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream &os = std::cout); virtual std::vector collectAsVector(std::ostream &os = std::cout); virtual std::vector takeAsVector(int64_t numElements, std::ostream &os = std::cout); - virtual std::shared_ptr takeLast(int64_t numElements, std::ostream &os = std::cout); /*! * saves dataset to file. 
There are multiple options to control the behavior diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index b3c1ed7af..0f8a1f52c 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -70,7 +70,7 @@ namespace tuplex { virtual std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - virtual std::shared_ptr take(int64_t numElements, std::ostream& os) override; + virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; virtual std::vector collectAsVector(std::ostream& os) override; diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h index 2f46d8638..34fc60685 100644 --- a/tuplex/core/include/ErrorDataSet.h +++ b/tuplex/core/include/ErrorDataSet.h @@ -90,7 +90,7 @@ namespace tuplex { std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - std::shared_ptr take(int64_t numElements, std::ostream& os) override; + std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; std::vector collectAsVector(std::ostream& os) override; diff --git a/tuplex/core/include/logical/LogicalOperatorType.h b/tuplex/core/include/logical/LogicalOperatorType.h index b6a1c788b..594252820 100644 --- a/tuplex/core/include/logical/LogicalOperatorType.h +++ b/tuplex/core/include/logical/LogicalOperatorType.h @@ -17,7 +17,6 @@ namespace tuplex { MAP, FILTER, TAKE, // i.e. output to python / in memory - TAKELAST, PARALLELIZE, // i.e. input from python FILEINPUT, RESOLVE, diff --git a/tuplex/core/include/logical/TakeLastOperator.h b/tuplex/core/include/logical/TakeLastOperator.h deleted file mode 100644 index 28896e513..000000000 --- a/tuplex/core/include/logical/TakeLastOperator.h +++ /dev/null @@ -1,51 +0,0 @@ -//--------------------------------------------------------------------------------------------------------------------// -// // -// Tuplex: Blazing Fast Python Data Science // -// // -// // -// (c) 2017 - 2021, Tuplex team // -// Created by Leonhard Spiegelberg first on 1/1/2021 // -// License: Apache 2.0 // -//--------------------------------------------------------------------------------------------------------------------// - -#ifndef TUPLEX_TAKELASTOPERATOR_H -#define TUPLEX_TAKELASTOPERATOR_H - - -#include "LogicalOperator.h" - -namespace tuplex { - class TakeLastOperator : public LogicalOperator { - private: - int64_t _limit; - public: - LogicalOperator *clone() override; - - public: - TakeLastOperator(LogicalOperator *parent, const int64_t numElements); - - std::string name() override { - if(_limit < 0 || std::numeric_limits::max() == _limit) - return "collect"; - return "take"; - } - LogicalOperatorType type() const override { return LogicalOperatorType::TAKELAST; } - - bool isActionable() override { return true; } - - bool isDataSource() override { return false; } - - bool good() const override; - - int64_t limit() { return _limit; } - - - std::vector getSample(const size_t num) const override; - - Schema getInputSchema() const override { return getOutputSchema(); } - - std::vector columns() const override; - }; -} - -#endif //TUPLEX_TAKELASTOPERATOR_H \ No newline at end of file diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index 
8d0d6dcab..20c035a74 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -17,15 +17,16 @@ namespace tuplex { class TakeOperator : public LogicalOperator { private: - int64_t _limit; + int64_t _limitTop; + int64_t _limitBottom; public: LogicalOperator *clone() override; public: - TakeOperator(LogicalOperator *parent, const int64_t numElements); + TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom); std::string name() override { - if(_limit < 0 || std::numeric_limits::max() == _limit) + if(_limitTop < 0 || std::numeric_limits::max() == _limitTop) return "collect"; return "take"; } @@ -37,8 +38,9 @@ namespace tuplex { bool good() const override; - int64_t limit() { return _limit; } + int64_t limit() { return _limitTop; } + bool limitBottom() { return _limitBottom; } std::vector getSample(const size_t num) const override; diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 66a6a548c..3de903d1c 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -39,21 +38,21 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(-1, os); + return take(-1, false, os); } - std::shared_ptr DataSet::take(int64_t numElements, std::ostream &os) { + std::shared_ptr DataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { // error dataset? if (isError()) throw std::runtime_error("is error dataset!"); // negative numbers mean get all elements! - if (numElements < 0) - numElements = std::numeric_limits::max(); + if (numTop < 0) + numTop = std::numeric_limits::max(); // create a take node assert(_context); - LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numElements)); + LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numTop, numBottom)); DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); dsptr->_operator = op; op->setDataSet(dsptr); @@ -72,7 +71,7 @@ namespace tuplex { // -1 means to retrieve all elements std::vector DataSet::takeAsVector(int64_t numElements, std::ostream &os) { - auto rs = take(numElements, os); + auto rs = take(numElements, false, os); Timer timer; #warning "limiting should make this hack irrelevant..." @@ -103,29 +102,6 @@ namespace tuplex { return v; } - std::shared_ptr DataSet::takeLast(int64_t numElements, std::ostream &os) { - // error dataset? - if (isError()) - throw std::runtime_error("is error dataset!"); - - // negative numbers mean get all elements! - if (numElements < 0) - numElements = std::numeric_limits::max(); - - // create a take node - assert(_context); - LogicalOperator *op = _context->addOperator(new TakeLastOperator(this->_operator, numElements)); - DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); - dsptr->_operator = op; - op->setDataSet(dsptr); - - // perform action. 
- assert(this->_context); - auto rs = op->compute(*this->_context); - - return rs; - } - void DataSet::tofile(tuplex::FileFormat fmt, const tuplex::URI &uri, const tuplex::UDF &udf, size_t fileCount, size_t shardSize, const std::unordered_map &outputOptions, size_t limit, diff --git a/tuplex/core/src/EmptyDataset.cc b/tuplex/core/src/EmptyDataset.cc index 984fa904f..7504e8499 100644 --- a/tuplex/core/src/EmptyDataset.cc +++ b/tuplex/core/src/EmptyDataset.cc @@ -11,7 +11,7 @@ #include namespace tuplex { - std::shared_ptr EmptyDataset::take(int64_t numElements, std::ostream &os) { + std::shared_ptr EmptyDataset::take(int64_t numTop, int64_t numBottom, std::ostream &os) { return std::make_shared(); } @@ -20,7 +20,7 @@ namespace tuplex { } std::shared_ptr EmptyDataset::collect(std::ostream &os) { - return take(0, os); + return take(0, false, os); } std::vector EmptyDataset::collectAsVector(std::ostream &os) { diff --git a/tuplex/core/src/ErrorDataSet.cc b/tuplex/core/src/ErrorDataSet.cc index 57c03ffba..9d19594f2 100644 --- a/tuplex/core/src/ErrorDataSet.cc +++ b/tuplex/core/src/ErrorDataSet.cc @@ -23,7 +23,7 @@ namespace tuplex { return takeAsVector(0, os); } - std::shared_ptr ErrorDataSet::take(int64_t numElements, std::ostream &os) { + std::shared_ptr ErrorDataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -31,7 +31,7 @@ namespace tuplex { } std::shared_ptr ErrorDataSet::collect(std::ostream &os) { - return take(0, os); + return take(0, false, os); } void diff --git a/tuplex/core/src/logical/TakeOperator.cc b/tuplex/core/src/logical/TakeOperator.cc index aa7c49668..e588b5e97 100644 --- a/tuplex/core/src/logical/TakeOperator.cc +++ b/tuplex/core/src/logical/TakeOperator.cc @@ -12,13 +12,13 @@ #include namespace tuplex { - TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) { + TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom) : LogicalOperator::LogicalOperator(parent), _limitTop(numTop), _limitBottom(numBottom) { // take schema from parent node setSchema(this->parent()->getOutputSchema()); } bool TakeOperator::good() const { - return _limit >= -1; + return _limitTop >= -1 && _limitBottom >= -1; } std::vector TakeOperator::getSample(const size_t num) const { @@ -33,7 +33,7 @@ namespace tuplex { LogicalOperator *TakeOperator::clone() { // create clone of this operator - auto copy = new TakeOperator(parent()->clone(), _limit); + auto copy = new TakeOperator(parent()->clone(), _limitTop, _limitBottom); copy->setDataSet(getDataSet()); // weak ptr to old dataset... 
copy->copyMembers(this); diff --git a/tuplex/core/src/logical/TaskLastOperator.cc b/tuplex/core/src/logical/TaskLastOperator.cc deleted file mode 100644 index 92295efb3..000000000 --- a/tuplex/core/src/logical/TaskLastOperator.cc +++ /dev/null @@ -1,43 +0,0 @@ -//--------------------------------------------------------------------------------------------------------------------// -// // -// Tuplex: Blazing Fast Python Data Science // -// // -// // -// (c) 2017 - 2021, Tuplex team // -// Created by Leonhard Spiegelberg first on 1/1/2021 // -// License: Apache 2.0 // -//--------------------------------------------------------------------------------------------------------------------// - -#include -#include - -namespace tuplex { - TakeLastOperator::TakeLastOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) { - // take schema from parent node - setSchema(this->parent()->getOutputSchema()); - } - - bool TakeLastOperator::good() const { - return _limit >= -1; - } - - std::vector TakeLastOperator::getSample(const size_t num) const { - // take sample from parent - return parent()->getSample(num); - } - - std::vector TakeLastOperator::columns() const { - assert(parent()); - return parent()->columns(); - } - - LogicalOperator *TakeLastOperator::clone() { - // create clone of this operator - auto copy = new TakeLastOperator(parent()->clone(), _limit); - - copy->setDataSet(getDataSet()); // weak ptr to old dataset... - copy->copyMembers(this); - assert(getID() == copy->getID()); - return copy; - } -} \ No newline at end of file diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 87a73a712..17a4c7c0e 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -209,7 +209,6 @@ namespace tuplex { if(ops.back()->type() == LogicalOperatorType::FILEOUTPUT) outputMode = EndPointMode::FILE; else if(ops.back()->type() == LogicalOperatorType::TAKE || - ops.back()->type() == LogicalOperatorType::TAKELAST || ops.back()->type() == LogicalOperatorType::CACHE) { // memory? outputMode = EndPointMode::MEMORY; @@ -384,9 +383,6 @@ namespace tuplex { if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); builder.setOutputLimit(top->limit()); - } else if (outputNode->type() == LogicalOperatorType::TAKELAST) { - auto top = static_cast(outputNode); - builder.setOutputLimit(top->limit()); } // @TODO: add slowPip builder to this process... 
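With this refactor a single take node carries both limits instead of a separate takeLast operator. A row-level Python model of the new contract (hypothetical helper, not the Tuplex API; the expected values match the TakeTest cases added below):

    def take_rows(rows, n_top, n_bottom=0):
        # First n_top rows plus last n_bottom rows, in original order.
        # Negative n_top means collect everything; overlapping ranges
        # return each row exactly once.
        if n_top < 0 or n_top + n_bottom >= len(rows):
            return list(rows)
        tail = rows[len(rows) - n_bottom:] if n_bottom else []
        return rows[:n_top] + tail

    assert take_rows([1, 2, 3, 4, 5, 6], 1, 1) == [1, 6]
    assert take_rows([1, 2, 3, 4, 5, 6], 2, 1) == [1, 2, 6]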
diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index 72f01e2b8..0bf509ed1 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -457,7 +457,8 @@ namespace tuplex { break; } case LogicalOperatorType::TAKE: { - opt_ops.push_back(new TakeOperator(lastParent, dynamic_cast(node)->limit())); + auto takeOp = dynamic_cast(node); + opt_ops.push_back(new TakeOperator(lastParent, takeOp->limit(), takeOp->limitBottom())); opt_ops.back()->setID(node->getID()); break; } diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 9cd15694a..b61f9cbe2 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -139,46 +139,21 @@ namespace tuplex { } // check output limit, adjust partitions if necessary - // TODO: add reverse outputLimit condition here - if (true) { - size_t numOutputRows = 0; - for (auto partition : partitions) { - numOutputRows += partition->getNumRows(); - if (numOutputRows >= outputLimit()) { - // clip last partition & leave loop - auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - partition->setNumRows(clipped); - if (clipped > 0) - limitedPartitions.push_back(partition); - break; - } else { - // put full partition to output set - limitedPartitions.push_back(partition); - } - } - } else { - size_t numOutputRows = 0; - for (auto partitionIt = partitions.rbeing(); - partitionIt != partitions.rend(); partitionIt++) { - numOutputRows += partition->getNumRows(); - if (numOutputRows >= outputLimit()) { - // clip last partition & leave loop - auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - - // TODO: do backward clip here instead - partition->setNumRows(clipped); - if (clipped > 0) - limitedPartitions.push_back(partition); - break; - } else { - // put full partition to output set + size_t numOutputRows = 0; + for (auto partition : partitions) { + numOutputRows += partition->getNumRows(); + if (numOutputRows >= outputLimit()) { + // clip last partition & leave loop + auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + partition->setNumRows(clipped); + if (clipped > 0) limitedPartitions.push_back(partition); - } + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); } - - std::reverse(limitedPartitions.begin(), limitedPartitions.end()); } } diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 58827ea33..23b09314d 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -77,8 +77,7 @@ namespace tuplex { PythonDataSet resolve(const int64_t exceptionCode, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); py::object collect(); - py::object take(const int64_t numRows); - boost::python::object takeLast(const int64_t numRows); + py::object take(const int64_t numTop, const int64_t numBottom); void show(const int64_t numRows=-1); // DataFrame like operations diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 4d0b1f4e9..6b3683853 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -43,7 +43,6 @@ PYMODULE { .def("show", 
&tuplex::PythonDataSet::show) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) - .def("takeLast", &tuplex::PythonDataSet::takeLast) .def("map", &tuplex::PythonDataSet::map) .def("resolve", &tuplex::PythonDataSet::resolve) .def("ignore", &tuplex::PythonDataSet::ignore) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 2e54deec5..853b910db 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -107,7 +107,7 @@ namespace tuplex { } } - py::object PythonDataSet::take(const int64_t numRows) { + py::object PythonDataSet::take(const int64_t numTop, const int64_t numBottom) { // make sure a dataset is wrapped assert(this->_dataset); @@ -162,7 +162,7 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; // build python list object from resultset - auto listObj = resultSetToCPython(rs.get(), numRows); + auto listObj = resultSetToCPython(rs.get(), numTop); Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); // Logger::instance().flushAll(); @@ -176,75 +176,6 @@ namespace tuplex { } } - boost::python::object PythonDataSet::takeLast(const int64_t numRows) { - // make sure a dataset is wrapped - assert(this->_dataset); - - // is callee error dataset? if so return list with error string - if (this->_dataset->isError()) { - ErrorDataSet *eds = static_cast(this->_dataset); - boost::python::list L; - L.append(eds->getError()); - // Logger::instance().flushAll(); - Logger::instance().flushToPython(); - return L; - } else { - std::stringstream ss; - - // release GIL & hand over everything to Tuplex - assert(PyGILState_Check()); // make sure this thread holds the GIL! - python::unlockGIL(); - - std::shared_ptr rs; - std::string err_message = ""; - try { - rs = _dataset->takeLast(numRows, ss); - if(!rs) - throw std::runtime_error("invalid result set"); - // if there are more than 1 million (100k in debug mode) elements print message... - if (rs->rowCount() > LARGE_RESULT_SIZE) - Logger::instance().logger("python").info("transferring " - + std::to_string(rs->rowCount()) + - " elements back to Python. This might take a while..."); - } catch(const std::exception& e) { - err_message = e.what(); - Logger::instance().defaultLogger().error(err_message); - } catch(...) { - err_message = "unknown C++ exception occurred, please change type."; - Logger::instance().defaultLogger().error(err_message); - } - - // reqacquire GIL - python::lockGIL(); - - // error? 
then return list of error string - if(!rs || !err_message.empty()) { - // Logger::instance().flushAll(); - Logger::instance().flushToPython(); - auto listObj = PyList_New(1); - PyList_SetItem(listObj, 0, python::PyString_FromString(err_message.c_str())); - auto list = boost::python::object(boost::python::borrowed<>(listObj)); - return list; - } - - // collect results & transfer them back to python - // new version, directly interact with the interpreter - Timer timer; - // build python list object from resultset - auto listObj = resultSetToCPython(rs.get(), numRows); - Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); - // Logger::instance().flushAll(); - Logger::instance().flushToPython(); - - // print errors - if (ss.str().length() > 0) - PySys_FormatStdout("%s", ss.str().c_str()); - - return boost::python::object(boost::python::handle<>(listObj)); - } - } - PythonDataSet PythonDataSet::map(const std::string &lambda_code, const std::string &pickled_code, const py::object& closure) { auto& logger = Logger::instance().logger("python"); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 976a751f4..1046505f2 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -191,7 +191,7 @@ def collect(self): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' return self._dataSet.collect() - def take(self, nrows=5): + def take(self, nrows=5, nbottom=0): """ action that generates a physical plan, processes data and collects the top results then as list of tuples. Args: @@ -203,27 +203,12 @@ def take(self, nrows=5): assert isinstance(nrows, int), 'num rows must be an integer' assert nrows > 0, 'please specify a number greater than zero' + assert isinstance(nbottom, int), 'num bottom rows must be an integer' + assert nbottom >= 0, 'please specify a number greater than or equal to zero' assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - return self._dataSet.take(nrows) - - def takeLast(self, nrows=5): - """ action that generates a physical plan, processes data and collects the last results then as list of tuples. - - Args: - nrows (int): number of rows to collect. Per default ``5``.
- Returns: - (list): A list of tuples - - """ - - assert isinstance(nrows, int), 'num rows must be an integer' - assert nrows > 0, 'please specify a number greater than zero' - - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - - return self._dataSet.takeLast(nrows) + return self._dataSet.take(nrows, nbottom) def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted From 7fa6b175bc337dd56d0e0d2a39f41adcc5788065 Mon Sep 17 00:00:00 2001 From: korlamarch Date: Wed, 16 Feb 2022 12:17:36 -0500 Subject: [PATCH 30/56] Add unit tests --- tuplex/test/core/TakeTest.cc | 125 +++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tuplex/test/core/TakeTest.cc diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc new file mode 100644 index 000000000..08b648f34 --- /dev/null +++ b/tuplex/test/core/TakeTest.cc @@ -0,0 +1,125 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include +#include "TestUtils.h" + +class TakeTest : public PyTest {}; + +TEST_F(TakeTest, takeTopTest) { + using namespace tuplex; + auto opt = testOptions(); + Context context(opt); + + auto rs = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); + + ASSERT_EQ(rs->rowCount(), 1); + auto v = rs->getRows(1); + + EXPECT_EQ(v[0].getInt(0), 1); + + auto rs2 = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); + + ASSERT_EQ(rs2->rowCount(), 3); + auto v2 = rs2->getRows(3); + + EXPECT_EQ(v2[0].getInt(0), 1); + EXPECT_EQ(v2[1].getInt(0), 2); + EXPECT_EQ(v2[2].getInt(0), 3); + + auto rs3 = context.parallelize( + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(5, 0); + + ASSERT_EQ(rs3->rowCount(), 5); + auto v3 = rs3->getRows(5); + + EXPECT_EQ(v3[0].getString(0), "hello"); + EXPECT_EQ(v3[1].getString(0), "world"); + EXPECT_EQ(v3[2].getString(0), "! :)"); + EXPECT_EQ(v3[3].getString(0), "world"); + EXPECT_EQ(v3[4].getString(0), "hello"); + +} + +TEST_F(TakeTest, takeBottomTest) { + using namespace tuplex; + auto opt = testOptions(); + Context context(opt); + + auto rs = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); + + ASSERT_EQ(rs->rowCount(), 1); + auto v = rs->getRows(1); + + EXPECT_EQ(v[0].getInt(0), 6); + + auto rs2 = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); + + ASSERT_EQ(rs2->rowCount(), 3); + auto v2 = rs2->getRows(3); + + EXPECT_EQ(v2[0].getInt(0), 4); + EXPECT_EQ(v2[1].getInt(0), 5); + EXPECT_EQ(v2[2].getInt(0), 6); + + auto rs3 = context.parallelize( + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(0, 5); + + ASSERT_EQ(rs3->rowCount(), 5); + auto v3 = rs3->getRows(5); + + EXPECT_EQ(v3[0].getString(0), "world"); + EXPECT_EQ(v3[1].getString(0), "hello"); + EXPECT_EQ(v3[2].getString(0), "!"); + EXPECT_EQ(v3[3].getString(0), "! 
:)"); + EXPECT_EQ(v3[4].getString(0), "!"); + +} + +TEST_F(TakeTest, takeBothTest) { + using namespace tuplex; + auto opt = testOptions(); + Context context(opt); + + auto rs = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); + + ASSERT_EQ(rs->rowCount(), 2); + auto v = rs->getRows(2); + + EXPECT_EQ(v[0].getInt(0), 1); + EXPECT_EQ(v[1].getInt(0), 6); + + auto rs2 = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); + + ASSERT_EQ(rs2->rowCount(), 3); + auto v2 = rs2->getRows(3); + + EXPECT_EQ(v2[0].getInt(0), 1); + EXPECT_EQ(v2[1].getInt(0), 2); + EXPECT_EQ(v2[2].getInt(0), 6); + + auto rs3 = context.parallelize( + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(2, 3); + + ASSERT_EQ(rs3->rowCount(), 5); + auto v3 = rs3->getRows(5); + + EXPECT_EQ(v3[0].getString(0), "hello"); + EXPECT_EQ(v3[1].getString(0), "world"); + EXPECT_EQ(v3[2].getString(0), "!"); + EXPECT_EQ(v3[3].getString(0), "! :)"); + EXPECT_EQ(v3[4].getString(0), "!"); +} \ No newline at end of file From c78a63784fdab0fa7e311b5969c0017d0b981ebf Mon Sep 17 00:00:00 2001 From: korlamarch Date: Wed, 16 Feb 2022 13:08:25 -0500 Subject: [PATCH 31/56] add bottom limit to transform stage (wip) --- tuplex/core/include/logical/TakeOperator.h | 2 +- tuplex/core/src/physical/PhysicalPlan.cc | 2 ++ tuplex/core/src/physical/TransformStage.cc | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index 20c035a74..b5dd5db6e 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -40,7 +40,7 @@ namespace tuplex { int64_t limit() { return _limitTop; } - bool limitBottom() { return _limitBottom; } + int64_t limitBottom() { return _limitBottom; } std::vector getSample(const size_t num) const override; diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 17a4c7c0e..3985fe1ab 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -383,6 +383,8 @@ namespace tuplex { if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); builder.setOutputLimit(top->limit()); + // TODO: work here + ... } // @TODO: add slowPip builder to this process... diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index b61f9cbe2..6eb3f2e1f 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -142,6 +142,8 @@ namespace tuplex { size_t numOutputRows = 0; for (auto partition : partitions) { numOutputRows += partition->getNumRows(); + // TODO(march): work here + ... 
if (numOutputRows >= outputLimit()) { // clip last partition & leave loop auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); From 5628d279cee0a9923a15befcea8d2ebe46169397 Mon Sep 17 00:00:00 2001 From: korlamarch Date: Thu, 24 Feb 2022 23:29:11 -0500 Subject: [PATCH 32/56] more physical stage update (wip) Quick push --- tuplex/core/include/Executor.h | 21 +++- tuplex/core/include/Partition.h | 23 +++-- tuplex/core/include/physical/ResultSet.h | 2 + tuplex/core/include/physical/StageBuilder.h | 10 +- tuplex/core/include/physical/TransformStage.h | 10 +- tuplex/core/include/physical/TransformTask.h | 10 +- tuplex/core/src/Executor.cc | 97 +++++++++++-------- tuplex/core/src/ee/local/LocalBackend.cc | 39 ++++---- tuplex/core/src/physical/PhysicalPlan.cc | 4 +- tuplex/core/src/physical/ResultSet.cc | 3 +- tuplex/core/src/physical/StageBuilder.cc | 5 +- tuplex/core/src/physical/TransformStage.cc | 86 +++++++++++++--- tuplex/core/src/physical/TransformTask.cc | 19 ++-- tuplex/utils/include/mt/ITask.h | 26 ++--- 14 files changed, 224 insertions(+), 131 deletions(-) diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 0bca412be..3631f7e7d 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -44,12 +44,19 @@ namespace tuplex { */ class WorkQueue { private: - std::atomic_bool _done; // protects against data races + std::atomic_bool _done{}; // protects against data races ExecutorTaskQueueType _queue; std::mutex _completedTasksMutex; std::vector _completedTasks; - std::atomic_int _numPendingTasks; - std::atomic_int _numCompletedTasks; + std::atomic_int _numPendingTasks{}; + std::atomic_int _numCompletedTasks{}; + + // mapping from order number -> row count if the task is finished + std::mutex _rowsDoneMutex; + std::map _rowsDone; + + std::atomic_int _frontRowsLimit{}; + std::atomic_int _bottomRowsLimit{}; public: WorkQueue(); @@ -74,6 +81,14 @@ namespace tuplex { size_t numCompletedTasks() const { return _numCompletedTasks; } + size_t frontRowsLimit() const { + return _frontRowsLimit; + }; + + size_t bottomRowsLimit() const { + return _bottomRowsLimit; + }; + /*! * stop working on this queue & dump all tasks */ diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h index 5a66023fd..24b79cc8f 100644 --- a/tuplex/core/include/Partition.h +++ b/tuplex/core/include/Partition.h @@ -69,6 +69,7 @@ namespace tuplex { void loadFromFile(const URI& uri); int64_t _numRows; + int64_t _numSkip; // number of rows to skip, currently only used at the output (Result set) uint64_t _bytesWritten; Schema _schema; //! Schema of the partition. May be optimized away later. @@ -157,7 +158,7 @@ namespace tuplex { * return how much capacity is left, i.e. 
how many bytes can be actually written * @return */ - size_t capacity() { return _size - sizeof(int64_t); } + size_t capacity() const { return _size - sizeof(int64_t); } uniqueid_t uuid() const { return _uuid; } @@ -248,21 +249,19 @@ namespace tuplex { _mutex.unlock(); } - void setNumLastRows(const size_t numRows) { - // TODO: set another value instead + size_t getNumSkip() { + size_t res = 0; _mutex.lock(); - - _numRows = numRows; - - // save to memptr - if(_arena) { - *((int64_t*)_arena) = numRows; - } - + res = num_skip; _mutex.unlock(); + return res; } - + void setNumSkip(const size_t numSkip) { + _mutex.lock(); + _numSkip = numSkip; + _mutex.unlock(); + } int64_t getDataSetID() const { return _dataSetID; } diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h index e94b8f1ae..5e69fef3a 100644 --- a/tuplex/core/include/physical/ResultSet.h +++ b/tuplex/core/include/physical/ResultSet.h @@ -36,6 +36,8 @@ namespace tuplex { size_t _rowsRetrieved; size_t _totalRowCounter; // used for merging in rows! size_t _maxRows; + size_t _maxRowsTop; + size_t _maxRowsBottom; Schema _schema; void removeFirstPartition(); diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index 63b94bd57..e678ead3d 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -76,8 +76,9 @@ namespace tuplex { void addFileInput(FileInputOperator* csvop); void addFileOutput(FileOutputOperator* fop); - inline void setOutputLimit(size_t limit) { - _outputLimit = limit; + inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { + _outputTopLimit = topLimit; + _outputBottomLimit = bottomLimit; } TransformStage* build(PhysicalPlan* plan, IBackend* backend); @@ -134,7 +135,8 @@ namespace tuplex { FileFormat _outputFileFormat; int64_t _outputNodeID; int64_t _inputNodeID; - size_t _outputLimit; + size_t _outputTopLimit; + size_t _outputBottomLimit; LogicalOperator* _inputNode; std::vector _columnsToRead; @@ -157,7 +159,7 @@ namespace tuplex { int64_t outputDataSetID() const; inline bool hasOutputLimit() const { - return _outputLimit < std::numeric_limits::max(); + return _outputTopLimit < std::numeric_limits::max() || _outputBottomLimit > 0; } inline char csvOutputDelimiter() const { diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index 22d7f5fb4..e63eaec31 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -111,14 +111,15 @@ namespace tuplex { * @param outputLimit */ void setOutputLimit(size_t outputLimit) { - _outputLimit = outputLimit; + _outputTopLimit = outputLimit; // @TODO: move this logic to physical plan! // pushdown limit //pushDownOutputLimit(); } - size_t outputLimit() const { return _outputLimit; } + size_t outputTopLimit() const { return _outputTopLimit; } + size_t outputBottomLimit() const { return _outputBottomLimit; } size_t inputLimit() const { return _inputLimit; } /*! @@ -442,7 +443,8 @@ namespace tuplex { std::vector _inputPartitions; //! memory input partitions for this task. size_t _inputLimit; //! limit number of input rows (inf per default) - size_t _outputLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default) + size_t _outputTopLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default) + size_t _outputBottomLimit; //! output limit, set e.g. by take, to_csv etc. (0 per default) std::shared_ptr _rs; //! 
result set @@ -479,7 +481,7 @@ namespace tuplex { python::Type _hashOutputBucketType; bool hasOutputLimit() const { - return _outputLimit < std::numeric_limits::max(); + return _outputTopLimit < std::numeric_limits::max(); } }; } diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index 2868ba668..d065e86d3 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -182,7 +182,8 @@ namespace tuplex { void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! - void setOutputLimit(size_t limit) { _outLimit = limit; resetOutputLimitCounter(); } + void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } + void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; } void execute() override; @@ -249,7 +250,9 @@ namespace tuplex { double wallTime() const override { return _wallTime; } size_t output_rows_written() const { return _numOutputRowsWritten; } - size_t output_limit() const { return _outLimit; } + size_t output_top_limit() const { return _outTopLimit; } + size_t output_bottom_limit() const { return _outBottomLimit; } + private: void resetSinks(); void resetSources(); @@ -276,7 +279,8 @@ namespace tuplex { Buffer _outPrefix; std::unordered_map _outOptions; - size_t _outLimit; // limits how many rows to write at max + size_t _outTopLimit; // limits how many rows to write at max + size_t _outBottomLimit; // limits how many last rows to write at max size_t _outSkipRows; // how many rows at start to skip // memory source variables diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 845b78e6a..1cc818010 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -32,8 +32,12 @@ namespace tuplex { std::vector WorkQueue::popCompletedTasks() { TRACE_LOCK("workQueue"); - std::lock_guard lock(_completedTasksMutex); + _taskDoneMutex.lock(); + _taskDone.clear(); + _taskDoneMutex.unlock(); + + std::lock_guard lock(_completedTasksMutex); // move leads to circular dependency in gcc and thus a bug on travis-ci. Therefore, just // use the below hack to fool the compiler into actually copying the vectors // // move to reset completed tasks and return array @@ -78,59 +82,66 @@ namespace tuplex { bool WorkQueue::workTask(Executor& executor, bool nonBlocking) { IExecutorTask *task = nullptr; - if(nonBlocking) { - // @Todo: This should be put into a function "work" on the workQueue... - // dequeue from general working queue - if(_queue.try_dequeue(task)) { - if(!task) - return false; - task->setOwner(&executor); - task->setThreadNumber(executor.threadNumber()); // redundant? + // dequeue from general working queue + // Note: is this TODO: outdated? + // @Todo: This should be put into a function "work" on the workQueue... 
+ if (nonBlocking) { + if(!_queue.try_dequeue(task)) { + return false; + } + } else { + _queue.wait_dequeue(task); + } - //executor.logger().info("started task..."); - // process task - task->execute(); - // save which thread executed this task - task->setID(std::this_thread::get_id()); + if(!task) { + return false; + } + // if reach the top limit already, then don't compute the rest + size_t numTopCompleted; + TRACE_LOCK("rowsDone"); + _rowsDoneMutex.lock(); + size_t frontRowsDone = 0; + for (size_t i = 0; _rowsDone.count(i) != 0; i++) { + frontRowsDone += _rowsDone[i]; + if (frontRowsDone >= _queue.frontRowsLimit()) { + // skip execution _numPendingTasks.fetch_add(-1, std::memory_order_release); - - // add task to done list - TRACE_LOCK("completedTasks"); - _completedTasksMutex.lock(); - _completedTasks.push_back(std::move(task)); - _completedTasksMutex.unlock(); - _numCompletedTasks.fetch_add(1, std::memory_order_release); - TRACE_UNLOCK("completedTasks"); + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); return true; } - } else { - _queue.wait_dequeue(task); + } + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); - if(!task) - return false; + task->setOwner(&executor); + task->setThreadNumber(executor.threadNumber()); // redundant? - task->setOwner(&executor); - task->setThreadNumber(executor.threadNumber()); // redundant? + // executor.logger().info("started task..."); + // process task + task->execute(); + // save which thread executed this task + task->setID(std::this_thread::get_id()); - // process task - task->execute(); - // save which thread executed this task - task->setID(std::this_thread::get_id()); + _numPendingTasks.fetch_add(-1, std::memory_order_release); - // add task to done list - TRACE_LOCK("completedTasks"); - _completedTasksMutex.lock(); - _completedTasks.push_back(std::move(task)); - _completedTasksMutex.unlock(); - _numCompletedTasks.fetch_add(1, std::memory_order_release); - TRACE_UNLOCK("completedTasks"); + // add task to done list + TRACE_LOCK("completedTasks"); + _completedTasksMutex.lock(); + _completedTasks.push_back(std::move(task)); + _completedTasksMutex.unlock(); + _numCompletedTasks.fetch_add(1, std::memory_order_release); + TRACE_UNLOCK("completedTasks"); - _numPendingTasks.fetch_add(-1, std::memory_order_release); - return true; - } - return false; + TRACE_LOCK("rowsDone"); + _rowsDoneMutex.lock(); + _rowsDone[task->getOrder()] += task->getNumOutputRows(); + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); + + return true; } void WorkQueue::workUntilAllTasksFinished(tuplex::Executor &executor, bool flushPeriodicallyToPython) { diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index bed96ec5a..5a1311436 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -486,6 +486,7 @@ namespace tuplex { // check what type of input the pipeline has (memory or files) if(tstage->fileInputMode()) { + // TODO(march): deal with file input // files // input is multiple files, use split file strategy here. // and issue tasks to executor workqueue! 
@@ -550,7 +551,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); } else { @@ -584,7 +585,7 @@ namespace tuplex { } task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); num_parts++; @@ -621,7 +622,7 @@ namespace tuplex { } task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); @@ -683,7 +684,11 @@ namespace tuplex { task->setInputExceptions(tstage->inputExceptions()); task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); + if (tstage->outputBottomLimit()) { + // TODO(march): work here (task output limit generation) + } tasks.emplace_back(std::move(task)); numInputRows += partition->getNumRows(); @@ -837,7 +842,6 @@ namespace tuplex { } void LocalBackend::executeTransformStage(tuplex::TransformStage *tstage) { - Timer stageTimer; Timer timer; // for detailed measurements. @@ -937,6 +941,7 @@ namespace tuplex { } } + // TODO(march): work here (transform stage) auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); auto completedTasks = performTasks(tasks); @@ -1513,23 +1518,21 @@ namespace tuplex { WorkQueue& wq = LocalEngine::instance().getQueue(); wq.clear(); - // check if ord is set, if not issue warning & add - bool orderlessTaskFound = false; + // assign the order for all tasks for(int i = 0; i < tasks.size(); ++i) { - if(tasks[i]->getOrder().size() == 0) { - tasks[i]->setOrder(i); - orderlessTaskFound = true; - } + tasks[i]->setOrder(i); } -#ifndef NDEBUG - if(orderlessTaskFound) { - logger().debug("task without order found, please fix in code."); + // add all tasks to queue + // TODO(march): add task stage (to do striping) + for(size_t i = 0; i <= tasks.size() - i - 1; i++) { + const size_t revI = tasks.size()- i - 1 + wq.addTask(&tasks[i]); + if (revI > i) { + wq.addTask(&tasks[revI]); + } } -#endif - // add all tasks to queue - for(auto& task : tasks) wq.addTask(task); // clear tasks.clear(); @@ -1955,7 +1958,7 @@ namespace tuplex { // now simply go over the partitions and write the full buffers out // check all the params from TrafoStage - size_t limit = tstage->outputLimit(); + size_t limit = tstage->outputTopLimit(); size_t splitSize = tstage->splitSize(); size_t numOutputFiles = tstage->numOutputFiles(); URI uri = tstage->outputURI(); diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 3985fe1ab..9c22837ad 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -382,9 +382,7 @@ namespace tuplex { // set limit if output node has a limit (currently only TakeOperator) if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); - builder.setOutputLimit(top->limit()); - // TODO: work here - ... + builder.setOutputLimit(top->limit(), top->limitBottom()); } // @TODO: add slowPip builder to this process... 
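// ---------------------------------------------------------------------------
// Aside -- an illustrative, self-contained sketch (not part of the patch) of
// the semantics behind the two-sided limit wired into setOutputLimit() above:
// the first topLimit rows plus the last bottomLimit rows, as exercised by the
// TakeTest.cc cases earlier in the series. expectedTake() is an invented
// reference helper; the collect()/take-everything special case is ignored here:
#include <cassert>
#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> expectedTake(const std::vector<T>& rows, size_t topLimit, size_t bottomLimit) {
    if (topLimit + bottomLimit >= rows.size())
        return rows;   // head and tail ranges overlap: keep every row
    std::vector<T> out(rows.begin(), rows.begin() + topLimit);     // first topLimit rows
    out.insert(out.end(), rows.end() - bottomLimit, rows.end());   // last bottomLimit rows
    return out;
}

int main() {
    std::vector<int> v{1, 2, 3, 4, 5, 6};
    assert((expectedTake(v, 1, 1) == std::vector<int>{1, 6}));     // cf. takeBothTest
    assert((expectedTake(v, 2, 1) == std::vector<int>{1, 2, 6}));
    assert((expectedTake(v, 0, 3) == std::vector<int>{4, 5, 6}));  // cf. takeBottomTest
    return 0;
}
// ---------------------------------------------------------------------------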
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0f7bf7319..5e15867f7 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -98,7 +98,7 @@ namespace tuplex { Partition *first = _partitions.front(); assert(_schema == first->schema()); - auto numRows = first->getNumRows(); + auto numRows = first->getNumRows() - first->getNumSkip(); _rowsRetrieved += numRows; _partitions.pop_front(); @@ -183,6 +183,7 @@ namespace tuplex { } Row ResultSet::getNextRow() { + // TODO(march): logic in skip row count here // merge rows from objects if(!_pyobjects.empty()) { auto row_number = std::get<0>(_pyobjects.front()); diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index 0bf509ed1..bc814182b 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -50,7 +50,7 @@ namespace tuplex { : _stageNumber(stage_number), _isRootStage(rootStage), _allowUndefinedBehavior(allowUndefinedBehavior), _generateParser(generateParser), _normalCaseThreshold(normalCaseThreshold), _sharedObjectPropagation(sharedObjectPropagation), _nullValueOptimization(nullValueOptimization), _updateInputExceptions(updateInputExceptions), - _inputNode(nullptr), _outputLimit(std::numeric_limits::max()) { + _inputNode(nullptr), _outputTopLimit(std::numeric_limits::max()), _outputBottomLimit(0) { } void StageBuilder::generatePythonCode() { @@ -1426,7 +1426,8 @@ namespace tuplex { // no limit operator yet... // get limit - stage->_outputLimit = _outputLimit; + stage->_outputTopLimit = _outputTopLimit; + stage->_outputBottomLimit = _outputBottomLimit; // copy input/output configurations stage->_fileInputParameters = _fileInputParameters; diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 6eb3f2e1f..af58866dc 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -48,7 +48,8 @@ namespace tuplex { int64_t number, bool allowUndefinedBehavior) : PhysicalStage::PhysicalStage(plan, backend, number), _inputLimit(std::numeric_limits::max()), - _outputLimit(std::numeric_limits::max()), + _outputTopLimit(std::numeric_limits::max()), + _outputBottomLimit(0), _aggMode(AggregateType::AGG_NONE) { // TODO: is this code out of date? + is allowUndefinedBehavior needed here? @@ -129,7 +130,7 @@ namespace tuplex { if (partitions.empty() && interpreterRows.empty() && generalCase.empty()) _rs = emptyResultSet(); else { - std::vector limitedPartitions; + std::vector limitedPartitions, limitedTailPartitions; auto schema = Schema::UNKNOWN; if(!partitions.empty()) { @@ -138,31 +139,92 @@ namespace tuplex { assert(schema == partition->schema()); } - // check output limit, adjust partitions if necessary - size_t numOutputRows = 0; + // check top output limit, adjust partitions if necessary + size_t numTopOutputRows = 0; + Partition* lastTopPart = nullptr; + size_t clippedTop = 0; for (auto partition : partitions) { - numOutputRows += partition->getNumRows(); - // TODO(march): work here - ... 
- if (numOutputRows >= outputLimit()) { + numTopOutputRows += partition->getNumRows(); + lastTopPart = partition; + if (numTopOutputRows >= outputTopLimit()) { // clip last partition & leave loop - auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); + clippedTop = outputTopLimit() - (numTopOutputRows - partition->getNumRows()); + assert(clippedTop <= partition->getNumRows()); + break; + } else if (partition == *partitions.end()) { + // last partition, mark full row, but don't put to output set yet to avoid double put + clippedTop = partition->getNumRows(); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } + } + + // check the bottom output limit, adjust partitions if necessary + size_t numBottomOutputRows = 0; + size_t clippedBottom = 0; + for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { + auto partition = *it; + numBottomOutputRows += partition->getNumRows(); + + if (partition == lastTopPart) { + // the bottom and the top partitions are overlapping + clippedBottom = outputBottomLimit() - (numBottomOutputRows - partition->getNumRows()); + if (clippedTop + clippedBottom >= partition->getNumRows()) { + // if top and bottom range intersect, use full partitions + clippedTop = partition->getNumRows(); + clippedBottom = 0; + } + break; + } else if (numBottomOutputRows >= outputBottomLimit()) { + // clip last partition & leave loop + auto clipped = outputBottomLimit() - (numTopOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); + partition->setNumSkip(partition->getNumRows() - clippedBottom); partition->setNumRows(clipped); if (clipped > 0) - limitedPartitions.push_back(partition); + limitedTailPartitions.push_back(partition); break; } else { // put full partition to output set - limitedPartitions.push_back(partition); + limitedTailPartitions.push_back(partition); + } + } + + // push the middle partition + if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { + assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); + + // TODO(march): to work on this (split into two partitions) + // split into two partitions with both top and bottom are in the same partition + Partition* lastBottomPart = nullptr; + if (clippedBottom != 0) { + lastBottomPart = new Partition(lastTopPart); + lastBottomPart->setNumSkip(lastBottomPart->getNumRows() - clippedBottom); + lastBottomPart->setNumRows(clippedBottom); + } + + lastTopPart->setNumRows(clippedTop); + + limitedPartitions.push_back(lastTopPart); + + if (lastBottomPart != nullptr) { + limitedPartitions.push_back(lastBottomPart); } } + + // merge the head and tail partitions + std::reverse(limitedTailPartitions.begin(), limitedTailPartitions.end()); + limitedPartitions.insert(limitedPartitions.end(), limitedTailPartitions.begin(), limitedTailPartitions.end()); } // put ALL partitions to result set + // TODO(march): handle overlapping case + // TODO(march): maybe do top/bottom limit at the level instead? _rs = std::make_shared(schema, limitedPartitions, generalCase, partitionToExceptionsMap, interpreterRows, - outputLimit()); + outputTopLimit() + outputBottomLimit()); } } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index c560c4af4..a65aa7f11 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -19,10 +19,12 @@ namespace tuplex { // atomic var to count output rows! 
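// ---------------------------------------------------------------------------
// Aside -- an illustrative sketch (not part of the patch) of the limit
// protocol the write callbacks below rely on: the generated pipeline invokes
// a callback per produced row and stops consuming input once the callback
// returns a nonzero exception code, so a shared atomic counter can turn
// "limit reached" into an early exit. All names here are invented:
#include <atomic>
#include <cstdint>

static std::atomic<int64_t> g_rowsWritten{0};
static const int64_t EC_LIMIT_REACHED = 1;   // stand-in for OUTPUT_LIMIT_REACHED

// called once per output row by the (hypothetical) generated pipeline
int64_t limitedWriteCallback(int64_t limit) {
    if (g_rowsWritten >= limit)
        return EC_LIMIT_REACHED;   // signals the pipeline to stop
    ++g_rowsWritten;
    return 0;                      // success, keep producing rows
}
// ---------------------------------------------------------------------------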
- static std::atomic_int64_t g_totalOutputRows; + static std::atomic_int64_t g_totalTopOutputRows; + static std::atomic_int64_t g_totalBottomOutputRows; void TransformTask::resetOutputLimitCounter() { - g_totalOutputRows = 0; + g_totalTopOutputRows = 0; + g_totalBottomOutputRows = 0; } } @@ -41,7 +43,8 @@ extern "C" { static int64_t limited_w2mCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalOutputRows >= task->output_limit()) { + // TODO(march): comment this out + if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); } @@ -49,10 +52,10 @@ extern "C" { assert(dynamic_cast(task)); auto rc = task->writeRowToMemory(buf, bufSize); if(0 == rc) - tuplex::g_totalOutputRows++; + tuplex::g_totalTopOutputRows++; // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalOutputRows >= task->output_limit()) { + if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); } return rc; @@ -513,7 +516,8 @@ namespace tuplex { _outputFilePath = URI::INVALID; _outFile.reset(nullptr); _outPrefix.reset(); - _outLimit = std::numeric_limits::max(); // write all rows + _outTopLimit = std::numeric_limits::max(); // write all rows + _outBottomLimit = 0; _outSkipRows = 0; // skip no rows // reset memory sink @@ -619,6 +623,7 @@ namespace tuplex { auto functor = reinterpret_cast(_functor); + // TODO(march): question here? // go over all input partitions. for(const auto &inputPartition : _inputPartitions) { // lock ptr, extract number of rows ==> store them @@ -678,7 +683,7 @@ namespace tuplex { // skip rows? limit rows?? - if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outLimit - _outSkipRows)) { + if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outTopLimit - _outSkipRows)) { if(_outFile->write(buf, bufSize) != VirtualFileSystemStatus::VFS_OK) return ecToI32(ExceptionCode::IOERROR); } diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h index 8434896a7..01f7137f1 100644 --- a/tuplex/utils/include/mt/ITask.h +++ b/tuplex/utils/include/mt/ITask.h @@ -29,7 +29,7 @@ namespace tuplex { std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. //! Per default object is constructed that does not represent a thread - std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages + size_t _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages public: ITask() {}; @@ -51,33 +51,21 @@ namespace tuplex { _id = id; } - void setOrder(size_t order) { _orderNumbers = std::vector{order}; } - -// size_t getOrder(const size_t nth = 0) const { -// return _orderNumbers[nth]; -// } - std::vector getOrder() const { return _orderNumbers; } - - void setOrder(const std::vector& order) { + void setOrder(size_t order) { _orderNumbers = order; } + size_t getOrder() const { + return _orderNumbers; + } + /*! * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones * @param other * @return */ bool compareAscOrder(const ITask& other) const { - // make sure they have the same length - assert(_orderNumbers.size() == other._orderNumbers.size()); - - // this < other? 
- // compare one by one - for(int i = 0; i < other._orderNumbers.size(); ++i) { - if(_orderNumbers[i] >= other._orderNumbers[i]) - return false; - } - return true; + return _orderNumbers[i] < other._orderNumbers[i]; } }; } From a506d88be56f5f69e1ed2833907e0867c2dda734 Mon Sep 17 00:00:00 2001 From: korlamarch Date: Wed, 9 Mar 2022 13:00:17 -0500 Subject: [PATCH 33/56] Rework LocalBackend and TransformTask to support top and bottom limit --- tuplex/core/include/Executor.h | 6 - tuplex/core/include/Partition.h | 14 -- tuplex/core/include/ee/local/LocalBackend.h | 3 + tuplex/core/include/physical/TransformStage.h | 12 +- tuplex/core/src/Executor.cc | 24 --- tuplex/core/src/ee/local/LocalBackend.cc | 182 ++++++++++++++++-- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 1 - tuplex/core/src/physical/TransformStage.cc | 86 +-------- tuplex/core/src/physical/TransformTask.cc | 54 ++++-- tuplex/utils/include/mt/ITask.h | 85 ++++---- 11 files changed, 270 insertions(+), 199 deletions(-) diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 3631f7e7d..7eaaee244 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -51,12 +51,6 @@ namespace tuplex { std::atomic_int _numPendingTasks{}; std::atomic_int _numCompletedTasks{}; - // mapping from order number -> row count if the task is finished - std::mutex _rowsDoneMutex; - std::map _rowsDone; - - std::atomic_int _frontRowsLimit{}; - std::atomic_int _bottomRowsLimit{}; public: WorkQueue(); diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h index 24b79cc8f..8bf112051 100644 --- a/tuplex/core/include/Partition.h +++ b/tuplex/core/include/Partition.h @@ -69,7 +69,6 @@ namespace tuplex { void loadFromFile(const URI& uri); int64_t _numRows; - int64_t _numSkip; // number of rows to skip, currently only used at the output (Result set) uint64_t _bytesWritten; Schema _schema; //! Schema of the partition. May be optimized away later. @@ -249,19 +248,6 @@ namespace tuplex { _mutex.unlock(); } - size_t getNumSkip() { - size_t res = 0; - _mutex.lock(); - res = num_skip; - _mutex.unlock(); - return res; - } - - void setNumSkip(const size_t numSkip) { - _mutex.lock(); - _numSkip = numSkip; - _mutex.unlock(); - } int64_t getDataSetID() const { return _dataSetID; } diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 77d375aed..0dbfafdc9 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -88,6 +88,9 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); + Partition* newPartitionWithSkipRows(Partition* p_in, int numToSkip, TransformStage* tstage); + // write output (may be already in correct format!) void writeOutput(TransformStage* tstage, std::vector& sortedTasks); diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index e63eaec31..f489f1f6c 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -107,11 +107,13 @@ namespace tuplex { std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } /*! 
- * sets maximum number of rows this pipeline will produce - * @param outputLimit + * sets maximum number of top rows this pipeline will produce + * @param topLimit + * @param bottomLimit */ - void setOutputLimit(size_t outputLimit) { - _outputTopLimit = outputLimit; + inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { + _outputTopLimit = topLimit; + _outputBottomLimit = bottomLimit; // @TODO: move this logic to physical plan! // pushdown limit @@ -481,7 +483,7 @@ namespace tuplex { python::Type _hashOutputBucketType; bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max(); + return _outputTopLimit < std::numeric_limits::max() && _outputBottomLimit != 0; } }; } diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 1cc818010..388199e4d 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -98,24 +98,6 @@ namespace tuplex { return false; } - // if reach the top limit already, then don't compute the rest - size_t numTopCompleted; - TRACE_LOCK("rowsDone"); - _rowsDoneMutex.lock(); - size_t frontRowsDone = 0; - for (size_t i = 0; _rowsDone.count(i) != 0; i++) { - frontRowsDone += _rowsDone[i]; - if (frontRowsDone >= _queue.frontRowsLimit()) { - // skip execution - _numPendingTasks.fetch_add(-1, std::memory_order_release); - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - return true; - } - } - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - task->setOwner(&executor); task->setThreadNumber(executor.threadNumber()); // redundant? @@ -135,12 +117,6 @@ namespace tuplex { _numCompletedTasks.fetch_add(1, std::memory_order_release); TRACE_UNLOCK("completedTasks"); - TRACE_LOCK("rowsDone"); - _rowsDoneMutex.lock(); - _rowsDone[task->getOrder()] += task->getNumOutputRows(); - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - return true; } diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 5a1311436..dbceaa1b9 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -486,7 +486,6 @@ namespace tuplex { // check what type of input the pipeline has (memory or files) if(tstage->fileInputMode()) { - // TODO(march): deal with file input // files // input is multiple files, use split file strategy here. // and issue tasks to executor workqueue! 
@@ -686,9 +685,6 @@ namespace tuplex {
                     task->setStageID(tstage->getID());
                     task->setOutputTopLimit(tstage->outputTopLimit());
                     task->setOutputBottomLimit(tstage->outputBottomLimit());
-                    if (tstage->outputBottomLimit()) {
-                        // TODO(march): work here (task output limit generation)
-                    }
 
                     tasks.emplace_back(std::move(task));
                     numInputRows += partition->getNumRows();
@@ -698,6 +694,31 @@ namespace tuplex {
             }
         }
 
+        // assign the order for all tasks
+        for(size_t i = 0; i < tasks.size(); ++i) {
+            tasks[i]->setOrder(i);
+        }
+
+        if (tstage->hasOutputLimit()) {
+            if (tstage->outputTopLimit() > 0 && tstage->outputBottomLimit() > 0) {
+                // do task striping for output limit on both ends
+                std::vector<IExecutorTask*> newTasks;
+                for(size_t i = 0; i < tasks.size() - i; i++) {
+                    const size_t rev_i = tasks.size() - 1 - i;
+                    newTasks.push_back(tasks[i]);
+                    if (i < rev_i) {
+                        newTasks.push_back(tasks[rev_i]);
+                    }
+                }
+                assert(tasks.size() == newTasks.size());
+                tasks.swap(newTasks);
+            } else if (tstage->outputBottomLimit() > 0) {
+                // bottom limit only, just reverse the task order
+                std::reverse(tasks.begin(), tasks.end());
+            }
+            // if only the top limit is set, the existing order is already correct
+        }
+
         return tasks;
     }
 
@@ -941,8 +962,8 @@ namespace tuplex {
             }
         }
 
-        // TODO(march): work here (transform stage)
         auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms);
+
         auto completedTasks = performTasks(tasks);
 
         // Note: this doesn't work yet because of the globals.
@@ -1175,6 +1196,10 @@ namespace tuplex {
                     rowDelta += taskNonConformingRows.size();
                 }
 
+                if (tstage->hasOutputLimit()) {
+                    trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage);
+                }
+
                 tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts);
                 break;
             }
@@ -1518,21 +1543,29 @@ namespace tuplex {
         WorkQueue& wq = LocalEngine::instance().getQueue();
         wq.clear();
 
-        // assign the order for all tasks
+        // check if ord is set, if not issue warning & add
+        bool orderlessTaskFound = false;
         for(int i = 0; i < tasks.size(); ++i) {
-            tasks[i]->setOrder(i);
+            if(tasks[i]->getOrder().size() == 0) {
+                tasks[i]->setOrder(i);
+                orderlessTaskFound = true;
+            }
         }
 
-        // add all tasks to queue
-        // TODO(march): add task stage (to do striping)
-        for(size_t i = 0; i <= tasks.size() - i - 1; i++) {
-            const size_t revI = tasks.size()- i - 1
-            wq.addTask(&tasks[i]);
-            if (revI > i) {
-                wq.addTask(&tasks[revI]);
-            }
+#ifndef NDEBUG
+        if(orderlessTaskFound) {
+            logger().debug("task without order found, please fix in code.");
         }
+#endif
+
+        for (int i = 0; i < tasks.size(); i++) {
+            // the take limit only works with a uniform task order
+            assert(tasks[i]->getOrder(0) == i);
+        }
 
+        // add all tasks to queue
+        for(auto& task : tasks) wq.addTask(task);
+
         // clear
         tasks.clear();
 
@@ -2083,4 +2116,123 @@ namespace tuplex {
         Logger::instance().defaultLogger().info("writing output took " + std::to_string(timer.time()) + "s");
         tstage->setFileResult(ecounts);
     }
+
+    void LocalBackend::trimPartitionsToLimit(std::vector<Partition*> &partitions,
+                                             size_t topLimit,
+                                             size_t bottomLimit,
+                                             TransformStage* tstage) {
+        std::vector<Partition*> limitedPartitions, limitedTailPartitions;
+
+        // check top output limit, adjust partitions if necessary
+        size_t numTopOutputRows = 0;
+        Partition* lastTopPart = nullptr;
+        size_t clippedTop = 0;
+        for (auto partition : partitions) {
+            numTopOutputRows += partition->getNumRows();
+            lastTopPart = partition;
+            if (numTopOutputRows >= topLimit) {
+                // clip last partition & leave loop
+                clippedTop = topLimit - (numTopOutputRows - partition->getNumRows());
+                assert(clippedTop <= partition->getNumRows());
+                break;
+            } else if (partition == partitions.back()) {
+                // last partition, keep all rows, but don't push it yet to avoid a double push
+                clippedTop = partition->getNumRows();
+                break;
+            } else {
+                // put full partition to output set
+                limitedPartitions.push_back(partition);
+            }
+        }
+
+        // check the bottom output limit, adjust partitions if necessary
+        size_t numBottomOutputRows = 0;
+        size_t clippedBottom = 0;
+        for (auto it = partitions.rbegin(); it != partitions.rend(); it++) {
+            auto partition = *it;
+            numBottomOutputRows += partition->getNumRows();
+
+            if (partition == lastTopPart) {
+                // the bottom and the top partitions are overlapping
+                clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows());
+                if (clippedTop + clippedBottom >= partition->getNumRows()) {
+                    // if top and bottom range intersect, use full partitions
+                    clippedTop = partition->getNumRows();
+                    clippedBottom = 0;
+                }
+                break;
+            } else if (numBottomOutputRows >= bottomLimit) {
+                // clip last partition & leave loop
+                auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows());
+                assert(clipped <= partition->getNumRows());
+                Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage);
+                if (newPart) {
+                    partition->invalidate();
+                    partition = newPart;
+                }
+                assert(partition->getNumRows() == clipped);
+                if (clipped > 0)
+                    limitedTailPartitions.push_back(partition);
+                break;
+            } else {
+                // put full partition to output set
+                limitedTailPartitions.push_back(partition);
+            }
+        }
+
+        // push the middle partition
+        if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) {
+            assert(clippedTop + clippedBottom <= lastTopPart->getNumRows());
+
+            // split into two partitions when both the top and the bottom range fall into the same partition
+            Partition* lastBottomPart = nullptr;
+
+            if (clippedBottom != 0) {
+                lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage);
+            }
+
+            lastTopPart->setNumRows(clippedTop);
+
+            limitedPartitions.push_back(lastTopPart);
+
+            if (lastBottomPart != nullptr) {
+                limitedPartitions.push_back(lastBottomPart);
+            }
+        }
+
+        // merge the head and tail partitions
+        partitions.clear();
+        partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end());
+        partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend());
+    }
+
+    Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, int numToSkip, TransformStage* tstage) {
+        if(!numToSkip)
+            return nullptr;
+
+        auto ptr = p_in->lockRaw();
+        auto num_rows = *((int64_t*) ptr);
+        assert(numToSkip < num_rows);
+
+        Partition *p_out = _driver->allocWritablePartition(num_rows - numToSkip + sizeof(int64_t),
+                                                           tstage->outputSchema(), tstage->outputDataSetID(),
+                                                           tstage->context().id());
+
+        ptr += sizeof(int64_t);
+        size_t numBytesToSkip = 0;
+
+        for(unsigned i = 0; i < numToSkip; ++i) {
+            Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip);
+            ptr += r.serializedLength();
+            numBytesToSkip += r.serializedLength();
+        }
+
+        auto ptr_out = p_out->lockRaw();
+        *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip;
+        ptr_out += sizeof(int64_t);
+        memcpy(ptr_out, ptr, p_in->size() - numBytesToSkip);
+        p_out->unlock();
+
+        p_in->unlock();
+
+        return p_out;
+    }
 } // namespace tuplex
\ No newline at end of file
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 9c22837ad..ff67e4add 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -240,7 +240,7 @@ namespace tuplex {
             // user wants to merge exceptions in order.
             bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER();
 
-            // create trafostage via builder pattern
+            // create transform stage via builder pattern
             auto builder = codegen::StageBuilder(_num_stages++,
                                                  isRootStage,
                                                  _context.getOptions().UNDEFINED_BEHAVIOR_FOR_OPERATORS(),
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc
index 5e15867f7..e31e78cec 100644
--- a/tuplex/core/src/physical/ResultSet.cc
+++ b/tuplex/core/src/physical/ResultSet.cc
@@ -183,7 +183,6 @@ namespace tuplex {
     }
 
     Row ResultSet::getNextRow() {
-        // TODO(march): logic in skip row count here
         // merge rows from objects
         if(!_pyobjects.empty()) {
             auto row_number = std::get<0>(_pyobjects.front());
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index af58866dc..6e216ac5c 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -130,99 +130,23 @@ namespace tuplex {
         if (partitions.empty() && interpreterRows.empty() && generalCase.empty())
             _rs = emptyResultSet();
         else {
-            std::vector<Partition*> limitedPartitions, limitedTailPartitions;
             auto schema = Schema::UNKNOWN;
 
             if(!partitions.empty()) {
+                size_t totalRowsCount = 0;
                 schema = partitions.front()->schema();
                 for (auto partition : partitions) {
                     assert(schema == partition->schema());
+                    totalRowsCount += partition->getNumRows();
                 }
 
-                // check top output limit, adjust partitions if necessary
-                size_t numTopOutputRows = 0;
-                Partition* lastTopPart = nullptr;
-                size_t clippedTop = 0;
-                for (auto partition : partitions) {
-                    numTopOutputRows += partition->getNumRows();
-                    lastTopPart = partition;
-                    if (numTopOutputRows >= outputTopLimit()) {
-                        // clip last partition & leave loop
-                        clippedTop = outputTopLimit() - (numTopOutputRows - partition->getNumRows());
-                        assert(clippedTop <= partition->getNumRows());
-                        break;
-                    } else if (partition == *partitions.end()) {
-                        // last partition, mark full row, but don't put to output set yet to avoid double put
-                        clippedTop = partition->getNumRows();
-                        break;
-                    } else {
-                        // put full partition to output set
-                        limitedPartitions.push_back(partition);
-                    }
-                }
-
-                // check the bottom output limit, adjust partitions if necessary
-                size_t numBottomOutputRows = 0;
-                size_t clippedBottom = 0;
-                for (auto it = partitions.rbegin(); it != partitions.rend(); it++) {
-                    auto partition = *it;
-                    numBottomOutputRows += partition->getNumRows();
-
-                    if (partition == lastTopPart) {
-                        // the bottom and the top partitions are overlapping
-                        clippedBottom = outputBottomLimit() - (numBottomOutputRows - partition->getNumRows());
-                        if (clippedTop + clippedBottom >= partition->getNumRows()) {
-                            // if top and bottom range intersect, use full partitions
-                            clippedTop = partition->getNumRows();
-                            clippedBottom = 0;
-                        }
-                        break;
-                    } else if (numBottomOutputRows >= outputBottomLimit()) {
-                        // clip last partition & leave loop
-                        auto clipped = outputBottomLimit() - (numTopOutputRows - partition->getNumRows());
-                        assert(clipped <= partition->getNumRows());
-                        partition->setNumSkip(partition->getNumRows() - clippedBottom);
-                        partition->setNumRows(clipped);
-                        if (clipped > 0)
-                            limitedTailPartitions.push_back(partition);
-                        break;
-                    } else {
-                        // put full 
partition to output set - limitedTailPartitions.push_back(partition); - } - } - - // push the middle partition - if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { - assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); - - // TODO(march): to work on this (split into two partitions) - // split into two partitions with both top and bottom are in the same partition - Partition* lastBottomPart = nullptr; - if (clippedBottom != 0) { - lastBottomPart = new Partition(lastTopPart); - lastBottomPart->setNumSkip(lastBottomPart->getNumRows() - clippedBottom); - lastBottomPart->setNumRows(clippedBottom); - } - - lastTopPart->setNumRows(clippedTop); - - limitedPartitions.push_back(lastTopPart); - - if (lastBottomPart != nullptr) { - limitedPartitions.push_back(lastBottomPart); - } + if (hasOutputLimit()) { + assert(totalRowsCount == _outputTopLimit + _outputBottomLimit); } - - // merge the head and tail partitions - std::reverse(limitedTailPartitions.begin(), limitedTailPartitions.end()); - limitedPartitions.insert(limitedPartitions.end(), limitedTailPartitions.begin(), limitedTailPartitions.end()); } // put ALL partitions to result set - // TODO(march): handle overlapping case - // TODO(march): maybe do top/bottom limit at the level instead? - _rs = std::make_shared(schema, limitedPartitions, + _rs = std::make_shared(schema, partitions, generalCase, partitionToExceptionsMap, interpreterRows, outputTopLimit() + outputBottomLimit()); } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index a65aa7f11..49d104bcc 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -22,9 +22,14 @@ namespace tuplex { static std::atomic_int64_t g_totalTopOutputRows; static std::atomic_int64_t g_totalBottomOutputRows; + // mapping from order number -> row count if the task is finished + static std::mutex g_rowsDoneMutex; + static std::map g_rowsDone; + void TransformTask::resetOutputLimitCounter() { g_totalTopOutputRows = 0; g_totalBottomOutputRows = 0; + g_rowsDone.clear(); } } @@ -42,23 +47,9 @@ extern "C" { } static int64_t limited_w2mCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { - // i.e. check here how many output rows, if already limit reached - jump to goto! - // TODO(march): comment this out - if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { - return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); - } - assert(task); assert(dynamic_cast(task)); - auto rc = task->writeRowToMemory(buf, bufSize); - if(0 == rc) - tuplex::g_totalTopOutputRows++; - - // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { - return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); - } - return rc; + return task->writeRowToMemory(buf, bufSize); } static int64_t limited_w2fCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { @@ -623,9 +614,36 @@ namespace tuplex { auto functor = reinterpret_cast(_functor); - // TODO(march): question here? // go over all input partitions. 
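// ---------------------------------------------------------------------------
// Aside -- an illustrative sketch (not part of the patch) of the early-exit
// test performed against g_rowsDone inside the loop below: a task may skip its
// work once a contiguous prefix of task orders has already produced the top
// limit and a contiguous suffix has produced the bottom limit. Unlike the WIP
// code with its hard-coded upper bound of 100, this sketch assumes the total
// task count is known and passed in:
#include <cstddef>
#include <map>

bool enoughRowsDone(const std::map<size_t, size_t>& rowsDone, size_t numTasks,
                    size_t topLimit, size_t bottomLimit) {
    size_t top = 0;
    bool topReached = (topLimit == 0);
    for (size_t i = 0; !topReached && rowsDone.count(i) != 0; ++i) {
        top += rowsDone.at(i);
        topReached = (top >= topLimit);
    }
    size_t bottom = 0;
    bool bottomReached = (bottomLimit == 0);
    for (size_t i = numTasks; !bottomReached && i > 0; --i) {
        auto it = rowsDone.find(i - 1);
        if (it == rowsDone.end())
            break;   // suffix of finished tasks is no longer contiguous
        bottom += it->second;
        bottomReached = (bottom >= bottomLimit);
    }
    return topReached && bottomReached;
}
// ---------------------------------------------------------------------------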
for(const auto &inputPartition : _inputPartitions) {
+            size_t numTopCompleted = 0;
+            size_t numBottomCompleted = 0;
+            bool isTopLimitReached = false;
+            bool isBottomLimitReached = false;
+
+            tuplex::g_rowsDoneMutex.lock();
+            for (size_t i = 0; tuplex::g_rowsDone.count(i) != 0; i++) {
+                numTopCompleted += tuplex::g_rowsDone[i];
+                if (numTopCompleted >= _outTopLimit) {
+                    isTopLimitReached = true;
+                    break;
+                }
+            }
+            // TODO: what is the max task number here
+            for (size_t i = 100; tuplex::g_rowsDone.count(i) != 0; i--) {
+                numBottomCompleted += tuplex::g_rowsDone[i];
+                if (numBottomCompleted >= _outBottomLimit) {
+                    isBottomLimitReached = true;
+                    break;
+                }
+            }
+            tuplex::g_rowsDoneMutex.unlock();
+
+            if (isTopLimitReached && isBottomLimitReached) {
+                // skip the execution, enough is done
+                break;
+            }
+
             // lock ptr, extract number of rows ==> store them
             // lock raw & call functor!
             int64_t inSize = inputPartition->size();
 
             // delete partition if desired...
             if(_invalidateSourceAfterUse)
                 inputPartition->invalidate();
+
+            tuplex::g_rowsDoneMutex.lock();
+            tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows();
+            tuplex::g_rowsDoneMutex.unlock();
         }
 
 #ifndef NDEBUG
diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h
index 8434896a7..01f7137f1 100644
--- a/tuplex/utils/include/mt/ITask.h
+++ b/tuplex/utils/include/mt/ITask.h
@@ -21,52 +21,65 @@
 
 namespace tuplex {
 
+/*!
+ * interface for defining tasks that can be run via a threadpool
+ */
+class ITask {
+private:
+    std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread.
+//! Per default object is constructed that does not represent a thread
+
+    std::vector<size_t> _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages
+
+public:
+    ITask() {};
+    ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {}
+    virtual ~ITask() = default;
+    ITask(ITask&& other) = default;
+    ITask& operator = (ITask&& other) = default;
+
     /*!
-     * interface for defining tasks that can be run via a threadpool
+     * interface to run a task
      */
-    class ITask {
-    private:
-        std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread.
-//! Per default object is constructed that does not represent a thread
+    virtual void execute() = 0;
 
-        size_t _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages
+    std::thread::id getID() {
+        return _id;
+    }
 
-    public:
-        ITask() {};
-        ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {}
-        virtual ~ITask() = default;
-        ITask(ITask&& other) = default;
-        ITask& operator = (ITask&& other) = default;
+    void setID(const std::thread::id& id) {
+        _id = id;
+    }
 
-        /*!
-         * interface to run a task
-         */
-        virtual void execute() = 0;
+    void setOrder(size_t order) { _orderNumbers = std::vector<size_t>{order}; }
 
-        std::thread::id getID() {
-            return _id;
-        }
+    size_t getOrder(const size_t nth) const {
+        return _orderNumbers[nth];
+    }
 
-        void setID(const std::thread::id& id) {
-            _id = id;
-        }
+    std::vector<size_t> getOrder() const { return _orderNumbers; }
 
-        void setOrder(size_t order) {
-            _orderNumbers = order;
-        }
+    void setOrder(const std::vector<size_t>& order) {
+        _orderNumbers = order;
+    }
 
-        size_t getOrder() const {
-            return _orderNumbers;
-        }
+    /*! 
+ * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones + * @param other + * @return + */ + bool compareAscOrder(const ITask& other) const { + // make sure they have the same length + assert(_orderNumbers.size() == other._orderNumbers.size()); - /*! - * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones - * @param other - * @return - */ - bool compareAscOrder(const ITask& other) const { - return _orderNumbers[i] < other._orderNumbers[i]; + // this < other? + // compare one by one + for(int i = 0; i < other._orderNumbers.size(); ++i) { + if(_orderNumbers[i] >= other._orderNumbers[i]) + return false; } - }; + return true; + } +}; } #endif //TUPLEX_ITASK_H \ No newline at end of file From 26ed614138e593bf38dae7a44c25ec6dbe278bf2 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:16:11 -0400 Subject: [PATCH 34/56] Address Review Comments Address Review Comments (2) --- tuplex/core/include/DataSet.h | 5 +- tuplex/core/include/EmptyDataset.h | 4 +- tuplex/core/include/ErrorDataSet.h | 4 +- tuplex/core/include/Executor.h | 10 +- tuplex/core/include/ee/local/LocalBackend.h | 2 +- tuplex/core/include/logical/TakeOperator.h | 12 +-- tuplex/core/include/physical/ResultSet.h | 2 - tuplex/core/include/physical/StageBuilder.h | 5 +- tuplex/core/include/physical/TransformStage.h | 8 +- tuplex/core/src/DataSet.cc | 12 +-- tuplex/core/src/EmptyDataset.cc | 6 +- tuplex/core/src/ErrorDataSet.cc | 6 +- tuplex/core/src/Executor.cc | 4 - tuplex/core/src/ee/local/LocalBackend.cc | 19 ++-- tuplex/core/src/logical/TakeOperator.cc | 6 +- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 2 +- tuplex/core/src/physical/StageBuilder.cc | 2 +- tuplex/python/src/PythonDataSet.cc | 2 +- tuplex/python/tuplex/dataset.py | 15 +-- tuplex/utils/include/mt/ITask.h | 94 +++++++++---------- 21 files changed, 104 insertions(+), 118 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 65a766a87..f6bb97f2c 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -263,13 +263,12 @@ namespace tuplex { // these are actions that cause execution virtual std::shared_ptr collect(std::ostream &os = std::cout); - virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream &os = std::cout); + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); virtual std::vector collectAsVector(std::ostream &os = std::cout); - virtual std::vector takeAsVector(int64_t numElements, std::ostream &os = std::cout); + virtual std::vector takeAsVector(size_t numElements, std::ostream &os = std::cout); - /*! * saves dataset to file. There are multiple options to control the behavior * ==> 1.) files can be split across multiple ones. 
Specify number of files to split rows to diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 0f8a1f52c..6fc3219a4 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -70,13 +70,13 @@ namespace tuplex { virtual std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; virtual std::vector collectAsVector(std::ostream& os) override; // take / collect will print out the error only - virtual std::vector takeAsVector(int64_t numElements, std::ostream& os) override; + virtual std::vector takeAsVector(size_t numElements, std::ostream& os) override; DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized) override { return *this; diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h index 34fc60685..cf283ebd1 100644 --- a/tuplex/core/include/ErrorDataSet.h +++ b/tuplex/core/include/ErrorDataSet.h @@ -90,13 +90,13 @@ namespace tuplex { std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; + std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; std::vector collectAsVector(std::ostream& os) override; // take / collect will print out the error only - std::vector takeAsVector(int64_t numElements, std::ostream& os) override; + std::vector takeAsVector(size_t numElements, std::ostream& os) override; }; } diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 7eaaee244..b6b7edac1 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -44,7 +44,7 @@ namespace tuplex { */ class WorkQueue { private: - std::atomic_bool _done{}; // protects against data races + std::atomic_bool _done; // protects against data races ExecutorTaskQueueType _queue; std::mutex _completedTasksMutex; std::vector _completedTasks; @@ -75,14 +75,6 @@ namespace tuplex { size_t numCompletedTasks() const { return _numCompletedTasks; } - size_t frontRowsLimit() const { - return _frontRowsLimit; - }; - - size_t bottomRowsLimit() const { - return _bottomRowsLimit; - }; - /*! * stop working on this queue & dump all tasks */ diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 0dbfafdc9..d7a5ec25b 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -89,7 +89,7 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); - Partition* newPartitionWithSkipRows(Partition* p_in, int numToSkip, TransformStage* tstage); + Partition* newPartitionWithSkipRows(Partition* p_in, size_t numToSkip, TransformStage* tstage); // write output (may be already in correct format!) 
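// ---------------------------------------------------------------------------
// Aside -- an illustrative sketch (not part of the patch) of the row
// arithmetic behind trimPartitionsToLimit() declared above, reduced to
// per-partition row counts: keep partitions from the front until the top
// limit is covered, keep partitions from the back until the bottom limit is
// covered (leading rows of the first kept tail partition are dropped, cf.
// newPartitionWithSkipRows), and overlapping ranges collapse to keeping
// everything. trimmedCounts() is an invented name:
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// returns how many rows each partition keeps after trimming
std::vector<size_t> trimmedCounts(const std::vector<size_t>& parts,
                                  size_t topLimit, size_t bottomLimit) {
    size_t total = std::accumulate(parts.begin(), parts.end(), size_t(0));
    if (topLimit + bottomLimit >= total)
        return parts;                    // ranges overlap: keep all rows
    std::vector<size_t> kept(parts.size(), 0);
    size_t need = topLimit;
    for (size_t i = 0; i < parts.size() && need > 0; ++i) {   // head share
        kept[i] = std::min(parts[i], need);
        need -= kept[i];
    }
    need = bottomLimit;
    for (size_t i = parts.size(); i-- > 0 && need > 0; ) {    // tail share
        size_t take = std::min(parts[i], need);
        kept[i] += take;                 // head and tail may meet in one partition
        need -= take;
    }
    return kept;
}
// ---------------------------------------------------------------------------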
void writeOutput(TransformStage* tstage, std::vector& sortedTasks); diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index b5dd5db6e..fe5f1b0f2 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -17,16 +17,16 @@ namespace tuplex { class TakeOperator : public LogicalOperator { private: - int64_t _limitTop; - int64_t _limitBottom; + size_t _topLimit; + size_t _bottomLimit; public: LogicalOperator *clone() override; public: - TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom); + TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit); std::string name() override { - if(_limitTop < 0 || std::numeric_limits::max() == _limitTop) + if(_topLimit == 0 && _bottomLimit == 0) return "collect"; return "take"; } @@ -38,9 +38,9 @@ namespace tuplex { bool good() const override; - int64_t limit() { return _limitTop; } + size_t topLimit() const { return _topLimit; } - int64_t limitBottom() { return _limitBottom; } + size_t bottomLimit() const { return _bottomLimit; } std::vector getSample(const size_t num) const override; diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h index 5e69fef3a..e94b8f1ae 100644 --- a/tuplex/core/include/physical/ResultSet.h +++ b/tuplex/core/include/physical/ResultSet.h @@ -36,8 +36,6 @@ namespace tuplex { size_t _rowsRetrieved; size_t _totalRowCounter; // used for merging in rows! size_t _maxRows; - size_t _maxRowsTop; - size_t _maxRowsBottom; Schema _schema; void removeFirstPartition(); diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index e678ead3d..83e63208a 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -76,7 +76,7 @@ namespace tuplex { void addFileInput(FileInputOperator* csvop); void addFileOutput(FileOutputOperator* fop); - inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { + inline void setOutputLimit(size_t topLimit, size_t bottomLimit = 0) { _outputTopLimit = topLimit; _outputBottomLimit = bottomLimit; } @@ -158,8 +158,9 @@ namespace tuplex { size_t number() const { return _stageNumber; } int64_t outputDataSetID() const; + // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything inline bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max() || _outputBottomLimit > 0; + return _outputTopLimit != 0 || _outputBottomLimit != 0; } inline char csvOutputDelimiter() const { diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index f489f1f6c..05c7df448 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -393,6 +393,10 @@ namespace tuplex { */ void setDataAggregationMode(const AggregateType& t) { _aggMode = t; } + // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything + bool hasOutputLimit() const { + return _outputTopLimit != 0 || _outputBottomLimit != 0; + } private: /*! 
* creates a new TransformStage with generated code @@ -481,10 +485,6 @@ namespace tuplex { // for hash output, the key and bucket type python::Type _hashOutputKeyType; python::Type _hashOutputBucketType; - - bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max() && _outputBottomLimit != 0; - } }; } #endif //TUPLEX_TRANSFORMSTAGE_H \ No newline at end of file diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 3de903d1c..c11482f86 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -38,21 +38,17 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(-1, false, os); + return take(0, 0, os); } - std::shared_ptr DataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr DataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { // error dataset? if (isError()) throw std::runtime_error("is error dataset!"); - // negative numbers mean get all elements! - if (numTop < 0) - numTop = std::numeric_limits::max(); - // create a take node assert(_context); - LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numTop, numBottom)); + LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, topLimit, bottomLimit)); DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); dsptr->_operator = op; op->setDataSet(dsptr); @@ -70,7 +66,7 @@ namespace tuplex { } // -1 means to retrieve all elements - std::vector DataSet::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { auto rs = take(numElements, false, os); Timer timer; diff --git a/tuplex/core/src/EmptyDataset.cc b/tuplex/core/src/EmptyDataset.cc index 7504e8499..3664a591a 100644 --- a/tuplex/core/src/EmptyDataset.cc +++ b/tuplex/core/src/EmptyDataset.cc @@ -11,16 +11,16 @@ #include namespace tuplex { - std::shared_ptr EmptyDataset::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr EmptyDataset::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { return std::make_shared(); } - std::vector EmptyDataset::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector EmptyDataset::takeAsVector(size_t numElements, std::ostream &os) { return std::vector{}; } std::shared_ptr EmptyDataset::collect(std::ostream &os) { - return take(0, false, os); + return take(0, 0, os); } std::vector EmptyDataset::collectAsVector(std::ostream &os) { diff --git a/tuplex/core/src/ErrorDataSet.cc b/tuplex/core/src/ErrorDataSet.cc index 9d19594f2..c87999e5f 100644 --- a/tuplex/core/src/ErrorDataSet.cc +++ b/tuplex/core/src/ErrorDataSet.cc @@ -12,7 +12,7 @@ namespace tuplex { - std::vector ErrorDataSet::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector ErrorDataSet::takeAsVector(size_t numElements, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -23,7 +23,7 @@ namespace tuplex { return takeAsVector(0, os); } - std::shared_ptr ErrorDataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr ErrorDataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -31,7 +31,7 @@ namespace tuplex { } std::shared_ptr ErrorDataSet::collect(std::ostream &os) { - return take(0, false, os); + return take(0, 0, os); } void diff --git 
a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 388199e4d..acfdd0aa6 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -33,10 +33,6 @@ namespace tuplex { std::vector WorkQueue::popCompletedTasks() { TRACE_LOCK("workQueue"); - _taskDoneMutex.lock(); - _taskDone.clear(); - _taskDoneMutex.unlock(); - std::lock_guard lock(_completedTasksMutex); // move leads to circular dependency in gcc and thus a bug on travis-ci. Therefore, just // use the below hack to fool the compiler into actually copying the vectors diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index dbceaa1b9..022d5a036 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -551,6 +551,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); } else { @@ -585,6 +586,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); num_parts++; @@ -622,6 +624,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); @@ -1197,7 +1200,7 @@ namespace tuplex { } if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit()); + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); } tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); @@ -1560,7 +1563,7 @@ namespace tuplex { for (int i = 0; i < tasks.size(); i++) { // take limit only work with uniform order - assert(task.getOrder(0) == i); + assert(tasks[i]->getOrder(0) == i); } // add all tasks to queue @@ -2165,9 +2168,9 @@ namespace tuplex { // clip last partition & leave loop auto clipped = bottomLimit - (numTopOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); - Partition newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); partition->invalidate(); - parition = newPart; + partition = newPart; assert(partition->getNumRows() == clipped); if (clipped > 0) limitedTailPartitions.push_back(partition); @@ -2199,12 +2202,12 @@ namespace tuplex { } // merge the head and tail partitions - partitions.clear() + partitions.clear(); partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); } - Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, int numToSkip, TransformStage* tstage) { + Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { if(!numToSkip) return nullptr; @@ -2220,7 +2223,7 @@ namespace tuplex { size_t numBytesToSkip = 0; for(unsigned i = 0; i < numToSkip; ++i) { - Rows r = 
Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); + Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); ptr += r.serializedLength(); numBytesToSkip += r.serializedLength(); } @@ -2228,7 +2231,7 @@ namespace tuplex { auto ptr_out = p_out->lockRaw(); *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); - memcpy(ptr_out, ptr, p_in->size() - numBytesToSkip); + memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); p_out->unlock(); p_in->unlock(); diff --git a/tuplex/core/src/logical/TakeOperator.cc b/tuplex/core/src/logical/TakeOperator.cc index e588b5e97..49a4452b4 100644 --- a/tuplex/core/src/logical/TakeOperator.cc +++ b/tuplex/core/src/logical/TakeOperator.cc @@ -12,13 +12,13 @@ #include namespace tuplex { - TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom) : LogicalOperator::LogicalOperator(parent), _limitTop(numTop), _limitBottom(numBottom) { + TakeOperator::TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit) : LogicalOperator::LogicalOperator(parent), _topLimit(topLimit), _bottomLimit(bottomLimit) { // take schema from parent node setSchema(this->parent()->getOutputSchema()); } bool TakeOperator::good() const { - return _limitTop >= -1 && _limitBottom >= -1; + return _topLimit >= 0 && _bottomLimit >= 0; } std::vector TakeOperator::getSample(const size_t num) const { @@ -33,7 +33,7 @@ namespace tuplex { LogicalOperator *TakeOperator::clone() { // create clone of this operator - auto copy = new TakeOperator(parent()->clone(), _limitTop, _limitBottom); + auto copy = new TakeOperator(parent()->clone(), _topLimit, _bottomLimit); copy->setDataSet(getDataSet()); // weak ptr to old dataset... copy->copyMembers(this); diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index ff67e4add..f289064d5 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -382,7 +382,7 @@ namespace tuplex { // set limit if output node has a limit (currently only TakeOperator) if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); - builder.setOutputLimit(top->limit(), top->limitBottom()); + builder.setOutputLimit(top->topLimit(), top->bottomLimit()); } // @TODO: add slowPip builder to this process... 
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index e31e78cec..0f7bf7319 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -98,7 +98,7 @@ namespace tuplex { Partition *first = _partitions.front(); assert(_schema == first->schema()); - auto numRows = first->getNumRows() - first->getNumSkip(); + auto numRows = first->getNumRows(); _rowsRetrieved += numRows; _partitions.pop_front(); diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index bc814182b..78bc8dea4 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -458,7 +458,7 @@ namespace tuplex { } case LogicalOperatorType::TAKE: { auto takeOp = dynamic_cast(node); - opt_ops.push_back(new TakeOperator(lastParent, takeOp->limit(), takeOp->limitBottom())); + opt_ops.push_back(new TakeOperator(lastParent, takeOp->topLimit(), takeOp->bottomLimit())); opt_ops.back()->setID(node->getID()); break; } diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 853b910db..66f94e33f 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -130,7 +130,7 @@ namespace tuplex { std::shared_ptr rs; std::string err_message = ""; try { - rs = _dataset->take(numRows, ss); + rs = _dataset->take(numTop, numBottom, ss); if(!rs) throw std::runtime_error("invalid result set"); // if there are more than 1 million (100k in debug mode) elements print message... diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 1046505f2..850b4ed83 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -191,24 +191,25 @@ def collect(self): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' return self._dataSet.collect() - def take(self, nrows=5, nbottom=0): + def take(self, limitTop=5, limitBottom=0): """ action that generates a physical plan, processes data and collects the top results then as list of tuples. Args: - nrows (int): number of rows to collect. Per default ``5``. + limitTop (int): number of top rows to collect. Per default ``5``. + limitBottom (int): number of bottom rows to collect. Per default ``0``. Returns: (list): A list of tuples """ - assert isinstance(nrows, int), 'num rows must be an integer' - assert nrows > 0, 'please specify a number greater than zero' - assert isinstance(nbottom, int), 'num bottom last must be an integer' - assert nbottom >= 0, 'please specify a number greater or equal to zero' + assert isinstance(limitTop, int), 'num rows must be an integer' + assert limitTop > 0, 'please specify a number greater than zero' + assert isinstance(limitBottom, int), 'num bottom last must be an integer' + assert limitBottom >= 0, 'please specify a number greater or equal to zero' assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - return self._dataSet.take(nrows, nbottom) + return self._dataSet.take(limitTop, limitBottom) def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h index a5ca4058f..6c85d2d36 100644 --- a/tuplex/utils/include/mt/ITask.h +++ b/tuplex/utils/include/mt/ITask.h @@ -21,65 +21,65 @@ namespace tuplex { -/*! 
- * interface for defining tasks that can be run via a threadpool - */ -class ITask { -private: - std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. + /*! + * interface for defining tasks that can be run via a threadpool + */ + class ITask { + private: + std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. //! Per default object is constructed that does not represent a thread - std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages + std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages -public: - ITask() {}; - ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {} - virtual ~ITask() = default; - ITask(ITask&& other) = default; - ITask& operator = (ITask&& other) = default; + public: + ITask() {}; + ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {} + virtual ~ITask() = default; + ITask(ITask&& other) = default; + ITask& operator = (ITask&& other) = default; - /*! - * interface to run a task - */ - virtual void execute() = 0; + /*! + * interface to run a task + */ + virtual void execute() = 0; - std::thread::id getID() { - return _id; - } + std::thread::id getID() { + return _id; + } - void setID(const std::thread::id& id) { - _id = id; - } + void setID(const std::thread::id& id) { + _id = id; + } - void setOrder(size_t order) { _orderNumbers = std::vector{order}; } + void setOrder(size_t order) { _orderNumbers = std::vector{order}; } - size_t getOrder(const size_t nth) const { - return _orderNumbers[nth]; - } + size_t getOrder(size_t nth) const { + return _orderNumbers[nth]; + } - std::vector getOrder() const { return _orderNumbers; } + std::vector getOrder() const { return _orderNumbers; } - void setOrder(const std::vector& order) { - _orderNumbers = order; - } + void setOrder(const std::vector& order) { + _orderNumbers = order; + } - /*! - * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones - * @param other - * @return - */ - bool compareAscOrder(const ITask& other) const { - // make sure they have the same length - assert(_orderNumbers.size() == other._orderNumbers.size()); + /*! + * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones + * @param other + * @return + */ + bool compareAscOrder(const ITask& other) const { + // make sure they have the same length + assert(_orderNumbers.size() == other._orderNumbers.size()); - // this < other? - // compare one by one - for(int i = 0; i < other._orderNumbers.size(); ++i) { - if(_orderNumbers[i] >= other._orderNumbers[i]) - return false; + // this < other? 
+ // compare one by one + for(int i = 0; i < other._orderNumbers.size(); ++i) { + if(_orderNumbers[i] >= other._orderNumbers[i]) + return false; + } + return true; } - return true; - } -}; + }; } #endif //TUPLEX_ITASK_H \ No newline at end of file From 2cdd269c11dd4cd87d0958846cb2338b7c4e06c8 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:52:36 -0400 Subject: [PATCH 35/56] Debugging Tests --- tuplex/core/src/ee/local/LocalBackend.cc | 7 ++++++- tuplex/core/src/physical/ResultSet.cc | 3 +-- tuplex/test/core/TakeTest.cc | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 022d5a036..d51ef4523 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -884,7 +884,12 @@ namespace tuplex { // special case: skip stage, i.e. empty code and mem2mem if(tstage->code().empty() && !tstage->fileInputMode() && !tstage->fileOutputMode()) { auto pyObjects = inputExceptionsToPythonObjects(tstage->inputExceptions(), tstage->normalCaseInputSchema()); - tstage->setMemoryResult(tstage->inputPartitions(), std::vector{}, std::unordered_map(), pyObjects); + + auto output_par = tstage->inputPartitions(); + if (tstage->hasOutputLimit()) { + trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + } + tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects); pyObjects.clear(); // skip stage Logger::instance().defaultLogger().info("[Transform Stage] skipped stage " + std::to_string(tstage->number()) + " because there is nothing todo here."); diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0f7bf7319..bfd656dc8 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -138,8 +138,7 @@ namespace tuplex { auto num_rows = first->getNumRows(); // how many left to retrieve? auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); - if(num_to_retrieve_from_partition <= 0) - break; + assert(num_to_retrieve_from_partition >= 0); // make sure partition schema matches stored schema assert(_schema == first->schema()); diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 08b648f34..2d8f81f2f 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -122,4 +122,6 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v3[2].getString(0), "!"); EXPECT_EQ(v3[3].getString(0), "! :)"); EXPECT_EQ(v3[4].getString(0), "!"); -} \ No newline at end of file +} + +// TODO(march): test empty code when reusing partitions. 
This might not work if user reused dataset \ No newline at end of file From c203de49f1dcd75062703703c8778eae2bd768e7 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 6 Apr 2022 23:55:59 -0400 Subject: [PATCH 36/56] Change definition of take all --- tuplex/core/include/DataSet.h | 2 +- tuplex/core/include/logical/TakeOperator.h | 2 +- tuplex/core/include/physical/StageBuilder.h | 3 +-- tuplex/core/include/physical/TransformStage.h | 2 +- tuplex/core/include/physical/TransformTask.h | 2 -- tuplex/core/src/DataSet.cc | 14 +++++++------- tuplex/core/src/physical/TransformTask.cc | 3 +-- tuplex/python/include/PythonDataSet.h | 2 +- tuplex/python/src/PythonDataSet.cc | 17 ++++++++++++++--- 9 files changed, 27 insertions(+), 20 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index f6bb97f2c..86ca23b6a 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -128,7 +128,7 @@ namespace tuplex { * @param numRows how many rows to print, i.e. top numRows are printed.xs * @param os ostream where to print table to */ - virtual void show(const int64_t numRows = -1, std::ostream &os = std::cout); + virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); // named dataset management functions /*! diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index fe5f1b0f2..f3841236b 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -26,7 +26,7 @@ namespace tuplex { TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit); std::string name() override { - if(_topLimit == 0 && _bottomLimit == 0) + if(_topLimit == std::numeric_limits::max() || _bottomLimit == std::numeric_limits::max()) return "collect"; return "take"; } diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index 83e63208a..1c322b9a6 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -158,9 +158,8 @@ namespace tuplex { size_t number() const { return _stageNumber; } int64_t outputDataSetID() const; - // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything inline bool hasOutputLimit() const { - return _outputTopLimit != 0 || _outputBottomLimit != 0; + return _outputTopLimit != std::numeric_limits::max() && _outputBottomLimit != std::numeric_limits::max(); } inline char csvOutputDelimiter() const { diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index 05c7df448..e1e45c97b 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -395,7 +395,7 @@ namespace tuplex { // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything bool hasOutputLimit() const { - return _outputTopLimit != 0 || _outputBottomLimit != 0; + return _outputTopLimit != std::numeric_limits::max() && _outputBottomLimit != std::numeric_limits::max(); } private: /*! 
diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index d065e86d3..e2b8bc5b6 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -184,7 +184,6 @@ namespace tuplex { void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } - void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; } void execute() override; bool hasFileSink() const { return _outputFilePath != URI::INVALID; } @@ -281,7 +280,6 @@ namespace tuplex { size_t _outTopLimit; // limits how many rows to write at max size_t _outBottomLimit; // limits how many last rows to write at max - size_t _outSkipRows; // how many rows at start to skip // memory source variables std::vector _inputPartitions; diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index c11482f86..d54edb567 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -38,7 +38,7 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(0, 0, os); + return take(std::numeric_limits::max(), 0, os); } std::shared_ptr DataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { @@ -62,18 +62,14 @@ namespace tuplex { // collect functions std::vector DataSet::collectAsVector(std::ostream &os) { - return takeAsVector(-1, os); + return takeAsVector(std::numeric_limits::max(), os); } - // -1 means to retrieve all elements std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { auto rs = take(numElements, false, os); Timer timer; #warning "limiting should make this hack irrelevant..." - if (numElements < 0) - numElements = std::numeric_limits::max(); - // std::vector v; // while (rs->hasNextRow() && v.size() < numElements) { // v.push_back(rs->getNextRow()); @@ -730,10 +726,14 @@ namespace tuplex { } - void DataSet::show(const int64_t numRows, std::ostream &os) { + void DataSet::show(int64_t numRows, std::ostream &os) { assert(_context); // get rows + if (numRows < 0) { + numRows = std::numeric_limits::max(); + } + auto rows = takeAsVector(numRows, os); if (rows.empty()) { return; diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 49d104bcc..377385deb 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -509,7 +509,6 @@ namespace tuplex { _outPrefix.reset(); _outTopLimit = std::numeric_limits::max(); // write all rows _outBottomLimit = 0; - _outSkipRows = 0; // skip no rows // reset memory sink _output.reset(); @@ -705,7 +704,7 @@ namespace tuplex { // skip rows? limit rows?? 
- if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outTopLimit - _outSkipRows)) { + if(_numOutputRowsWritten < _outTopLimit) { if(_outFile->write(buf, bufSize) != VirtualFileSystemStatus::VFS_OK) return ecToI32(ExceptionCode::IOERROR); } diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 23b09314d..ede482d9c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -77,7 +77,7 @@ namespace tuplex { PythonDataSet resolve(const int64_t exceptionCode, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); py::object collect(); - py::object take(const int64_t numTop, const int64_t numBottom); + py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); // DataFrame like operations diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 66f94e33f..f6079a143 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -107,7 +107,7 @@ namespace tuplex { } } - py::object PythonDataSet::take(const int64_t numTop, const int64_t numBottom) { + py::object PythonDataSet::take(const int64_t topLimit, const int64_t bottomLimit) { // make sure a dataset is wrapped assert(this->_dataset); @@ -129,8 +129,19 @@ namespace tuplex { std::shared_ptr rs; std::string err_message = ""; + + size_t castedTopLimit = 0; + if (topLimit < 0) { + castedTopLimit = std::numeric_limits::max(); + } + + size_t castedBottomLimit = 0; + if (bottomLimit < 0) { + castedBottomLimit = std::numeric_limits::max(); + } + try { - rs = _dataset->take(numTop, numBottom, ss); + rs = _dataset->take(castedTopLimit, castedBottomLimit, ss); if(!rs) throw std::runtime_error("invalid result set"); // if there are more than 1 million (100k in debug mode) elements print message... 
@@ -162,7 +173,7 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; // build python list object from resultset - auto listObj = resultSetToCPython(rs.get(), numTop); + auto listObj = resultSetToCPython(rs.get(), castedTopLimit); Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); // Logger::instance().flushAll(); From 664cd14a5ef0b1d2bba2723b4fc914a395d1765e Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 6 Apr 2022 23:56:59 -0400 Subject: [PATCH 37/56] Random take test with some debugging --- tuplex/core/src/ee/local/LocalBackend.cc | 35 ++++----- tuplex/core/src/physical/TransformStage.cc | 6 -- tuplex/test/core/TakeTest.cc | 87 +++++++++++++++++++++- 3 files changed, 101 insertions(+), 27 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index d51ef4523..0b8157ecc 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -2143,7 +2143,7 @@ namespace tuplex { clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); assert(clippedTop <= partition->getNumRows()); break; - } else if (partition == *partitions.end()) { + } else if (partition == partitions.back()) { // last partition, mark full row, but don't put to output set yet to avoid double put clippedTop = partition->getNumRows(); break; @@ -2171,14 +2171,14 @@ namespace tuplex { break; } else if (numBottomOutputRows >= bottomLimit) { // clip last partition & leave loop - auto clipped = bottomLimit - (numTopOutputRows - partition->getNumRows()); + auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); - Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + if (clipped > 0) { + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + assert(newPart->getNumRows() == clipped); + limitedTailPartitions.push_back(newPart); + } partition->invalidate(); - partition = newPart; - assert(partition->getNumRows() == clipped); - if (clipped > 0) - limitedTailPartitions.push_back(partition); break; } else { // put full partition to output set @@ -2197,9 +2197,12 @@ namespace tuplex { lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage); } - lastTopPart->setNumRows(clippedTop); - - limitedPartitions.push_back(lastTopPart); + if (clippedTop != 0) { + lastTopPart->setNumRows(clippedTop); + limitedPartitions.push_back(lastTopPart); + } else { + lastTopPart->invalidate(); + } if (lastBottomPart != nullptr) { limitedPartitions.push_back(lastBottomPart); @@ -2213,17 +2216,10 @@ namespace tuplex { } Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { - if(!numToSkip) - return nullptr; - auto ptr = p_in->lockRaw(); auto num_rows = *((int64_t*) ptr); assert(numToSkip < num_rows); - Partition *p_out = _driver->allocWritablePartition(num_rows - numToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); - ptr += sizeof(int64_t); size_t numBytesToSkip = 0; @@ -2233,6 +2229,11 @@ namespace tuplex { numBytesToSkip += r.serializedLength(); } + Partition *p_out = _driver->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); 
+ assert(p_out->capacity() >= p_in->size() - numBytesToSkip); + auto ptr_out = p_out->lockRaw(); *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 6e216ac5c..060365697 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -133,15 +133,9 @@ namespace tuplex { auto schema = Schema::UNKNOWN; if(!partitions.empty()) { - size_t totalRowsCount = 0; schema = partitions.front()->schema(); for (auto partition : partitions) { assert(schema == partition->schema()); - totalRowsCount += partition->getNumRows(); - } - - if (hasOutputLimit()) { - assert(totalRowsCount == _outputTopLimit + _outputBottomLimit); } } diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 2d8f81f2f..3990fcd07 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -8,13 +8,51 @@ // License: Apache 2.0 // //--------------------------------------------------------------------------------------------------------------------// +#include + #include #include "TestUtils.h" +using namespace tuplex; +using namespace std; + class TakeTest : public PyTest {}; +/** + * Randomly generate a vector of rows for testing + * @param N the size of vector + * @return a vector of size N, containing the random data + */ +vector generateTestData(size_t N, uint64_t seed) { + mt19937 gen(seed); //Standard mersenne_twister_engine seeded with rd() + uniform_int_distribution<> distrib(1, 100000000); + + vector data; + data.reserve(N); + + for (int i = 0; i < N; i++) { + data.emplace_back(distrib(gen), distrib(gen), distrib(gen)); + } + + return data; +} + +vector generateReferenceData(const vector& input, size_t topLimit, size_t bottomLimit) { + vector output; + for(size_t i = 0; i < topLimit && i < input.size(); i++) { + output.push_back(input[i]); + } + size_t start_bottom = input.size() >= bottomLimit ? input.size() - bottomLimit : 0; + start_bottom = max(topLimit, start_bottom); + + for(size_t i = start_bottom; i < input.size(); i++) { + output.push_back(input[i]); + } + + return output; +} + TEST_F(TakeTest, takeTopTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -51,7 +89,6 @@ TEST_F(TakeTest, takeTopTest) { } TEST_F(TakeTest, takeBottomTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -88,7 +125,6 @@ TEST_F(TakeTest, takeBottomTest) { } TEST_F(TakeTest, takeBothTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -124,4 +160,47 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v3[4].getString(0), "!"); } -// TODO(march): test empty code when reusing partitions. 
This might not work if user reused dataset \ No newline at end of file +TEST_F(TakeTest, takeBigTest) { + mt19937 data_seed_gen(4242); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + for(auto& part_size : partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for(auto data_size : test_size) { + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto data = generateTestData(data_size, data_seed_gen()); + auto ref_data = generateReferenceData(data, top_limit, bottom_limit); + + auto res = context.parallelize(data).take(top_limit, bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} + +// TODO(march): with map, filter function +//TEST_F(TakeTest, takeMapFilterTest) { +// srand(4242); +//} + +// TODO(march): with file input +// context.csv("../resources/"); + +// TODO(march): collect operator \ No newline at end of file From 5048a9b81d35d5c97b14f2970e6316bc23b575a5 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 7 Apr 2022 23:28:54 -0400 Subject: [PATCH 38/56] Polish the python interface --- tuplex/core/include/DataSet.h | 8 ++ tuplex/core/src/DataSet.cc | 109 +++++++++++++++++++++++++ tuplex/python/include/PythonDataSet.h | 1 + tuplex/python/src/PythonBindings.cc | 1 + tuplex/python/src/PythonDataSet.cc | 49 +++++++++++ tuplex/python/tuplex/dataset.py | 112 +++++++------------------- 6 files changed, 196 insertions(+), 84 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 86ca23b6a..9510427e2 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -130,6 +130,14 @@ namespace tuplex { */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); + /*! + * Displays a formatted HTML table of a small portion of the data + * @param topLimit how many top rows to print + * @param bottomLimit how many bottom rows to print + * @param os ostream where to print table to + */ + virtual void showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); + // named dataset management functions /*! * map Column using a UDF diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index d54edb567..210b3ec60 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -756,6 +756,115 @@ namespace tuplex { printTable(os, headers, rows); } + void printHTMLRow(std::ostream &os, size_t ind, const Row& r) { + os << " \n"; + os << fmt::format(" {}\n", ind); + for (auto& s : r.getAsStrings()) { + os << fmt::format(" {}\n", s); + } + os << " \n"; + } + + void DataSet::showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os) { + std::string HTML_TEMPLATE = + "
\n" + "\n" + "\n" + " \n" + " \n" + "{}" + " \n" + " \n" + " \n" + "{}" + " \n" + "
\n" + "

{} columns

\n" + "
"; + + assert(_context); + + auto rows = take(topLimit, bottomLimit); + + if (rows->rowCount() == 0) { + os << fmt::format(HTML_TEMPLATE, "\n", "\n", 0); + return; + } + + std::stringstream headers_stream, body_stream; + size_t numColumns = 0; + assert(rows->rowCount() <= topLimit + bottomLimit); + + // construct tables + if (rows->rowCount() < topLimit + bottomLimit) { + // the data is small so we get everything (no need to render ...) + for (size_t i = 0; rows->hasNextRow(); i++) { + Row r = rows->getNextRow(); + if (i == 0) { + // we set num columns based on the first row + numColumns = r.getNumColumns(); + } + + printHTMLRow(body_stream, i, r); + } + } else { + // some data is not processed because of limiting + size_t i; + for (i = 0; rows->hasNextRow() && i < topLimit; i++) { + Row r = rows->getNextRow(); + if (i == 0) { + // we set num columns based on the first row + numColumns = r.getNumColumns(); + } + + printHTMLRow(body_stream, i, r); + } + + // add the ... + body_stream << " \n"; + body_stream << " ...\n"; + for(int j = 0; j < numColumns; j++) { + body_stream << " ...\n"; + body_stream << " \n"; + } + + while (rows->hasNextRow()) { + Row r = rows->getNextRow(); + printHTMLRow(body_stream, i, r); + } + } + + assert(numColumns != 0); + + // construct headers + std::vector headers(numColumns); + if (!_columnNames.empty()) { + assert(numColumns == _columnNames.size()); + for (auto &c_name: _columnNames) { + headers_stream << fmt::format(" {}\n", c_name); + } + } else { + // default to generic name if column name doesn't exist + for (int i = 0; i < numColumns; ++i) { + headers_stream << fmt::format(" Column {}\n", i); + } + } + + os << fmt::format(HTML_TEMPLATE, headers_stream.str(), body_stream.str(), numColumns); + } + Schema DataSet::schema() const { if(!_operator) return Schema::UNKNOWN; diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index ede482d9c..4761ac7f0 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -79,6 +79,7 @@ namespace tuplex { py::object collect(); py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); + std::string showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit); // DataFrame like operations PythonDataSet mapColumn(const std::string& column, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 6b3683853..ab239a1a2 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -41,6 +41,7 @@ PYMODULE { py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) + .def("showHTMLPreview", &tuplex::PythonDataSet::showHTMLPreview) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) .def("map", &tuplex::PythonDataSet::map) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index f6079a143..1f543e5d2 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -880,6 +880,55 @@ namespace tuplex { } } + std::string PythonDataSet::showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit) { + // make sure a dataset is wrapped + assert(this->_dataset); + + // is callee error dataset? 
if so return list with error string + if (this->_dataset->isError()) { + auto errset = dynamic_cast(this->_dataset); + assert(errset); + return "Error: " + errset->getError(); + } else { + // release GIL & hand over everything to Tuplex + assert(PyGILState_Check()); // make sure this thread holds the GIL! + python::unlockGIL(); + + std::stringstream ss; + std::string err_message; + + size_t castedTopLimit = 0; + if (topLimit < 0) { + castedTopLimit = std::numeric_limits::max(); + } + + size_t castedBottomLimit = 0; + if (bottomLimit < 0) { + castedBottomLimit = std::numeric_limits::max(); + } + + try { + this->_dataset->showHTMLPreview(castedTopLimit, castedBottomLimit, ss); + } catch (const std::exception &e) { + err_message = e.what(); + Logger::instance().defaultLogger().error(err_message); + } catch (...) { + err_message = "unknown C++ exception occurred, please change type."; + Logger::instance().defaultLogger().error(err_message); + } + + // reacquire GIL + python::lockGIL(); + Logger::instance().flushToPython(); + + if (!ss.str().empty() && err_message.empty()) { + return ss.str(); + } else { + return "Error occurred: " + err_message; + } + } + } + PyObject* PythonDataSet::anyToCPythonWithPyObjects(ResultSet* rs, size_t maxRowCount) { assert(rs); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 850b4ed83..4d02cf4d4 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -28,87 +28,8 @@ class DataSet: def __init__(self): self._dataSet = None - def getColumnSize(self): - data = self.collect() - if len(data) == 0: - return 0, 0 - else: - return len(data[0]) - - def revTake(self, nRows = 5): - return self.collect()[-nRows:] - def _repr_html_(self): - rows_list = self.take() - total_col_cnt = self.getColumnSize() - print('rowlist') - print(rows_list) - if len(rows_list) == 0: - header = '\n' - rows = '\n' - else: - header = '\n' - - if self.columns != None: - for x in self.columns: - header += f' {x}\n' - else: - for i in range(len(rows_list[0])): - header += f' column {i + 1}\n' - - rows = '' - for i, r in enumerate(rows_list): - rows += ' \n' - rows += f' {i}\n' - for data in r: - rows += f' {data}\n' - rows += ' \n' - - # add the ... - rows += ' \n' - rows += ' ...\n' - for i in range(total_col_cnt): - rows += ' ...\n' - rows += ' \n' - - lastData = self.revTake() - for i, r in enumerate(lastData): - rows += ' \n' - rows += f' {0 - len(lastData) + i}\n' - for data in r: - rows += f' {data}\n' - rows += ' \n' - - html_template = ( - '
\n' - '\n' - '\n' - ' \n' - ' \n' - f'{header}' - ' \n' - ' \n' - ' \n' - f'{rows}' - ' \n' - '
\n' - f'

{total_col_cnt} columns

\n' - '
' - ) - - return html_template + return self._dataSet.showHTMLPreview() def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. @@ -201,11 +122,14 @@ def take(self, limitTop=5, limitBottom=0): (list): A list of tuples """ + assert limitTop is None or isinstance(limitTop, int), 'num rows must be an integer or None' + assert limitBottom is None or isinstance(limitBottom, int), 'num bottom last must be an integer or None' - assert isinstance(limitTop, int), 'num rows must be an integer' - assert limitTop > 0, 'please specify a number greater than zero' - assert isinstance(limitBottom, int), 'num bottom last must be an integer' - assert limitBottom >= 0, 'please specify a number greater or equal to zero' + if limitTop is None or limitTop < 0: + limitTop = -1 + + if limitBottom is None or limitBottom < 0: + limitBottom = -1 assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' @@ -227,6 +151,26 @@ def show(self, nrows=None): self._dataSet.show(nrows) + def showHTMLPreview(self, topLimit=5, bottomLimit=5): + """ action that generates a physical plan, processes data and return a subset of results as nicely formatted + HTML table to stdout. + + Args: + topLimit (int): number of top rows to collect. If ``None`` all rows will be collected + bottomLimit (int): number of bottom rows to collect. If ``None`` all rows will be collected + + Returns: + string: an HTML table showing a preview of the data + """ + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + if topLimit is None or topLimit < 0: + topLimit = -1 + if bottomLimit is None or bottomLimit < 0: + bottomLimit = -1 + + return self._dataSet.showHTMLPreview(topLimit, bottomLimit) + def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. From 3658d84f0a20ff46f9b8bc81f1ba3c405638136a Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 8 Apr 2022 01:24:03 -0400 Subject: [PATCH 39/56] Address PR comments Add two more testcases Address PR feedbacks --- tuplex/core/include/DataSet.h | 35 +++- tuplex/core/include/EmptyDataset.h | 14 +- tuplex/core/include/LocalEngine.h | 59 ++++-- tuplex/core/include/ee/IBackend.h | 15 +- tuplex/core/include/ee/local/LocalBackend.h | 36 +++- tuplex/core/include/physical/TransformStage.h | 6 +- tuplex/core/include/physical/TransformTask.h | 41 +++- tuplex/core/src/DataSet.cc | 1 - tuplex/core/src/LocalEngine.cc | 38 +++- tuplex/core/src/ee/local/LocalBackend.cc | 72 ++++--- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 12 +- tuplex/core/src/physical/TransformStage.cc | 9 +- tuplex/core/src/physical/TransformTask.cc | 86 ++++---- tuplex/python/tuplex/dataset.py | 128 +++++++++++- tuplex/test/core/ContextBasics.cc | 56 +++++- tuplex/test/core/ResultSetTest.cc | 5 +- tuplex/test/core/TakeTest.cc | 185 +++++++++++++++--- 18 files changed, 643 insertions(+), 157 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 9510427e2..3a5f450ac 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -125,7 +125,7 @@ namespace tuplex { /*! * action that displays tuples as nicely formatted table - * @param numRows how many rows to print, i.e. top numRows are printed.xs + * @param numRows how many rows to print, i.e. 
top numRows are printed.xs, -1 means print all rows * @param os ostream where to print table to */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); @@ -260,21 +260,48 @@ namespace tuplex { * @param memoryLayout * @return */ - virtual DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized); - DataSet& cache(bool storeSpecialized=true) { return cache(Schema::MemoryLayout::ROW, storeSpecialized); } + virtual DataSet &cache(const Schema::MemoryLayout &memoryLayout, bool storeSpecialized); + + DataSet &cache(bool storeSpecialized = true) { return cache(Schema::MemoryLayout::ROW, storeSpecialized); } /*! * helper setter without checks, to update internal column names. */ void setColumns(const std::vector &columnNames) { _columnNames = columnNames; } - // these are actions that cause execution + /*! + * Execute the pipeline and return all outputs + * @param os the logging output + * @return the output of the execution + */ virtual std::shared_ptr collect(std::ostream &os = std::cout); + /*! + * Execute the pipeline and take a subset of the output from the top and bottom rows. + * If both top and bottom rows limit exist, then the top and bottom rows will be concatenated. + * In the case where topLimit + bottomLimit exceeds the output size, all rows will be taken. + * To take all rows, pass in either topLimit=size_t::max(), bottomLimit=size_t::max(), or both. + * @param topLimit number of top rows to take. size_t::max() means taking all rows + * @param bottomLimit number of bottom rows to take. size_t::max() means taking all rows + * @param os the logging output + * @return result of the execution, trim to the size of top and bottom limit. + */ virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); + /*! + * Execute the pipeline and return all outputs as vector + * @param os the logging output + * @return the output of the execution in vector + */ virtual std::vector collectAsVector(std::ostream &os = std::cout); + /*! + * Execute the pipeline and take a subset of the output from the top rows, return as vector + * In the case where numElements exceeds the output size, all rows will be taken. + * @param numElements number of top rows to take. size_t::max() means taking all rows + * @param os the logging output + * @return result of the execution in vector, trim to the size of numElements + */ virtual std::vector takeAsVector(size_t numElements, std::ostream &os = std::cout); /*! 
diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 6fc3219a4..585b70881 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -67,18 +67,20 @@ namespace tuplex { virtual DataSet& aggregateByKey(const UDF& aggCombine, const UDF& aggUDF, const Row& aggInitial, const std::vector &keyColumns) override { return *this; } //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; - virtual std::shared_ptr collect(std::ostream& os) override; + virtual std::shared_ptr collect(std::ostream &os) override; // take / collect will print out the error only - virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; - virtual std::vector collectAsVector(std::ostream& os) override; + virtual std::vector collectAsVector(std::ostream &os) override; - // take / collect will print out the error only - virtual std::vector takeAsVector(size_t numElements, std::ostream& os) override; + /*! + * take / collect will print out the error only, return empty rows + */ + virtual std::vector takeAsVector(size_t numElements, std::ostream &os) override; - DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized) override { + DataSet &cache(const Schema::MemoryLayout &memoryLayout, bool storeSpecialized) override { return *this; } }; diff --git a/tuplex/core/include/LocalEngine.h b/tuplex/core/include/LocalEngine.h index 66ed3a1e8..740a40b4d 100644 --- a/tuplex/core/include/LocalEngine.h +++ b/tuplex/core/include/LocalEngine.h @@ -16,7 +16,28 @@ #include #include "RESTInterface.h" + namespace tuplex { + struct ExecutorConfig { + size_t _size; // size in bytes that each executor should have + size_t _blockSize; // size of individual blocks used (can be used for coarse or fine grained parallelism) + size_t _runTimeMemory; + size_t _runTimeMemoryDefaultBlockSize; + URI _cache_path; + + bool operator==(const ExecutorConfig &rhs) const { + return _size == rhs._size && + _blockSize == rhs._blockSize && + _runTimeMemory == rhs._runTimeMemory && + _runTimeMemoryDefaultBlockSize == rhs._runTimeMemoryDefaultBlockSize && + _cache_path == rhs._cache_path; + } + + bool operator!=(const ExecutorConfig &rhs) const { + return !(rhs == *this); + } + }; + /*! * local execution engine. Provides local executors for a context * THIS IS NOT THREADSAFE. Should be only accessed by driver thread. @@ -25,16 +46,18 @@ namespace tuplex { private: // non-detached executor that serves as the driver - std::unique_ptr _driver; + std::shared_ptr _driver; + ExecutorConfig _driver_cfg; std::vector> _executors; - std::map _refCounts; //! reference counts for each executor + std::map _refCounts; //! 
reference counts for each executor + + LocalEngine(const LocalEngine &); - LocalEngine(const LocalEngine&); - void operator = (const LocalEngine&); + void operator=(const LocalEngine &); // The local task queue - WorkQueue _queue; + WorkQueue _queue; protected: LocalEngine(); @@ -63,25 +86,25 @@ namespace tuplex { * @param cache_path directory where subfolders will be created for all executors to be started * @return array of executor references */ - std::vector getExecutors(const size_t num, - const size_t size, - const size_t blockSize, - const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, - const URI& cache_path); + std::vector getExecutors(const size_t num, + const size_t size, + const size_t blockSize, + const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, + const URI &cache_path); /*! * releases executors (invoked by context) * @param executors * @param ctx */ - void freeExecutors(const std::vector& executors, const Context* ctx=nullptr); + void freeExecutors(const std::vector &executors, const Context *ctx = nullptr); - Executor* getDriver(const size_t size, - const size_t blockSize, - const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, - const URI& cache_path); + std::shared_ptr getDriver(const size_t size, + const size_t blockSize, + const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, + const URI &cache_path); void release(); @@ -89,7 +112,7 @@ namespace tuplex { * retrieves the global work queue for local executors * @return */ - WorkQueue& getQueue() { return _queue; } + WorkQueue &getQueue() { return _queue; } }; } #endif //TUPLEX_LOCALENGINE_H \ No newline at end of file diff --git a/tuplex/core/include/ee/IBackend.h b/tuplex/core/include/ee/IBackend.h index e7a80e5bb..1a543df8f 100644 --- a/tuplex/core/include/ee/IBackend.h +++ b/tuplex/core/include/ee/IBackend.h @@ -29,19 +29,22 @@ namespace tuplex { class IBackend { public: IBackend() = delete; - IBackend(const IBackend& other) = delete; - IBackend(const Context& context) : _context(context) {} + + IBackend(const IBackend &other) = delete; + + IBackend(const Context &context) : _context(context) {} // driver, i.e. where to store local data. - virtual Executor* driver() = 0; - virtual void execute(PhysicalStage* stage) = 0; + virtual Executor *driver() = 0; + + virtual void execute(PhysicalStage *stage) = 0; virtual ~IBackend() {} // virtual destructor needed b.c. of smart pointers - virtual const Context& context() const { return _context; } + virtual const Context &context() const { return _context; } private: - const Context& _context; + const Context &_context; }; inline std::unordered_map, size_t> merge_ecounts(std::unordered_map, size_t> lhs, diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index d7a5ec25b..3d73a5d9f 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -40,14 +40,15 @@ namespace tuplex { * constructor for convenience * @param context */ - explicit LocalBackend(const Context& context); + explicit LocalBackend(const Context &context); - Executor* driver() override; // for local execution + Executor *driver() override; // for local execution + + void execute(PhysicalStage *stage) override; - void execute(PhysicalStage* stage) override; private: - Executor *_driver; //! driver from local backend... - std::vector _executors; //! drivers to be used + std::shared_ptr _driver; //! driver from local backend... 
+ std::vector _executors; //! drivers to be used std::unique_ptr _compiler; HistoryServerConnection _historyConn; @@ -88,9 +89,6 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } - void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); - Partition* newPartitionWithSkipRows(Partition* p_in, size_t numToSkip, TransformStage* tstage); - // write output (may be already in correct format!) void writeOutput(TransformStage* tstage, std::vector& sortedTasks); @@ -187,6 +185,28 @@ namespace tuplex { * @return */ extern URI outputURI(const UDF& udf, const URI& baseURI, int64_t partNo, FileFormat fmt); + + /*! + * Trim list of partitions so that it includes up to the first n rows and the last m rows + * if n + m > number of rows in input partitions, the partitions will remain unchanged + * @param partitions [in,out] the list of partitions to trim + * @param topLimit n, the number of top rows to include + * @param bottomLimit m, the number of bottom rows to include + * @param tstage pointer to transform stage, might be used to generate new partition + * @param exec pointer to executor, might be used to allocate new partition + */ + extern void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, + TransformStage *tstage, Executor *exec); + + /*! + * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed + * @param p_in the input partition + * @param numToSkip number of rows to remove from the new partition + * @param tstage pointer to transform stage, used to generate new partition + * @param exec pointer to executor, used to allocate new partition + * @return the new partition + */ + extern Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec); } #endif //TUPLEX_LOCALBACKEND_H \ No newline at end of file diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index e1e45c97b..ff4ece1dd 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -107,9 +107,9 @@ namespace tuplex { std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } /*! - * sets maximum number of top rows this pipeline will produce - * @param topLimit - * @param bottomLimit + * sets maximum number of rows this pipeline will produce + * @param topLimit number of top rows to produce, 0 means none, and size_t::max means everything + * @param bottomLimit number of bottom rows to produce, 0 means none, and size_t::max means everything */ inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { _outputTopLimit = topLimit; diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index e2b8bc5b6..8ac5ba6df 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -180,15 +180,32 @@ namespace tuplex { void setOutputPrefix(const char* buf, size_t bufSize); // extra prefix to write first to output. void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); + HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! 
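        // [Editorial sketch, not part of the original patch] How the limit API below is
        // meant to be driven by the stage executor; the names match this patch, while the
        // surrounding wiring is elided:
        //
        //     // once per stage, before any task runs:
        //     TransformTask::setMaxOrderAndResetLimits(tasks.size() - 1);
        //     // on every task, e.g. for DataSet.take(5, 5):
        //     task->setOutputTopLimit(5);
        //     task->setOutputBottomLimit(5);
        //     // tasks then short-circuit via limitReached() once both ends are covered.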
-        void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); }
-        void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); }
+        void setOutputTopLimit(size_t limit) {
+            _outTopLimit = limit;
+        }
+
+        void setOutputBottomLimit(size_t limit) {
+            _outBottomLimit = limit;
+        }
+
+        /*!
+         * Set the maximum task order number that the current stage executes and reset the row counter.
+         * This is used to detect that the row limits have been reached and to stop execution early.
+         * @param maxOrder maximum task order number in the pipeline; infinity (the default) disables the bottomLimit short circuit
+         */
+        static void setMaxOrderAndResetLimits(size_t maxOrder = std::numeric_limits<size_t>::max());
+
         void execute() override;
 
         bool hasFileSink() const { return _outputFilePath != URI::INVALID; }
+
         bool hasFileSource() const { return _inputFilePath != URI::INVALID; }
+
         bool hasMemorySink() const { return _outputSchema != Schema::UNKNOWN; }
+
         bool hasMemorySource() const { return !_inputPartitions.empty(); }
 
         bool hasHashTableSink() const { return _htableFormat != HashTableFormat::UNKNOWN; }
         HashTableFormat hashTableFormat() const { return _htableFormat; }
@@ -207,8 +224,6 @@ namespace tuplex {
         static codegen::i64_hash_row_f writeInt64HashTableAggregateCallback();
         static codegen::write_row_f aggCombineCallback();
 
-        static void resetOutputLimitCounter();
-
         // must be public because of C++ issues -.-
         int64_t writeRowToMemory(uint8_t* buf, int64_t bufSize);
         int64_t writeRowToFile(uint8_t* buf, int64_t bufSize);
@@ -310,12 +325,26 @@ namespace tuplex {
         inline int64_t contextID() const { return _contextID; }
 
         inline void unlockAllMemorySinks() {
             // output partition existing? if so unlock
-            _output.unlock();
-            _exceptions.unlock();
+            _output.unlock();
+            _exceptions.unlock();
         }
 
+        /*!
+         * Check whether the stage has reached both the top and the bottom limit. To use this, one must call
+         * setMaxOrderAndResetLimits before execution and set both the top and the bottom limit.
+         * @return true if both limits are reached
+         */
+        bool limitReached() const;
+
+        /*!
+         * Update the global stage limit counter; should only be called once, at the end of a task
+         */
+        void updateLimits();
+
         void processMemorySourceWithExp();
+
         void processMemorySource();
+
         void processFileSource();
 
         // exceptions
diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc
index 210b3ec60..b62946ae4 100644
--- a/tuplex/core/src/DataSet.cc
+++ b/tuplex/core/src/DataSet.cc
@@ -849,7 +849,6 @@ namespace tuplex {
         assert(numColumns != 0);
 
         // construct headers
-        std::vector<std::string> headers(numColumns);
         if (!_columnNames.empty()) {
             assert(numColumns == _columnNames.size());
             for (auto &c_name: _columnNames) {
diff --git a/tuplex/core/src/LocalEngine.cc b/tuplex/core/src/LocalEngine.cc
index 02c060a90..c9c6d506b 100644
--- a/tuplex/core/src/LocalEngine.cc
+++ b/tuplex/core/src/LocalEngine.cc
@@ -98,7 +98,8 @@ namespace tuplex {
             exec->processQueue(true);
 
             std::stringstream ss;
-            ss<<"started local executor "<<exec->name()<<" ("<<sizeToMemString(size)<<")";
+            ss << "started local executor " << exec->name() << " (" << sizeToMemString(size) << ", "
+               << sizeToMemString(blockSize) << " default partition size)";
             logger.info(ss.str());
         }
 
@@ -107,23 +108,44 @@ namespace tuplex {
         return execs;
     }
 
-    Executor* LocalEngine::getDriver(const size_t size, const size_t blockSize, const size_t runTimeMemory,
-                                     const size_t runTimeMemoryDefaultBlockSize, const tuplex::URI &cache_path) {
-        // lazy start driver
-        if(!_driver) {
+    std::shared_ptr<Executor>
+    LocalEngine::getDriver(const size_t size, const size_t blockSize, const size_t runTimeMemory,
+                           const size_t runTimeMemoryDefaultBlockSize, const tuplex::URI &cache_path) {
+        ExecutorConfig new_cfg = ExecutorConfig{
+                ._size = size,
+                ._blockSize = blockSize,
+                ._runTimeMemory = runTimeMemory,
+                ._runTimeMemoryDefaultBlockSize = runTimeMemoryDefaultBlockSize,
+                ._cache_path = cache_path
+        };
+
+        if (!_driver || _driver_cfg != new_cfg) {
+            if (_driver) {
+                Logger::instance().logger("local execution engine").info(
+                        "driver already exist, starting new driver with updated config");
+                _driver->release(); // TODO(march): test whether we need this
+            }
+
+            // lazy start driver
             URI uri = URI(cache_path.toString() + "/" + "driver");
-            _driver = std::make_unique<Executor>(size, blockSize, runTimeMemory, runTimeMemoryDefaultBlockSize, uri, "driver");
+            _driver = std::make_shared<Executor>(size, blockSize, runTimeMemory, runTimeMemoryDefaultBlockSize, uri,
+                                                 "driver");
+            _driver_cfg = new_cfg;
 
+            // TODO(march): this could be a problem, if multiple driver with number = 0
+            // TODO(march): write a test for two drivers existing together (thread number 0)
+            // TODO(march): make a comment about potential issue here
             // driver always has thread number 0!
             _driver->setThreadNumber(0);
 
             std::stringstream ss;
-            ss<<"started driver ("<<sizeToMemString(size)<<")";
+            ss << "started driver (" << sizeToMemString(size) << ", "
+               << sizeToMemString(blockSize) << " default partition size)";
             Logger::instance().logger("local execution engine").info(ss.str());
         }
 
         return _driver;
     }
 
     void LocalEngine::freeExecutors(const std::vector<Executor *> &executors, const Context *ctx) {
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc
index 0b8157ecc..351d55b88 100644
--- a/tuplex/core/src/ee/local/LocalBackend.cc
+++ b/tuplex/core/src/ee/local/LocalBackend.cc
@@ -116,14 +116,14 @@ namespace tuplex {
     }
 
     Executor *LocalBackend::driver() {
-        assert(_driver);
-        return _driver;
+        assert(_driver);
+        return _driver.get();
     }
 
     void LocalBackend::execute(tuplex::PhysicalStage *stage) {
         assert(stage);
 
-        if(!stage)
+        if (!stage)
             return;
 
         // history server connection should be established
@@ -651,7 +651,6 @@ namespace tuplex {
         // --> issue for each memory partition a transform task and put it into local workqueue
         assert(tstage->inputMode() == EndPointMode::MEMORY);
 
-        // restrict after input limit
         size_t numInputRows = 0;
         auto inputPartitions = tstage->inputPartitions();
@@ -702,9 +701,17 @@ namespace tuplex {
             tasks[i]->setOrder(i);
         }
 
+        TransformTask::setMaxOrderAndResetLimits(tasks.size() - 1);
+
         if (tstage->hasOutputLimit()) {
+            // There are 3 possible cases here:
+            // 1. both top and bottom limit
+            // 2. only top limit
+            // 3. only bottom limit
             if (tstage->outputTopLimit() > 0 && tstage->outputBottomLimit() > 0) {
-                // do task striping for output limit on both ends
+                // case 1: do task striping for output limit on both ends.
+                // We execute in striping order instead of ascending or descending order;
+                // this is an optimization for small limits, to avoid executing all partitions.
                 vector<IExecutorTask*> newTasks;
                 for(size_t i = 0; i < tasks.size() - i; i++) {
                     const size_t rev_i = tasks.size() - 1 - i;
                     newTasks.push_back(tasks[i]);
                     if (i != rev_i)
                         newTasks.push_back(tasks[rev_i]);
                 }
 
                 assert(tasks.size() == newTasks.size());
                 tasks.swap(newTasks);
             } else if (tstage->outputBottomLimit() > 0) {
-                // bottom limit only, just reverse the task order
+                // case 3: bottom limit only, just reverse the task order.
+                // We execute the last partitions first, since the top rows are not needed,
+                // which speeds up execution.
                 std::reverse(tasks.begin(), tasks.end());
             }
-            // if top limit only, do nothing since the order is already good
+            // case 2: top limit only: do nothing, since the order is already good
+            // (the tasks are generated in ascending order)
         }
 
         return tasks;
@@ -887,7 +897,8 @@ namespace tuplex {
                 auto output_par = tstage->inputPartitions();
 
                 if (tstage->hasOutputLimit()) {
-                    trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage);
+                    trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage,
+                                          _driver.get());
                 }
                 tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects);
                 pyObjects.clear();
@@ -971,7 +982,6 @@ namespace tuplex {
         }
 
         auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms);
-
         auto completedTasks = performTasks(tasks);
 
         // Note: this doesn't work yet because of the globals.
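// [Editorial note, not part of the original patch] Worked example of the task striping
// introduced in the hunk above, assuming five tasks with orders 0..4: each iteration
// emits one task from the front and one from the back, so the execution order becomes
// 0, 4, 1, 3, 2. A self-contained sketch of the same reordering:
//
//     std::vector<int> order{0, 1, 2, 3, 4};
//     std::vector<int> striped;
//     for (size_t i = 0; i < order.size() - i; ++i) {
//         striped.push_back(order[i]);            // next task from the front
//         const size_t rev_i = order.size() - 1 - i;
//         if (rev_i != i)
//             striped.push_back(order[rev_i]);    // next task from the back
//     }
//     // striped == {0, 4, 1, 3, 2}: front tasks satisfy the top limit early, back
//     // tasks satisfy the bottom limit early, so small limits rarely require
//     // running the middle partitions at all.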
@@ -1205,7 +1215,9 @@ namespace tuplex { } if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + // the function expect the output to be sorted in ascending order (guaranteed by sortTasks()) + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, + _driver.get()); } tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); @@ -1565,12 +1577,6 @@ namespace tuplex { logger().debug("task without order found, please fix in code."); } #endif - - for (int i = 0; i < tasks.size(); i++) { - // take limit only work with uniform order - assert(tasks[i]->getOrder(0) == i); - } - // add all tasks to queue for(auto& task : tasks) wq.addTask(task); @@ -2125,17 +2131,18 @@ namespace tuplex { tstage->setFileResult(ecounts); } - void LocalBackend::trimPartitionsToLimit(std::vector &partitions, + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage* tstage) { + TransformStage* tstage, + Executor *exec) { std::vector limitedPartitions, limitedTailPartitions; // check top output limit, adjust partitions if necessary size_t numTopOutputRows = 0; - Partition* lastTopPart = nullptr; + Partition *lastTopPart = nullptr; size_t clippedTop = 0; - for (auto partition : partitions) { + for (auto partition: partitions) { numTopOutputRows += partition->getNumRows(); lastTopPart = partition; if (numTopOutputRows >= topLimit) { @@ -2174,7 +2181,8 @@ namespace tuplex { auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); if (clipped > 0) { - Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, + exec); assert(newPart->getNumRows() == clipped); limitedTailPartitions.push_back(newPart); } @@ -2191,10 +2199,11 @@ namespace tuplex { assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); // split into two partitions with both top and bottom are in the same partition - Partition* lastBottomPart = nullptr; + Partition *lastBottomPart = nullptr; if (clippedBottom != 0) { - lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage); + lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, + tstage, exec); } if (clippedTop != 0) { @@ -2215,27 +2224,28 @@ namespace tuplex { partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); } - Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { + Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { auto ptr = p_in->lockRaw(); - auto num_rows = *((int64_t*) ptr); + auto num_rows = *((int64_t *) ptr); assert(numToSkip < num_rows); ptr += sizeof(int64_t); size_t numBytesToSkip = 0; - for(unsigned i = 0; i < numToSkip; ++i) { - Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); + Deserializer ds(tstage->outputSchema()); + for (unsigned i = 0; i < numToSkip; ++i) { + Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); ptr += r.serializedLength(); numBytesToSkip += r.serializedLength(); } - Partition *p_out = 
_driver->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); + Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); assert(p_out->capacity() >= p_in->size() - numBytesToSkip); auto ptr_out = p_out->lockRaw(); - *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; + *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); p_out->unlock(); diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index f289064d5..e88189447 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -240,7 +240,7 @@ namespace tuplex { // user wants to merge exceptions in order. bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER(); - // create transfrom stage via builder pattern + // create transform stage via builder pattern auto builder = codegen::StageBuilder(_num_stages++, isRootStage, _context.getOptions().UNDEFINED_BEHAVIOR_FOR_OPERATORS(), diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index bfd656dc8..0eb6d95ad 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -127,15 +127,19 @@ namespace tuplex { return vector{}; Deserializer ds(_schema); - for(int i = 0; i < limit;) { + for (size_t i = 0; i < limit;) { // all exhausted - if(_partitions.empty()) + if (_partitions.empty()) break; // get number of rows in first partition Partition *first = _partitions.front(); auto num_rows = first->getNumRows(); + + assert(num_rows >= _curRowCounter); + assert(limit >= i); + // how many left to retrieve? auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); assert(num_to_retrieve_from_partition >= 0); @@ -145,8 +149,8 @@ namespace tuplex { // thread safe version (slow) // get next element of partition - const uint8_t* ptr = first->lock(); - for(int j = 0; j < num_to_retrieve_from_partition; ++j) { + const uint8_t *ptr = first->lock(); + for (size_t j = 0; j < num_to_retrieve_from_partition; ++j) { auto row = Row::fromMemory(ds, ptr + _byteCounter, first->capacity() - _byteCounter); _byteCounter += row.serializedLength(); _curRowCounter++; diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 060365697..b0e2e70ab 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -139,10 +139,17 @@ namespace tuplex { } } + size_t maxRows; + if (hasOutputLimit()) { + maxRows = outputTopLimit() + outputBottomLimit(); + } else { + maxRows = std::numeric_limits::max(); + } + // put ALL partitions to result set _rs = std::make_shared(schema, partitions, generalCase, partitionToExceptionsMap, interpreterRows, - outputTopLimit() + outputBottomLimit()); + maxRows); } } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 377385deb..2de71e4fe 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -18,18 +18,22 @@ #include namespace tuplex { - // atomic var to count output rows! 
-    static std::atomic_int64_t g_totalTopOutputRows;
-    static std::atomic_int64_t g_totalBottomOutputRows;
+    // This is the logic to stop execution once both the topLimit and the bottomLimit have been reached.
+    // Here, we assume that task orders start at zero and count up by 1, e.g. 0, 1, 2, ..., n.
+    // To implement the limit, we maintain a mapping from task order to the number of rows done in that task
+    // (an entry appears once a task has finished processing and holds its number of output rows).
+    // We can then find out how many top rows are done by scanning g_rowsDone[0], g_rowsDone[1], ...
+    // until we reach an order with no entry yet, i.e. a task that has not finished.
+    // Likewise, we can find the bottom rows done by scanning g_rowsDone[g_maxOrder], g_rowsDone[g_maxOrder - 1], ...
 
     // mapping from order number -> row count if the task is finished
     static std::mutex g_rowsDoneMutex;
-    static std::map<size_t, size_t> g_rowsDone;
+    static std::unordered_map<size_t, size_t> g_rowsDone;
+    static std::atomic_size_t g_maxOrder;
 
-    void TransformTask::resetOutputLimitCounter() {
-        g_totalTopOutputRows = 0;
-        g_totalBottomOutputRows = 0;
+    void TransformTask::setMaxOrderAndResetLimits(size_t maxOrder) {
         g_rowsDone.clear();
+        g_maxOrder = maxOrder;
     }
 }
 
@@ -602,25 +606,16 @@ namespace tuplex {
 #endif
     }
 
-    void TransformTask::processMemorySource() {
-        assert(!_inputPartitions.empty());
-        assert(_functor);
-
-        _numInputRowsRead = 0;
-        _numOutputRowsWritten = 0;
-
-        int64_t num_normal_rows = 0, num_bad_rows = 0;
-
-        auto functor = reinterpret_cast(_functor);
+    bool TransformTask::limitReached() const {
+        size_t numTopCompleted = 0;
+        size_t numBottomCompleted = 0;
+        bool isTopLimitReached = false;
+        bool isBottomLimitReached = false;
 
-        // go over all input partitions.
-        for(const auto &inputPartition : _inputPartitions) {
-            size_t numTopCompleted = 0;
-            size_t numBottomCompleted = 0;
-            bool isTopLimitReached = false;
-            bool isBottomLimitReached = false;
-
-            tuplex::g_rowsDoneMutex.lock();
+        tuplex::g_rowsDoneMutex.lock();
+        if (_outTopLimit == 0) {
+            isTopLimitReached = true;
+        } else {
             for (size_t i = 0; tuplex::g_rowsDone.count(i) != 0; i++) {
                 numTopCompleted += tuplex::g_rowsDone[i];
                 if (numTopCompleted >= _outTopLimit) {
@@ -628,17 +623,44 @@ namespace tuplex {
                     isTopLimitReached = true;
                     break;
                 }
             }
-            // TODO: what is the max task number here
-            for (size_t i = 100; tuplex::g_rowsDone.count(i) != 0; i--) {
+        }
+
+        if (_outBottomLimit == 0) {
+            isBottomLimitReached = true;
+        } else {
+            for (size_t i = tuplex::g_maxOrder; tuplex::g_rowsDone.count(i) != 0; i--) {
                 numBottomCompleted += tuplex::g_rowsDone[i];
-                if (numBottomCompleted >= _outTopLimit) {
+                if (numBottomCompleted >= _outBottomLimit) {
                     isBottomLimitReached = true;
                     break;
                 }
             }
-            tuplex::g_rowsDoneMutex.unlock();
+        }
+        tuplex::g_rowsDoneMutex.unlock();
 
-            if (isTopLimitReached && isBottomLimitReached) {
+        return isTopLimitReached && isBottomLimitReached;
+    }
+
+    void TransformTask::updateLimits() {
+        tuplex::g_rowsDoneMutex.lock();
+        tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows();
+        tuplex::g_rowsDoneMutex.unlock();
+    }
+
+    void TransformTask::processMemorySource() {
+        assert(!_inputPartitions.empty());
+        assert(_functor);
+
+        _numInputRowsRead = 0;
+        _numOutputRowsWritten = 0;
+
+        int64_t num_normal_rows = 0, num_bad_rows = 0;
+
+        auto functor = reinterpret_cast(_functor);
+
+        // go over all input partitions.
+ for(const auto &inputPartition : _inputPartitions) { + if (limitReached()) { // skip the execution, enough is done break; } @@ -665,9 +687,7 @@ namespace tuplex { if(_invalidateSourceAfterUse) inputPartition->invalidate(); - tuplex::g_rowsDoneMutex.lock(); - tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows(); - tuplex::g_rowsDoneMutex.unlock(); + updateLimits(); } #ifndef NDEBUG diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 4d02cf4d4..7eda223a1 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -113,7 +113,8 @@ def collect(self): return self._dataSet.collect() def take(self, limitTop=5, limitBottom=0): - """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + """ action that generates a physical plan, processes data and collects the top and bottom results + then as list of tuples. Args: limitTop (int): number of top rows to collect. Per default ``5``. @@ -135,6 +136,26 @@ def take(self, limitTop=5, limitBottom=0): return self._dataSet.take(limitTop, limitBottom) + def head(self, nrows): + """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + + Args: + nrows (int): number of rows to collect. + Returns: + (list): A list of tuples + """ + return self.take(nrows, 0) + + def tail(self, nrows): + """ action that generates a physical plan, processes data and collects the bottom results then as list of tuples. + + Args: + nrows (int): number of rows to collect. + Returns: + (list): A list of tuples + """ + return self.take(0, nrows) + def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted ASCII table to stdout. @@ -151,6 +172,15 @@ def show(self, nrows=None): self._dataSet.show(nrows) + def _getHTMLRow(self, ind, row): + row_str = "" + row_str += " \n" + row_str += " {}\n".format(ind) + for col in row: + row_str += " {}\n".format(col) + row_str += " \n" + return row_str + def showHTMLPreview(self, topLimit=5, bottomLimit=5): """ action that generates a physical plan, processes data and return a subset of results as nicely formatted HTML table to stdout. @@ -162,14 +192,108 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): Returns: string: an HTML table showing a preview of the data """ + HTML_TEMPLATE = ( + "
\n" + "\n" + "\n" + " \n" + " \n" + "{}" + " \n" + " \n" + " \n" + "{}" + " \n" + "
\n"
+            "<p>\n"
+            "{} columns\n"
+            "</p>\n"
+            "</div>
") + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + # TODO(march): edit this top/bottom limit if topLimit is None or topLimit < 0: topLimit = -1 if bottomLimit is None or bottomLimit < 0: bottomLimit = -1 - return self._dataSet.showHTMLPreview(topLimit, bottomLimit) + rows = self.take(topLimit, bottomLimit) + + if len(rows) == 0: + return HTML_TEMPLATE.format("\n", "\n") + + assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit + + headers_str = "" + body = "" + num_columns = None + + # construct tables + if len(rows) < topLimit + bottomLimit: + # the data is small so we get everything (no need to render ...) + i = 0 + for r in rows: + if i == 0: + # we set num columns based on the first row + num_columns = r.getNumColumns() + body += self._getHTMLRow(i, r) + i += 1 + else: + # some data is not processed because of limiting + i = 0 + for r in rows: + if i >= topLimit: + break + if i == 0: + # we set num columns based on the first row + num_columns = r.getNumColumns() + + body += self._getHTMLRow(i, r) + i += 1 + + # add the ... + body += " \n" + body += " ...\n" + for _ in range(num_columns): + body += " ...\n" + body += " \n" + + for j in range(i, len(rows)): + body += self._getHTMLRow(i, rows[j]) + + assert num_columns is not None + + # construct headers + column_names = self._dataSet.columns() + if column_names is not None: + assert (num_columns == column_names.size()) + for c_name in column_names: + headers_str += " {}\n".format(c_name) + else: + # default to generic name if column name doesn't exist + for i in range(num_columns): + headers_str += " Column {}\n".format(i) + + return HTML_TEMPLATE.format(headers_str, body, num_columns) + + def _getConsoleRow(self, ind, row): + # TODO(march): (work on this) + pass + + def showConsolePreview(self, topLimit=5, bottomLimit=5): + # TODO(march): (work on this) + pass def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. diff --git a/tuplex/test/core/ContextBasics.cc b/tuplex/test/core/ContextBasics.cc index fdbdd8d50..0be3c6030 100644 --- a/tuplex/test/core/ContextBasics.cc +++ b/tuplex/test/core/ContextBasics.cc @@ -136,4 +136,58 @@ TEST_F(ContextBasicsTest, JSON) { auto str = ContextOptions::defaults().asJSON(); EXPECT_GT(str.length(), 2); -} \ No newline at end of file +} + +TEST_F(ContextBasicsTest, twoContextTest) { + using namespace tuplex; + + python::initInterpreter(); + python::unlockGIL(); + + ContextOptions co = testOptions(); + co.set("tuplex.partitionSize", "100B"); + co.set("tuplex.executorMemory", "1MB"); + co.set("tuplex.scratchDir", scratchDir + "/context1"); + + // second context with different executor config, should cause the driver to split up + ContextOptions co2 = testOptions(); + co.set("tuplex.partitionSize", "100B"); + co2.set("tuplex.executorMemory", "2MB"); + co2.set("tuplex.scratchDir", scratchDir + "/context2"); + + Context c1(co); + Context c2(co2); + Row row1(Tuple(0), Tuple("hello")); + Row row2(Tuple(1), Tuple("this")); + Row row3(Tuple(2), Tuple("is")); + Row row4(Tuple(3), Tuple("a")); + Row row5(Tuple(4), Tuple("test")); + + for (int t = 0; t < 10; t++) { + auto ds1 = c1.parallelize({row1, row2, row3, row4, row5}) + .map(UDF("lambda x: x[1][0]")); // new code: string index operator! first to raise an exception! 
+ + auto ds2 = c2.parallelize({row1, row2, row3, row4, row5}) + .map(UDF("lambda x: x[1][0]")); // new code: string index operator! first to raise an exception! + + auto v1 = ds1.collectAsVector(); + auto v2 = ds2.collectAsVector(); + + std::vector ref{"hello", "this", "is", "a", "test"}; + + EXPECT_EQ(v1.size(), 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(v1[i].getString(0), ref[i]); + } + + EXPECT_EQ(v2.size(), 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(v2[i].getString(0), ref[i]); + } + } + + python::lockGIL(); + python::closeInterpreter(); +} + +// TODO(march): multiple context test \ No newline at end of file diff --git a/tuplex/test/core/ResultSetTest.cc b/tuplex/test/core/ResultSetTest.cc index 4acd38921..2ea273062 100644 --- a/tuplex/test/core/ResultSetTest.cc +++ b/tuplex/test/core/ResultSetTest.cc @@ -14,7 +14,7 @@ class ResultSetTest : public PyTest { protected: - tuplex::Executor *driver; + std::shared_ptr driver; tuplex::ContextOptions options; public: // init function @@ -45,7 +45,8 @@ class ResultSetTest : public PyTest { EXPECT_EQ(r.getRowType(), first_type); // now write via partition writer - tuplex::PartitionWriter pw(driver, Schema(Schema::MemoryLayout::ROW, first_type), 0, 0, options.PARTITION_SIZE()); + tuplex::PartitionWriter pw(driver.get(), Schema(Schema::MemoryLayout::ROW, first_type), 0, 0, + options.PARTITION_SIZE()); for(const auto& r : rows) pw.writeRow(r); return pw.getOutputPartitions(); diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 3990fcd07..86173e40b 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -16,7 +16,8 @@ using namespace tuplex; using namespace std; -class TakeTest : public PyTest {}; +class TakeTest : public PyTest { +}; /** * Randomly generate a vector of rows for testing @@ -37,15 +38,15 @@ vector generateTestData(size_t N, uint64_t seed) { return data; } -vector generateReferenceData(const vector& input, size_t topLimit, size_t bottomLimit) { +vector generateReferenceData(const vector &input, size_t topLimit, size_t bottomLimit) { vector output; - for(size_t i = 0; i < topLimit && i < input.size(); i++) { + for (size_t i = 0; i < topLimit && i < input.size(); i++) { output.push_back(input[i]); } size_t start_bottom = input.size() >= bottomLimit ? input.size() - bottomLimit : 0; start_bottom = max(topLimit, start_bottom); - for(size_t i = start_bottom; i < input.size(); i++) { + for (size_t i = start_bottom; i < input.size(); i++) { output.push_back(input[i]); } @@ -57,7 +58,7 @@ TEST_F(TakeTest, takeTopTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); ASSERT_EQ(rs->rowCount(), 1); auto v = rs->getRows(1); @@ -65,7 +66,7 @@ TEST_F(TakeTest, takeTopTest) { EXPECT_EQ(v[0].getInt(0), 1); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -75,7 +76,8 @@ TEST_F(TakeTest, takeTopTest) { EXPECT_EQ(v2[2].getInt(0), 3); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(5, 0); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! 
:)"), + Row("!")}).take(5, 0); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -93,7 +95,7 @@ TEST_F(TakeTest, takeBottomTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); ASSERT_EQ(rs->rowCount(), 1); auto v = rs->getRows(1); @@ -101,7 +103,7 @@ TEST_F(TakeTest, takeBottomTest) { EXPECT_EQ(v[0].getInt(0), 6); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -111,7 +113,8 @@ TEST_F(TakeTest, takeBottomTest) { EXPECT_EQ(v2[2].getInt(0), 6); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(0, 5); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), + Row("!")}).take(0, 5); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -129,7 +132,7 @@ TEST_F(TakeTest, takeBothTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); ASSERT_EQ(rs->rowCount(), 2); auto v = rs->getRows(2); @@ -138,7 +141,7 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v[1].getInt(0), 6); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -148,7 +151,8 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v2[2].getInt(0), 6); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(2, 3); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! 
:)"), + Row("!")}).take(2, 3); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -167,12 +171,12 @@ TEST_F(TakeTest, takeBigTest) { const std::vector limit_values{0, 1, 5, 11, 600, 10000}; const std::vector partition_sizes{"256B", "512KB", "1MB"}; - for(auto& part_size : partition_sizes) { + for (auto &part_size: partition_sizes) { auto opt = testOptions(); opt.set("tuplex.partitionSize", part_size); Context context(opt); - for(auto data_size : test_size) { + for (auto data_size: test_size) { for (auto top_limit: limit_values) { for (auto bottom_limit: limit_values) { std::cout << "testing with partition size:" << part_size << " data size:" @@ -195,12 +199,149 @@ TEST_F(TakeTest, takeBigTest) { } } -// TODO(march): with map, filter function -//TEST_F(TakeTest, takeMapFilterTest) { -// srand(4242); -//} +vector generateMapFilterReferenceData(const vector &input, size_t topLimit, size_t bottomLimit) { + if (input.empty()) { + return {}; + } + + assert(input[0].getNumColumns() == 3); + vector intermediate; + for (const Row &r: input) { + int64_t new_a = r.getInt(0) + r.getInt(1); + + if (new_a % 2 == 0) { + intermediate.emplace_back(new_a, r.getInt(2)); + } + } + + return generateReferenceData(intermediate, topLimit, bottomLimit); +} + +TEST_F(TakeTest, takeMapFilterTest) { + mt19937 data_seed_gen(56120); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + UDF map_udf("lambda a, b, c: ((a + b), c)"); + UDF filter_udf("lambda a, b: a % 2 == 0"); + + for (auto &part_size: partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (auto data_size: test_size) { + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto data = generateTestData(data_size, data_seed_gen()); + auto ref_data = generateMapFilterReferenceData(data, top_limit, bottom_limit); + + auto ds = context.parallelize(data).map(map_udf).filter(filter_udf); + auto res = ds.take(top_limit, bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} + +TEST_F(TakeTest, collectIdentityTest) { + mt19937 data_seed_gen(123454); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + for (auto &part_size: partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (auto data_size: test_size) { + auto data = generateTestData(data_size, data_seed_gen()); + auto res = context.parallelize(data).collect(); + ASSERT_EQ(data.size(), res->rowCount()); + for (Row &r: data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } +} + +TEST_F(TakeTest, fileInputTest) { + const std::vector test_size{1, 10, 100, 1001, 50001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + std::vector> expected_outputs; + + if (!boost::filesystem::exists(scratchDir)) { + boost::filesystem::create_directory(scratchDir); + } + + std::vector fileInputNames; + for (unsigned long N: test_size) { + 
std::vector ref_output; + // write temp file + auto fName = fmt::format("{}/{}-{}.csv", scratchDir, testName, N); + + FILE *fp = fopen(fName.c_str(), "w"); + ASSERT_TRUE(fp); + fprintf(fp, "colA,colStr,colB\n"); + for (int i = 0; i < N; ++i) { + fprintf(fp, "%d,\"hello%d\",%d\n", i, (i * 3) % 7, i % 15); + ref_output.emplace_back(i, fmt::format("hello{}", (i * 3) % 7), (i % 15) * (i % 15)); + } + fclose(fp); + + expected_outputs.push_back(std::move(ref_output)); + fileInputNames.push_back(fName); + } -// TODO(march): with file input -// context.csv("../resources/"); + ASSERT_TRUE(expected_outputs.size() == test_size.size()); + ASSERT_TRUE(fileInputNames.size() == test_size.size()); + + for (auto &part_size: partition_sizes) { + auto opt = microTestOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (int t = 0; t < test_size.size(); t++) { + const size_t data_size = test_size[t]; + + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "file testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto ref_output = generateReferenceData(expected_outputs[t], top_limit, bottom_limit); + auto res = context.csv(testName + ".csv") + .mapColumn("colB", UDF("lambda x: x * x")) + .take(top_limit, bottom_limit); + + ASSERT_EQ(ref_output.size(), res->rowCount()); + for (Row &r: ref_output) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} -// TODO(march): collect operator \ No newline at end of file +// TODO(march): write test for trimPartitionsToLimit \ No newline at end of file From 72b6580c763f46e9867aa04ee8488fa9bd5400c3 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 15 Apr 2022 02:50:20 -0400 Subject: [PATCH 40/56] Add file testcases --- tuplex/core/src/LocalEngine.cc | 6 ++---- tuplex/test/core/ContextBasics.cc | 4 +--- tuplex/test/core/TakeTest.cc | 22 ++++++++++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tuplex/core/src/LocalEngine.cc b/tuplex/core/src/LocalEngine.cc index c9c6d506b..91892d44d 100644 --- a/tuplex/core/src/LocalEngine.cc +++ b/tuplex/core/src/LocalEngine.cc @@ -123,7 +123,6 @@ namespace tuplex { if (_driver) { Logger::instance().logger("local execution engine").info( "driver already exist, starting new driver with updated config"); - _driver->release(); // TODO(march): test whether we need this } // lazy start driver @@ -132,10 +131,9 @@ namespace tuplex { "driver"); _driver_cfg = new_cfg; - // TODO(march): this could be a problem, if multiple driver with number = 0 - // TODO(march): write a test for two drivers existing together (thread number 0) - // TODO(march): make a comment about potential issue here // driver always has thread number 0! + // Note: this could be a potential issue if the config change and the old driver is still running + // due to external reference. 
Then there could be two executors with the same number _driver->setThreadNumber(0); std::stringstream ss; diff --git a/tuplex/test/core/ContextBasics.cc b/tuplex/test/core/ContextBasics.cc index 0be3c6030..e85107b40 100644 --- a/tuplex/test/core/ContextBasics.cc +++ b/tuplex/test/core/ContextBasics.cc @@ -188,6 +188,4 @@ TEST_F(ContextBasicsTest, twoContextTest) { python::lockGIL(); python::closeInterpreter(); -} - -// TODO(march): multiple context test \ No newline at end of file +} \ No newline at end of file diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 86173e40b..eda609518 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -282,9 +282,9 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 10, 100, 1001, 50001}; - const std::vector limit_values{0, 1, 5, 11, 600, 10000}; - const std::vector partition_sizes{"256B", "512KB", "1MB"}; + const std::vector test_size{1, 10, 1001, 50001}; + const std::vector limit_values{0, 1, 6, 600, 10000}; + const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; if (!boost::filesystem::exists(scratchDir)) { @@ -327,21 +327,23 @@ TEST_F(TakeTest, fileInputTest) { << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; auto ref_output = generateReferenceData(expected_outputs[t], top_limit, bottom_limit); - auto res = context.csv(testName + ".csv") + auto res = context.csv(fileInputNames[t]) .mapColumn("colB", UDF("lambda x: x * x")) .take(top_limit, bottom_limit); ASSERT_EQ(ref_output.size(), res->rowCount()); for (Row &r: ref_output) { Row res_row = res->getNextRow(); - if (!(res_row == r)) { - ASSERT_EQ(res_row, r); - } + ASSERT_EQ(res_row.getInt(0), r.getInt(0)); + ASSERT_EQ(res_row.getString(1), r.getString(1)); + ASSERT_EQ(res_row.getInt(2), r.getInt(2)); + // TODO(march): this doesn't work because schema are different (for some reason infer as opt[int]?) 
+ // if (!(res_row == r)) { + // ASSERT_EQ(res_row, r); + // } } } } } } -} - -// TODO(march): write test for trimPartitionsToLimit \ No newline at end of file +} \ No newline at end of file From 172d6b57690b2cb0b0dadc0c584178b1d1ce862f Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Tue, 19 Apr 2022 23:45:01 -0400 Subject: [PATCH 41/56] Python Dataset Debug --- tuplex/python/src/PythonDataSet.cc | 4 + tuplex/python/tuplex/dataset.py | 128 +++++++++++++++------ tuplex/python/tuplex/utils/table_format.py | 80 +++++++++++++ tuplex/test/core/TakeTest.cc | 8 +- 4 files changed, 178 insertions(+), 42 deletions(-) create mode 100644 tuplex/python/tuplex/utils/table_format.py diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 1f543e5d2..5382ad24d 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -133,11 +133,15 @@ namespace tuplex { size_t castedTopLimit = 0; if (topLimit < 0) { castedTopLimit = std::numeric_limits::max(); + } else { + castedTopLimit = topLimit; } size_t castedBottomLimit = 0; if (bottomLimit < 0) { castedBottomLimit = std::numeric_limits::max(); + } else { + castedBottomLimit = bottomLimit; } try { diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 7eda223a1..c0b9ef4d0 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -19,6 +19,7 @@ from tuplex.utils.framework import UDFCodeExtractionError from tuplex.utils.source_vault import SourceVault from .exceptions import classToExceptionCode +import tuplex.utils.table_format as table_format # signed 64bit limit max_rows = 9223372036854775807 @@ -29,7 +30,10 @@ def __init__(self): self._dataSet = None def _repr_html_(self): - return self._dataSet.showHTMLPreview() + return self.showHTMLPreview() + + def __repr__(self): + return self.showStrPreview() def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. @@ -172,15 +176,6 @@ def show(self, nrows=None): self._dataSet.show(nrows) - def _getHTMLRow(self, ind, row): - row_str = "" - row_str += " \n" - row_str += " {}\n".format(ind) - for col in row: - row_str += " {}\n".format(col) - row_str += " \n" - return row_str - def showHTMLPreview(self, topLimit=5, bottomLimit=5): """ action that generates a physical plan, processes data and return a subset of results as nicely formatted HTML table to stdout. @@ -195,17 +190,17 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): HTML_TEMPLATE = ( "
\n" "\n" "\n" " \n" @@ -222,16 +217,10 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - # TODO(march): edit this top/bottom limit - if topLimit is None or topLimit < 0: - topLimit = -1 - if bottomLimit is None or bottomLimit < 0: - bottomLimit = -1 - rows = self.take(topLimit, bottomLimit) if len(rows) == 0: - return HTML_TEMPLATE.format("\n", "\n") + return HTML_TEMPLATE.format("\n", "\n", 0) assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit @@ -246,8 +235,8 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): for r in rows: if i == 0: # we set num columns based on the first row - num_columns = r.getNumColumns() - body += self._getHTMLRow(i, r) + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + body += table_format.getHTMLRow(i, r) i += 1 else: # some data is not processed because of limiting @@ -257,9 +246,9 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): break if i == 0: # we set num columns based on the first row - num_columns = r.getNumColumns() + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 - body += self._getHTMLRow(i, r) + body += table_format.getHTMLRow(i, r) i += 1 # add the ... @@ -270,14 +259,15 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): body += " \n" for j in range(i, len(rows)): - body += self._getHTMLRow(i, rows[j]) + body += table_format.getHTMLRow(len(rows) - j, rows[j]) assert num_columns is not None # construct headers column_names = self._dataSet.columns() - if column_names is not None: - assert (num_columns == column_names.size()) + headers_str += " \n" + if len(column_names) > 0: + assert (num_columns == len(column_names)) for c_name in column_names: headers_str += " \n".format(c_name) else: @@ -287,13 +277,79 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): return HTML_TEMPLATE.format(headers_str, body, num_columns) - def _getConsoleRow(self, ind, row): - # TODO(march): (work on this) - pass + def showStrPreview(self, topLimit=5, bottomLimit=5): + """ action that generates a physical plan, processes data and return a subset of results as nicely formatted + ASCII table to stdout. + + Args: + topLimit (int): number of top rows to collect. If ``None`` all rows will be collected + bottomLimit (int): number of bottom rows to collect. If ``None`` all rows will be collected + + Returns: + string: an HTML table showing a preview of the data + """ + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + rows = self.take(topLimit, bottomLimit) + + if len(rows) == 0: + return ( + "---\n" + "| |\n" + "---\n" + "0 columns\n") + + assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit + + str_table = [] + num_columns = None + + # construct tables + if len(rows) < topLimit + bottomLimit: + # the data is small so we get everything (no need to render ...) 
+ i = 0 + for r in rows: + if i == 0: + # we set num columns based on the first row + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + str_table.append(table_format.getStrTableRow(i, r)) + i += 1 + else: + # some data is not processed because of limiting + i = 0 + for r in rows: + if i >= topLimit: + break + if i == 0: + # we set num columns based on the first row + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + + str_table.append(table_format.getStrTableRow(i, r)) + i += 1 + + # add the ... + str_table.append(["..."] * (num_columns + 1)) + + for j in range(i, len(rows)): + str_table.append(table_format.getStrTableRow(len(rows) - j, rows[j])) + + assert num_columns is not None + + # construct headers + column_names = self._dataSet.columns() + headers_list = [""] + if len(column_names) > 0: + assert (num_columns == len(column_names)) + for c_name in column_names: + headers_list.append("{}".format(c_name)) + else: + # default to generic name if column name doesn't exist + for i in range(num_columns): + headers_list.append("Column {}".format(i)) + + str_table = [headers_list] + str_table - def showConsolePreview(self, topLimit=5, bottomLimit=5): - # TODO(march): (work on this) - pass + return table_format.generateStrTable(num_columns + 1, str_table) def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py new file mode 100644 index 000000000..bb83118b4 --- /dev/null +++ b/tuplex/python/tuplex/utils/table_format.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# ----------------------------------------------------------------------------------------------------------------------# +# # +# Tuplex: Blazing Fast Python Data Science # +# # +# # +# (c) 2017 - 2021, Tuplex team # +# Created by Leonhard Spiegelberg first on 4/19/2022 # +# License: Apache 2.0 # +# ----------------------------------------------------------------------------------------------------------------------# + +def getHTMLRow(ind, row): + """ + Given a row, converts all the contents to an HTML row and return + :param ind: the index of that row + :param row: a row output from dataset + :return: an HTML row, representative of the row + """ + row_str = "" + row_str += " \n" + row_str += " \n".format(ind) + if isinstance(row, list) or isinstance(row, tuple): + for col in row: + row_str += " \n".format(col) + else: + row_str += " \n".format(row) + row_str += " \n" + return row_str + + +def getStrTableRow(ind, row): + """ + Given a row, converts all the contents to string and return + :param ind: the index of that row + :param row: a row output from dataset + :return: a list of string, representative of the row + """ + row_str_list = ["{}".format(ind)] + if isinstance(row, list) or isinstance(row, tuple): + for col in row: + row_str_list.append("{}".format(col)) + else: + row_str_list.append("{}".format(row)) + return row_str_list + + +def _getLineDivider(col_width): + out = "" + for w in col_width: + out += "+" + ("-" * (w + 2)) + out += "+\n" + + return out + +def generateStrTable(numCols, strTable): + """ + Given a 2-dimensional list of strings, print a nicely formatted table of the contents in the list + :param numCols: number of columns in the table + :param strTable: 2-dimensional list of strings, as list of list + :return: a nicely formatted table in string + """ + 
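    # [Editorial note, not part of the original patch] A small worked example of the
    # width computation below, assuming a one-column table plus the leading index column:
    #
    #     strTable = [["", "colA"], ["0", "hello"], ["1", "hi"]]
    #     # -> max_col_width becomes [1, 5]; each cell is then padded to its column's
    #     # widest entry, e.g. the row "| 0 | hello |".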
max_col_width = [0] * numCols + + for r in strTable: + for i in range(0, len(r)): + assert (isinstance(r[i], str)) + if len(r[i]) > max_col_width[i]: + max_col_width[i] = len(r[i]) + + output_str = "" + + for r in strTable: + output_str += _getLineDivider(max_col_width) + for i in range(0, len(r)): + output_str += "| {:<{width}} ".format(r[i], width=max_col_width[i]) + output_str += "|\n" + + output_str += _getLineDivider(max_col_width) + "{} columns\n".format(numCols) + + return output_str diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index eda609518..4e4a70f53 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -282,8 +282,8 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 10, 1001, 50001}; - const std::vector limit_values{0, 1, 6, 600, 10000}; + const std::vector test_size{1, 1001, 50001}; + const std::vector limit_values{0, 1, 600, 10000}; const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; @@ -337,10 +337,6 @@ TEST_F(TakeTest, fileInputTest) { ASSERT_EQ(res_row.getInt(0), r.getInt(0)); ASSERT_EQ(res_row.getString(1), r.getString(1)); ASSERT_EQ(res_row.getInt(2), r.getInt(2)); - // TODO(march): this doesn't work because schema are different (for some reason infer as opt[int]?) - // if (!(res_row == r)) { - // ASSERT_EQ(res_row, r); - // } } } } From a2d41784b6a00050b96920c0b7f7cb61f2fce206 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Tue, 19 Apr 2022 23:47:17 -0400 Subject: [PATCH 42/56] Remove showHTMLPreview from Dataset in C++ --- tuplex/core/include/DataSet.h | 8 -- tuplex/core/src/DataSet.cc | 108 -------------------------- tuplex/python/include/PythonDataSet.h | 1 - tuplex/python/src/PythonBindings.cc | 1 - tuplex/python/src/PythonDataSet.cc | 49 ------------ 5 files changed, 167 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 3a5f450ac..1b11c1f75 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -130,14 +130,6 @@ namespace tuplex { */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); - /*! - * Displays a formatted HTML table of a small portion of the data - * @param topLimit how many top rows to print - * @param bottomLimit how many bottom rows to print - * @param os ostream where to print table to - */ - virtual void showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); - // named dataset management functions /*! * map Column using a UDF diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index b62946ae4..d54edb567 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -756,114 +756,6 @@ namespace tuplex { printTable(os, headers, rows); } - void printHTMLRow(std::ostream &os, size_t ind, const Row& r) { - os << " \n"; - os << fmt::format(" \n", ind); - for (auto& s : r.getAsStrings()) { - os << fmt::format(" \n", s); - } - os << " \n"; - } - - void DataSet::showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os) { - std::string HTML_TEMPLATE = - "
\n" - "\n" - "
{}
{}{}{}
{}{}
\n" - " \n" - " \n" - "{}" - " \n" - " \n" - " \n" - "{}" - " \n" - "
\n" - "

{} columns

\n" - "
"; - - assert(_context); - - auto rows = take(topLimit, bottomLimit); - - if (rows->rowCount() == 0) { - os << fmt::format(HTML_TEMPLATE, "\n", "\n", 0); - return; - } - - std::stringstream headers_stream, body_stream; - size_t numColumns = 0; - assert(rows->rowCount() <= topLimit + bottomLimit); - - // construct tables - if (rows->rowCount() < topLimit + bottomLimit) { - // the data is small so we get everything (no need to render ...) - for (size_t i = 0; rows->hasNextRow(); i++) { - Row r = rows->getNextRow(); - if (i == 0) { - // we set num columns based on the first row - numColumns = r.getNumColumns(); - } - - printHTMLRow(body_stream, i, r); - } - } else { - // some data is not processed because of limiting - size_t i; - for (i = 0; rows->hasNextRow() && i < topLimit; i++) { - Row r = rows->getNextRow(); - if (i == 0) { - // we set num columns based on the first row - numColumns = r.getNumColumns(); - } - - printHTMLRow(body_stream, i, r); - } - - // add the ... - body_stream << " \n"; - body_stream << " ...\n"; - for(int j = 0; j < numColumns; j++) { - body_stream << " ...\n"; - body_stream << " \n"; - } - - while (rows->hasNextRow()) { - Row r = rows->getNextRow(); - printHTMLRow(body_stream, i, r); - } - } - - assert(numColumns != 0); - - // construct headers - if (!_columnNames.empty()) { - assert(numColumns == _columnNames.size()); - for (auto &c_name: _columnNames) { - headers_stream << fmt::format(" {}\n", c_name); - } - } else { - // default to generic name if column name doesn't exist - for (int i = 0; i < numColumns; ++i) { - headers_stream << fmt::format(" Column {}\n", i); - } - } - - os << fmt::format(HTML_TEMPLATE, headers_stream.str(), body_stream.str(), numColumns); - } - Schema DataSet::schema() const { if(!_operator) return Schema::UNKNOWN; diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 4761ac7f0..ede482d9c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -79,7 +79,6 @@ namespace tuplex { py::object collect(); py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); - std::string showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit); // DataFrame like operations PythonDataSet mapColumn(const std::string& column, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index ab239a1a2..6b3683853 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -41,7 +41,6 @@ PYMODULE { py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) - .def("showHTMLPreview", &tuplex::PythonDataSet::showHTMLPreview) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) .def("map", &tuplex::PythonDataSet::map) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 5382ad24d..ec972a899 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -884,55 +884,6 @@ namespace tuplex { } } - std::string PythonDataSet::showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit) { - // make sure a dataset is wrapped - assert(this->_dataset); - - // is callee error dataset? 
-        if (this->_dataset->isError()) {
-            auto errset = dynamic_cast<ErrorDataSet*>(this->_dataset);
-            assert(errset);
-            return "Error: " + errset->getError();
-        } else {
-            // release GIL & hand over everything to Tuplex
-            assert(PyGILState_Check()); // make sure this thread holds the GIL!
-            python::unlockGIL();
-
-            std::stringstream ss;
-            std::string err_message;
-
-            size_t castedTopLimit = 0;
-            if (topLimit < 0) {
-                castedTopLimit = std::numeric_limits<size_t>::max();
-            }
-
-            size_t castedBottomLimit = 0;
-            if (bottomLimit < 0) {
-                castedBottomLimit = std::numeric_limits<size_t>::max();
-            }
-
-            try {
-                this->_dataset->showHTMLPreview(castedTopLimit, castedBottomLimit, ss);
-            } catch (const std::exception &e) {
-                err_message = e.what();
-                Logger::instance().defaultLogger().error(err_message);
-            } catch (...) {
-                err_message = "unknown C++ exception occurred, please change type.";
-                Logger::instance().defaultLogger().error(err_message);
-            }
-
-            // reacquire GIL
-            python::lockGIL();
-            Logger::instance().flushToPython();
-
-            if (!ss.str().empty() && err_message.empty()) {
-                return ss.str();
-            } else {
-                return "Error occurred: " + err_message;
-            }
-        }
-    }
-
     PyObject* PythonDataSet::anyToCPythonWithPyObjects(ResultSet* rs, size_t maxRowCount) {
         assert(rs);
 
From 993937d33918ca7e7e3036779ee8d563ff196a89 Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Wed, 20 Apr 2022 00:15:03 -0400
Subject: [PATCH 43/56] Separate out partition utils

---
 tuplex/core/include/PartitionUtils.h        |  46 +++++++
 tuplex/core/include/ee/local/LocalBackend.h |  22 ----
 tuplex/core/src/PartitionUtils.cc           | 138 ++++++++++++++++++++
 tuplex/core/src/ee/local/LocalBackend.cc    | 125 +-----------------
 tuplex/python/tuplex/utils/table_format.py  |   2 +-
 tuplex/test/core/TakeTest.cc                |   2 +-
 6 files changed, 187 insertions(+), 148 deletions(-)
 create mode 100644 tuplex/core/include/PartitionUtils.h
 create mode 100644 tuplex/core/src/PartitionUtils.cc

diff --git a/tuplex/core/include/PartitionUtils.h b/tuplex/core/include/PartitionUtils.h
new file mode 100644
index 000000000..d247edcfc
--- /dev/null
+++ b/tuplex/core/include/PartitionUtils.h
@@ -0,0 +1,46 @@
+//--------------------------------------------------------------------------------------------------------------------//
+//                                      Tuplex: Blazing Fast Python Data Science                                       //
+//                                                                                                                      //
+//  (c) 2017 - 2021, Tuplex team                                                                                        //
+//  Created by March Boonyapaluk first on 4/19/2021                                                                    //
+//  License: Apache 2.0                                                                                                 //
+//--------------------------------------------------------------------------------------------------------------------//
+
+#ifndef TUPLEX_PARTITIONUTILS_H
+#define TUPLEX_PARTITIONUTILS_H
+
+#include
+#include
+#include
+
+namespace tuplex {
+    /*!
+     * Trim a list of partitions so that it includes up to the first n rows and the last m rows;
+     * if n + m > number of rows in the input partitions, the partitions will remain unchanged
+     * @param partitions [in,out] the list of partitions to trim
+     * @param topLimit n, the number of top rows to include
+     * @param bottomLimit m, the number of bottom rows to include
+     * @param tstage pointer to transform stage, might be used to generate a new partition
+     * @param exec pointer to executor, might be used to allocate a new partition
+     */
+    void trimPartitionsToLimit(std::vector<Partition*> &partitions, size_t topLimit, size_t bottomLimit,
+                               TransformStage *tstage, Executor *exec);
+
+    /*!
+ * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed + * @param p_in the input partition + * @param numToSkip number of rows to remove from the new partition + * @param tstage pointer to transform stage, used to generate new partition + * @param exec pointer to executor, used to allocate new partition + * @return the new partition + */ + Partition *newPartitionWithSkipRows(Partition *p_in, + size_t numToSkip, + TransformStage *tstage, + Executor *exec); + +} + +#endif //TUPLEX_PARTITIONUTILS_H diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 3d73a5d9f..7f42ff1cb 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -185,28 +185,6 @@ namespace tuplex { * @return */ extern URI outputURI(const UDF& udf, const URI& baseURI, int64_t partNo, FileFormat fmt); - - /*! - * Trim list of partitions so that it includes up to the first n rows and the last m rows - * if n + m > number of rows in input partitions, the partitions will remain unchanged - * @param partitions [in,out] the list of partitions to trim - * @param topLimit n, the number of top rows to include - * @param bottomLimit m, the number of bottom rows to include - * @param tstage pointer to transform stage, might be used to generate new partition - * @param exec pointer to executor, might be used to allocate new partition - */ - extern void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage *tstage, Executor *exec); - - /*! - * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed - * @param p_in the input partition - * @param numToSkip number of rows to remove from the new partition - * @param tstage pointer to transform stage, used to generate new partition - * @param exec pointer to executor, used to allocate new partition - * @return the new partition - */ - extern Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec); } #endif //TUPLEX_LOCALBACKEND_H \ No newline at end of file diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc new file mode 100644 index 000000000..745332c93 --- /dev/null +++ b/tuplex/core/src/PartitionUtils.cc @@ -0,0 +1,138 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by March Boonyapaluk first on 4/19/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include "PartitionUtils.h" + +namespace tuplex { + + void trimPartitionsToLimit(std::vector &partitions, + size_t topLimit, + size_t bottomLimit, + TransformStage* tstage, + Executor *exec) { + std::vector limitedPartitions, limitedTailPartitions; + + // check top output limit, adjust partitions if necessary + size_t numTopOutputRows = 0; + Partition *lastTopPart = nullptr; + size_t clippedTop = 0; + for (auto partition: partitions) { + numTopOutputRows += partition->getNumRows(); + lastTopPart = partition; + if (numTopOutputRows >= topLimit) { + // clip last partition & leave loop + clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); + assert(clippedTop <= 
partition->getNumRows()); + break; + } else if (partition == partitions.back()) { + // last partition, mark full row, but don't put to output set yet to avoid double put + clippedTop = partition->getNumRows(); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } + } + + // check the bottom output limit, adjust partitions if necessary + size_t numBottomOutputRows = 0; + size_t clippedBottom = 0; + for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { + auto partition = *it; + numBottomOutputRows += partition->getNumRows(); + + if (partition == lastTopPart) { + // the bottom and the top partitions are overlapping + clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows()); + if (clippedTop + clippedBottom >= partition->getNumRows()) { + // if top and bottom range intersect, use full partitions + clippedTop = partition->getNumRows(); + clippedBottom = 0; + } + break; + } else if (numBottomOutputRows >= bottomLimit) { + // clip last partition & leave loop + auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + if (clipped > 0) { + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, + exec); + assert(newPart->getNumRows() == clipped); + limitedTailPartitions.push_back(newPart); + } + partition->invalidate(); + break; + } else { + // put full partition to output set + limitedTailPartitions.push_back(partition); + } + } + + // push the middle partition + if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { + assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); + + // split into two partitions with both top and bottom are in the same partition + Partition *lastBottomPart = nullptr; + + if (clippedBottom != 0) { + lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, + tstage, exec); + } + + if (clippedTop != 0) { + lastTopPart->setNumRows(clippedTop); + limitedPartitions.push_back(lastTopPart); + } else { + lastTopPart->invalidate(); + } + + if (lastBottomPart != nullptr) { + limitedPartitions.push_back(lastBottomPart); + } + } + + // merge the head and tail partitions + partitions.clear(); + partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); + partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); + } + + Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { + auto ptr = p_in->lockRaw(); + auto num_rows = *((int64_t *) ptr); + assert(numToSkip < num_rows); + + ptr += sizeof(int64_t); + size_t numBytesToSkip = 0; + + Deserializer ds(tstage->outputSchema()); + for (unsigned i = 0; i < numToSkip; ++i) { + Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); + ptr += r.serializedLength(); + numBytesToSkip += r.serializedLength(); + } + + Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); + assert(p_out->capacity() >= p_in->size() - numBytesToSkip); + + auto ptr_out = p_out->lockRaw(); + *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; + ptr_out += sizeof(int64_t); + memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); + p_out->unlock(); + + p_in->unlock(); + + return p_out; + } +} // namespace tuplex \ No newline at end of file 
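For orientation before the same code is deleted from LocalBackend.cc below, here is a tiny pure-Python model of what trimPartitionsToLimit is specified to do. The function name and the list-of-lists representation are illustrative only; the real implementation clips and reallocates Partition buffers in place:

    def trim_to_limit_model(partitions, top_limit, bottom_limit):
        # partitions: list of lists of rows; returns the surviving row layout
        rows = [row for part in partitions for row in part]
        if top_limit + bottom_limit >= len(rows):
            return partitions  # limits cover all rows: unchanged, per the doc comment
        head = rows[:top_limit]
        tail = rows[len(rows) - bottom_limit:] if bottom_limit > 0 else []
        return [part for part in (head, tail) if part]

    # 30 rows across 3 partitions, keep first 4 and last 3 -> 7 rows survive
    parts = [list(range(10)), list(range(10, 20)), list(range(20, 30))]
    assert sum(len(p) for p in trim_to_limit_model(parts, 4, 3)) == 7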
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 351d55b88..676a4e3b3 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include "PartitionUtils.h" namespace tuplex { @@ -2131,127 +2131,4 @@ namespace tuplex { tstage->setFileResult(ecounts); } - void trimPartitionsToLimit(std::vector &partitions, - size_t topLimit, - size_t bottomLimit, - TransformStage* tstage, - Executor *exec) { - std::vector limitedPartitions, limitedTailPartitions; - - // check top output limit, adjust partitions if necessary - size_t numTopOutputRows = 0; - Partition *lastTopPart = nullptr; - size_t clippedTop = 0; - for (auto partition: partitions) { - numTopOutputRows += partition->getNumRows(); - lastTopPart = partition; - if (numTopOutputRows >= topLimit) { - // clip last partition & leave loop - clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); - assert(clippedTop <= partition->getNumRows()); - break; - } else if (partition == partitions.back()) { - // last partition, mark full row, but don't put to output set yet to avoid double put - clippedTop = partition->getNumRows(); - break; - } else { - // put full partition to output set - limitedPartitions.push_back(partition); - } - } - - // check the bottom output limit, adjust partitions if necessary - size_t numBottomOutputRows = 0; - size_t clippedBottom = 0; - for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { - auto partition = *it; - numBottomOutputRows += partition->getNumRows(); - - if (partition == lastTopPart) { - // the bottom and the top partitions are overlapping - clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows()); - if (clippedTop + clippedBottom >= partition->getNumRows()) { - // if top and bottom range intersect, use full partitions - clippedTop = partition->getNumRows(); - clippedBottom = 0; - } - break; - } else if (numBottomOutputRows >= bottomLimit) { - // clip last partition & leave loop - auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - if (clipped > 0) { - Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, - exec); - assert(newPart->getNumRows() == clipped); - limitedTailPartitions.push_back(newPart); - } - partition->invalidate(); - break; - } else { - // put full partition to output set - limitedTailPartitions.push_back(partition); - } - } - - // push the middle partition - if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { - assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); - - // split into two partitions with both top and bottom are in the same partition - Partition *lastBottomPart = nullptr; - - if (clippedBottom != 0) { - lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, - tstage, exec); - } - - if (clippedTop != 0) { - lastTopPart->setNumRows(clippedTop); - limitedPartitions.push_back(lastTopPart); - } else { - lastTopPart->invalidate(); - } - - if (lastBottomPart != nullptr) { - limitedPartitions.push_back(lastBottomPart); - } - } - - // merge the head and tail partitions - partitions.clear(); - partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); - partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); - } - - Partition 
*newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { - auto ptr = p_in->lockRaw(); - auto num_rows = *((int64_t *) ptr); - assert(numToSkip < num_rows); - - ptr += sizeof(int64_t); - size_t numBytesToSkip = 0; - - Deserializer ds(tstage->outputSchema()); - for (unsigned i = 0; i < numToSkip; ++i) { - Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); - ptr += r.serializedLength(); - numBytesToSkip += r.serializedLength(); - } - - Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); - assert(p_out->capacity() >= p_in->size() - numBytesToSkip); - - auto ptr_out = p_out->lockRaw(); - *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; - ptr_out += sizeof(int64_t); - memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); - p_out->unlock(); - - p_in->unlock(); - - return p_out; - } } // namespace tuplex \ No newline at end of file diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py index bb83118b4..ecd333f5a 100644 --- a/tuplex/python/tuplex/utils/table_format.py +++ b/tuplex/python/tuplex/utils/table_format.py @@ -5,7 +5,7 @@ # # # # # (c) 2017 - 2021, Tuplex team # -# Created by Leonhard Spiegelberg first on 4/19/2022 # +# Created by March Boonyapaluk first on 4/19/2022 # # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 4e4a70f53..40b624ca8 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -4,7 +4,7 @@ // // // // // (c) 2017 - 2021, Tuplex team // -// Created by Leonhard Spiegelberg first on 1/1/2021 // +// Created by March Boonyapaluk first on 4/19/2021 // // License: Apache 2.0 // //--------------------------------------------------------------------------------------------------------------------// From 6f528f889221a0553b426d4939572c8fa307a8b2 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 20 Apr 2022 11:45:34 -0400 Subject: [PATCH 44/56] Fix Azure pipeline failing --- tuplex/python/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tuplex/python/CMakeLists.txt b/tuplex/python/CMakeLists.txt index b0b0e54c5..7ccb7057c 100644 --- a/tuplex/python/CMakeLists.txt +++ b/tuplex/python/CMakeLists.txt @@ -104,6 +104,7 @@ FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/__init__.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/tracebacks.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/version.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/globs.py + ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/table_format.py DESTINATION ${PYTHON_DIST_DIR}/tuplex/utils) FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests/test_tuples.py From 816567f9e9c78cc3154286c500318c344802c991 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 20 Apr 2022 12:58:29 -0400 Subject: [PATCH 45/56] Minor Debug in Python lib --- tuplex/python/src/PythonDataSet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index ec972a899..e04fc73fe 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -177,7 +177,7 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; // build python list object from resultset - auto listObj = 
resultSetToCPython(rs.get(), castedTopLimit); + auto listObj = resultSetToCPython(rs.get(), rs->rowCount()); Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); // Logger::instance().flushAll(); From 4b2e2af40ea5c7fa1fcfdafbdc9dbcb992c43b45 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 22 Apr 2022 03:45:52 -0400 Subject: [PATCH 46/56] Remove column counts --- tuplex/python/tuplex/dataset.py | 5 ++--- tuplex/python/tuplex/utils/table_format.py | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index c0b9ef4d0..6a3f9ca71 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -212,7 +212,6 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): "{}" " \n" "\n" - "

-            "<p>{} columns</p>\n"
             "</div>")
 
         assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
@@ -256,7 +255,7 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5):
             body += "      <th>...</th>\n"
             for _ in range(num_columns):
                 body += "        <td>...</td>\n"
-            body += "    </tr>\n"
+            body += "    </tr>\n"
 
             for j in range(i, len(rows)):
                 body += table_format.getHTMLRow(len(rows) - j, rows[j])
@@ -275,7 +274,7 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5):
         for i in range(num_columns):
             headers_str += "      <th>Column {}</th>\n".format(i)
 
-        return HTML_TEMPLATE.format(headers_str, body, num_columns)
+        return HTML_TEMPLATE.format(headers_str, body)
 
 def showStrPreview(self, topLimit=5, bottomLimit=5):
     """ action that generates a physical plan, processes data and return a subset of results as nicely formatted
diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py
index ecd333f5a..7bc8dd9d9 100644
--- a/tuplex/python/tuplex/utils/table_format.py
+++ b/tuplex/python/tuplex/utils/table_format.py
@@ -75,6 +75,4 @@ def generateStrTable(numCols, strTable):
             output_str += "| {:<{width}} ".format(r[i], width=max_col_width[i])
         output_str += "|\n"
 
-    output_str += _getLineDivider(max_col_width) + "{} columns\n".format(numCols)
-
     return output_str

From a935f1e71c064aea9b631755c2ada55fe22f63b2 Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Wed, 11 May 2022 13:18:59 -0400
Subject: [PATCH 47/56] Fix CI not running core tests

---
 tuplex/test/CMakeLists.txt | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tuplex/test/CMakeLists.txt b/tuplex/test/CMakeLists.txt
index 8497ef8b2..ce7200e01 100755
--- a/tuplex/test/CMakeLists.txt
+++ b/tuplex/test/CMakeLists.txt
@@ -1,14 +1,26 @@
 find_package(Threads REQUIRED)
 
 # some tests require the cloudpickle package to be installed, hence check for it here
-find_package(Python3 COMPONENTS Interpreter)
-if(Python3_FOUND)
+find_package(Python3 COMPONENTS Interpreter Development QUIET)
+if (Python3_FOUND)
+    message(STATUS "Found full python3-dev installation")
+    set(Python3_Embed_FOUND TRUE)
+else ()
+    find_package(Python3 COMPONENTS Interpreter REQUIRED)
+    # python3 -c 'import distutils.sysconfig; print(distutils.sysconfig.get_python_lib(plat_specific=False,standard_lib=True))'
+    # try to get module libs at least
+
+    # mark embed lib as not found
+    unset(Python3_Embed_FOUND)
+endif ()
+
+if (Python3_FOUND)
     # check that cloudpickle is installed via import
     set(cmd -c "import cloudpickle")
     execute_process(COMMAND ${Python3_EXECUTABLE} ${cmd} RESULT_VARIABLE ret)
-    if (NOT "${ret}" STREQUAL "0")
+    if (NOT "${ret}" STREQUAL "0")
        message(FATAL_ERROR "Could not find cloudpickle module, please install via pip3 install cloudpickle.")
-    endif()
+    endif ()
 
     # check that numpy is installed too for testing purposes...
set(cmd -c "import numpy") @@ -68,7 +80,7 @@ if(Python3_Embed_FOUND) add_subdirectory(core) add_subdirectory(wrappers) else() - message(STATUS "deactivating C++ tests for core/wrappers because no full Python dev installation found.") + message(WARNING "deactivating C++ tests for core/wrappers because no full Python dev installation found.") endif() # Resources::: From 359ffed15f903ccd400b2231b914abae17d201db Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 13 May 2022 17:22:55 -0400 Subject: [PATCH 48/56] Speed up tests --- tuplex/test/core/TakeTest.cc | 130 ++++++++++++++++++---------- tuplex/test/wrappers/WrapperTest.cc | 41 +++++++++ 2 files changed, 127 insertions(+), 44 deletions(-) diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 40b624ca8..2c7a1e067 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -19,6 +19,58 @@ using namespace std; class TakeTest : public PyTest { }; + +struct TakeTestConfig { + size_t data_size; + size_t top_limit; + size_t bottom_limit; + string partition_sizes; +}; + +/** + * Generate a predefine list of test scenarios composing of different data size and limit values + */ +vector generateTakeTestCfgs() { + std::vector testCfgs; + + // generate exhaustive test for small values + const std::vector small_test_size{1, 10}; + const std::vector small_limit_values{0, 1, 5, 11}; + for (auto data_size: small_test_size) { + for (auto top_limit: small_limit_values) { + for (auto bottom_limit: small_limit_values) { + testCfgs.push_back({data_size, top_limit, bottom_limit, "256B"}); + } + } + } + + // add pre-defined bigger cases + testCfgs.push_back({1000, 600, 0, "256B"}); + testCfgs.push_back({1000, 600, 600, "256B"}); + testCfgs.push_back({1000, 11, 600, "512KB"}); + + testCfgs.push_back({10001, 600, 1001, "256B"}); + testCfgs.push_back({10001, 600, 1001, "512KB"}); + testCfgs.push_back({10001, 600, 1001, "1MB"}); + + testCfgs.push_back({10001, 5000, 4950, "256B"}); + testCfgs.push_back({10001, 5000, 4950, "512KB"}); + testCfgs.push_back({10001, 5000, 4950, "1MB"}); + + return testCfgs; +} + +/** + * partition test into different partition sizes to avoid reinitializing the same context multiple times + */ +map> splitCfgsByPartitionSize(const std::vector &testCfgs) { + map> mp; + for (const auto &cfg: testCfgs) { + mp[cfg.partition_sizes].push_back(cfg); + } + return mp; +} + /** * Randomly generate a vector of rows for testing * @param N the size of vector @@ -167,32 +219,27 @@ TEST_F(TakeTest, takeBothTest) { TEST_F(TakeTest, takeBigTest) { mt19937 data_seed_gen(4242); - const std::vector test_size{1, 10, 100, 1001, 10001}; - const std::vector limit_values{0, 1, 5, 11, 600, 10000}; - const std::vector partition_sizes{"256B", "512KB", "1MB"}; + auto testCfgs = generateTakeTestCfgs(); + auto partitionedCfgs = splitCfgsByPartitionSize(testCfgs); - for (auto &part_size: partition_sizes) { + for (const auto &cfg_pair: partitionedCfgs) { auto opt = testOptions(); - opt.set("tuplex.partitionSize", part_size); + opt.set("tuplex.partitionSize", cfg_pair.first); Context context(opt); - for (auto data_size: test_size) { - for (auto top_limit: limit_values) { - for (auto bottom_limit: limit_values) { - std::cout << "testing with partition size:" << part_size << " data size:" - << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + for (const auto &cfg: cfg_pair.second) { + std::cout << "testing with partition size:" << cfg.partition_sizes << " data size:" + << cfg.data_size << " top:" << 
cfg.top_limit << " bottom:" << cfg.bottom_limit << std::endl; - auto data = generateTestData(data_size, data_seed_gen()); - auto ref_data = generateReferenceData(data, top_limit, bottom_limit); + auto data = generateTestData(cfg.data_size, data_seed_gen()); + auto ref_data = generateReferenceData(data, cfg.top_limit, cfg.bottom_limit); - auto res = context.parallelize(data).take(top_limit, bottom_limit); - ASSERT_EQ(ref_data.size(), res->rowCount()); - for (Row &r: ref_data) { - Row res_row = res->getNextRow(); - if (!(res_row == r)) { - ASSERT_EQ(res_row, r); - } - } + auto res = context.parallelize(data).take(cfg.top_limit, cfg.bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); } } } @@ -220,36 +267,31 @@ vector generateMapFilterReferenceData(const vector &input, size_t topL TEST_F(TakeTest, takeMapFilterTest) { mt19937 data_seed_gen(56120); - const std::vector test_size{1, 10, 100, 1001, 10001}; - const std::vector limit_values{0, 1, 5, 11, 600, 10000}; - const std::vector partition_sizes{"256B", "512KB", "1MB"}; + auto testCfgs = generateTakeTestCfgs(); + auto partitionedCfgs = splitCfgsByPartitionSize(testCfgs); UDF map_udf("lambda a, b, c: ((a + b), c)"); UDF filter_udf("lambda a, b: a % 2 == 0"); - for (auto &part_size: partition_sizes) { + for (const auto &cfg_pair: partitionedCfgs) { auto opt = testOptions(); - opt.set("tuplex.partitionSize", part_size); + opt.set("tuplex.partitionSize", cfg_pair.first); Context context(opt); - for (auto data_size: test_size) { - for (auto top_limit: limit_values) { - for (auto bottom_limit: limit_values) { - std::cout << "testing with partition size:" << part_size << " data size:" - << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + for (const auto &cfg: cfg_pair.second) { + std::cout << "testing with partition size:" << cfg.partition_sizes << " data size:" + << cfg.data_size << " top:" << cfg.top_limit << " bottom:" << cfg.bottom_limit << std::endl; - auto data = generateTestData(data_size, data_seed_gen()); - auto ref_data = generateMapFilterReferenceData(data, top_limit, bottom_limit); + auto data = generateTestData(cfg.data_size, data_seed_gen()); + auto ref_data = generateMapFilterReferenceData(data, cfg.top_limit, cfg.bottom_limit); - auto ds = context.parallelize(data).map(map_udf).filter(filter_udf); - auto res = ds.take(top_limit, bottom_limit); - ASSERT_EQ(ref_data.size(), res->rowCount()); - for (Row &r: ref_data) { - Row res_row = res->getNextRow(); - if (!(res_row == r)) { - ASSERT_EQ(res_row, r); - } - } + auto ds = context.parallelize(data).map(map_udf).filter(filter_udf); + auto res = ds.take(cfg.top_limit, cfg.bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); } } } @@ -259,7 +301,7 @@ TEST_F(TakeTest, takeMapFilterTest) { TEST_F(TakeTest, collectIdentityTest) { mt19937 data_seed_gen(123454); - const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector test_size{1, 10, 1000, 10001}; const std::vector partition_sizes{"256B", "512KB", "1MB"}; for (auto &part_size: partition_sizes) { @@ -282,8 +324,8 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 1001, 50001}; - const std::vector limit_values{0, 1, 600, 10000}; + const std::vector test_size{1, 1001, 10001}; + const std::vector 
limit_values{0, 1, 600, 5000}; const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index ede9dd82d..cec703086 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -2521,6 +2521,47 @@ TEST_F(WrapperTest, PartitionRelease) { } +TEST_F(WrapperTest, ResultWithLimitMerge) { + using namespace tuplex; + + PythonContext c("c", "", testOptions()); + + PyObject *listObj = PyList_New(4); + PyObject *tupleObj1 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); + PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); + + PyObject *tupleObj2 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); + PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); + + + PyObject *tupleObj3 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); + PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); + + + PyObject *tupleObj4 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); + PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); + + PyList_SetItem(listObj, 0, tupleObj1); + PyList_SetItem(listObj, 1, tupleObj2); + PyList_SetItem(listObj, 2, tupleObj3); + PyList_SetItem(listObj, 3, tupleObj4); + + { + auto list = py::reinterpret_borrow(listObj); + auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); + auto resObj = res.ptr(); + + ASSERT_TRUE(PyList_Check(resObj)); + ASSERT_EQ(PyList_GET_SIZE(resObj), 1); + + PyObject_Print(resObj, stdout, 0); + } +} + //// debug any python module... ///** Takes a path and adds it to sys.paths by calling PyRun_SimpleString. // * This does rather laborious C string concatenation so that it will work in From d36e4b2c78be6dd8bb9e490c3ce1a7ac18dad77b Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 18 May 2022 13:13:14 -0400 Subject: [PATCH 49/56] Disable the limit merge test --- tuplex/test/wrappers/WrapperTest.cc | 82 ++++++++++++++--------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index cec703086..97fd2a3f6 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -2520,47 +2520,47 @@ TEST_F(WrapperTest, PartitionRelease) { } - -TEST_F(WrapperTest, ResultWithLimitMerge) { - using namespace tuplex; - - PythonContext c("c", "", testOptions()); - - PyObject *listObj = PyList_New(4); - PyObject *tupleObj1 = PyTuple_New(2); - PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); - PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); - - PyObject *tupleObj2 = PyTuple_New(2); - PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); - PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); - - - PyObject *tupleObj3 = PyTuple_New(2); - PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); - PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); - - - PyObject *tupleObj4 = PyTuple_New(2); - PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); - PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); - - PyList_SetItem(listObj, 0, tupleObj1); - PyList_SetItem(listObj, 1, tupleObj2); - PyList_SetItem(listObj, 2, tupleObj3); - PyList_SetItem(listObj, 3, tupleObj4); - - { - auto list = py::reinterpret_borrow(listObj); - auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); - auto resObj = res.ptr(); - - 
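+        // filter keeps (2, "b"), (3, 42) and (4, "d"); take(1, 0) must then
+        // surface only the first surviving row, hence exactly one list element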
ASSERT_TRUE(PyList_Check(resObj)); - ASSERT_EQ(PyList_GET_SIZE(resObj), 1); - - PyObject_Print(resObj, stdout, 0); - } -} +// TODO: reenable this once the issue is fixed +//TEST_F(WrapperTest, ResultWithLimitMerge) { +// using namespace tuplex; +// +// PythonContext c("c", "", testOptions()); +// +// PyObject *listObj = PyList_New(4); +// PyObject *tupleObj1 = PyTuple_New(2); +// PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); +// PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); +// +// PyObject *tupleObj2 = PyTuple_New(2); +// PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); +// PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); +// +// +// PyObject *tupleObj3 = PyTuple_New(2); +// PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); +// PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); +// +// +// PyObject *tupleObj4 = PyTuple_New(2); +// PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); +// PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); +// +// PyList_SetItem(listObj, 0, tupleObj1); +// PyList_SetItem(listObj, 1, tupleObj2); +// PyList_SetItem(listObj, 2, tupleObj3); +// PyList_SetItem(listObj, 3, tupleObj4); +// +// { +// auto list = py::reinterpret_borrow(listObj); +// auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); +// auto resObj = res.ptr(); +// +// ASSERT_TRUE(PyList_Check(resObj)); +// ASSERT_EQ(PyList_GET_SIZE(resObj), 1); +// +// PyObject_Print(resObj, stdout, 0); +// } +//} //// debug any python module... ///** Takes a path and adds it to sys.paths by calling PyRun_SimpleString. From 2ed45d5b2d62f04e538bcd22ef7bec3aa64fb362 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 18 May 2022 14:18:58 -0400 Subject: [PATCH 50/56] Fix the wrapper test --- tuplex/core/src/physical/ResultSet.cc | 2 +- tuplex/test/wrappers/WrapperTest.cc | 83 +++++++++++++-------------- 2 files changed, 42 insertions(+), 43 deletions(-) diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0eb6d95ad..977c0e188 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -254,7 +254,7 @@ namespace tuplex { for(const auto& partition : _partitions) { count += partition->getNumRows(); } - return count + _pyobjects.size(); + return std::min(count + _pyobjects.size(), _maxRows); } void ResultSet::removeFirstPartition() { diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index 97fd2a3f6..c615c53aa 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -2511,7 +2511,7 @@ TEST_F(WrapperTest, PartitionRelease) { cols_to_select = PyList_New(1); PyList_SET_ITEM(cols_to_select, 0, python::PyString_FromString("Incident Zip")); - ctx2.csv(service_path,py::none(), true, false, "", "\"", + ctx2.csv(service_path, py::none(), true, false, "", "\"", py::none(), py::reinterpret_steal(type_dict)) .mapColumn("Incident Zip", fix_zip_codes_c, "") .selectColumns(py::reinterpret_steal(cols_to_select)) @@ -2520,47 +2520,46 @@ TEST_F(WrapperTest, PartitionRelease) { } -// TODO: reenable this once the issue is fixed -//TEST_F(WrapperTest, ResultWithLimitMerge) { -// using namespace tuplex; -// -// PythonContext c("c", "", testOptions()); -// -// PyObject *listObj = PyList_New(4); -// PyObject *tupleObj1 = PyTuple_New(2); -// PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); -// PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); -// -// PyObject *tupleObj2 = PyTuple_New(2); 
-// PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); -// PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); -// -// -// PyObject *tupleObj3 = PyTuple_New(2); -// PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); -// PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); -// -// -// PyObject *tupleObj4 = PyTuple_New(2); -// PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); -// PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); -// -// PyList_SetItem(listObj, 0, tupleObj1); -// PyList_SetItem(listObj, 1, tupleObj2); -// PyList_SetItem(listObj, 2, tupleObj3); -// PyList_SetItem(listObj, 3, tupleObj4); -// -// { -// auto list = py::reinterpret_borrow(listObj); -// auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); -// auto resObj = res.ptr(); -// -// ASSERT_TRUE(PyList_Check(resObj)); -// ASSERT_EQ(PyList_GET_SIZE(resObj), 1); -// -// PyObject_Print(resObj, stdout, 0); -// } -//} +TEST_F(WrapperTest, ResultWithLimitMerge) { + using namespace tuplex; + + PythonContext c("c", "", testOptions()); + + PyObject *listObj = PyList_New(4); + PyObject *tupleObj1 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); + PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); + + PyObject *tupleObj2 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); + PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); + + + PyObject *tupleObj3 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); + PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); + + + PyObject *tupleObj4 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); + PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); + + PyList_SetItem(listObj, 0, tupleObj1); + PyList_SetItem(listObj, 1, tupleObj2); + PyList_SetItem(listObj, 2, tupleObj3); + PyList_SetItem(listObj, 3, tupleObj4); + + { + auto list = py::reinterpret_borrow(listObj); + auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); + auto resObj = res.ptr(); + + ASSERT_TRUE(PyList_Check(resObj)); + ASSERT_EQ(PyList_GET_SIZE(resObj), 1); + + PyObject_Print(resObj, stdout, 0); + } +} //// debug any python module... ///** Takes a path and adds it to sys.paths by calling PyRun_SimpleString. From 053081984562e2ad5d4e954e93a08833744b3a23 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 19 May 2022 22:47:59 -0400 Subject: [PATCH 51/56] Fix Typo (and rerun CI) --- tuplex/core/src/DataSet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index d54edb567..8e618d012 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -66,7 +66,7 @@ namespace tuplex { } std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { - auto rs = take(numElements, false, os); + auto rs = take(numElements, 0, os); Timer timer; #warning "limiting should make this hack irrelevant..." 
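Patch 50's one-line ResultSet::rowCount() change above is easy to gloss over: once take() attaches a limit, the buffered partitions may still hold more rows than the caller was promised, so the count has to be clamped. A pure-Python model of the fixed behavior (the function name is illustrative; the expression mirrors the C++ std::min):

    def row_count_model(partition_row_counts, num_pyobjects, max_rows):
        # mirrors: std::min(count + _pyobjects.size(), _maxRows)
        return min(sum(partition_row_counts) + num_pyobjects, max_rows)

    # four buffered rows, but take(1, 0) set a limit of one — the re-enabled
    # ResultWithLimitMerge test asserts exactly this via PyList_GET_SIZE
    assert row_count_model([4], 0, 1) == 1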
From 2099d7a86a5dd5a13e4445a6e5226802634ca68b Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 20 May 2022 00:02:47 -0400 Subject: [PATCH 52/56] Add logging after load and transform task --- tuplex/core/src/ee/local/LocalBackend.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 676a4e3b3..da9d77d43 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -982,6 +982,13 @@ namespace tuplex { } auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); + + { + std::stringstream ss; + ss<<"[Transform Stage] Stage "<number()<<" starting "< Date: Fri, 20 May 2022 00:25:32 -0400 Subject: [PATCH 53/56] Fix missing completed work issue --- tuplex/core/src/Executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index acfdd0aa6..618b01345 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -103,8 +103,6 @@ namespace tuplex { // save which thread executed this task task->setID(std::this_thread::get_id()); - _numPendingTasks.fetch_add(-1, std::memory_order_release); - // add task to done list TRACE_LOCK("completedTasks"); _completedTasksMutex.lock(); @@ -113,6 +111,8 @@ namespace tuplex { _numCompletedTasks.fetch_add(1, std::memory_order_release); TRACE_UNLOCK("completedTasks"); + _numPendingTasks.fetch_add(-1, std::memory_order_release); + return true; } From bfb56a3a680167f8f6f9f150622db333f095d5a1 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 20 May 2022 10:41:24 -0400 Subject: [PATCH 54/56] Resolve merge conflict --- tuplex/core/src/PartitionUtils.cc | 3 ++- tuplex/core/src/physical/ResultSet.cc | 6 +++--- tuplex/core/src/physical/TransformStage.cc | 4 ---- tuplex/test/wrappers/WrapperTest.cc | 9 ++++++++- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc index 745332c93..52f1ffab7 100644 --- a/tuplex/core/src/PartitionUtils.cc +++ b/tuplex/core/src/PartitionUtils.cc @@ -125,10 +125,11 @@ namespace tuplex { tstage->context().id()); assert(p_out->capacity() >= p_in->size() - numBytesToSkip); - auto ptr_out = p_out->lockRaw(); + auto ptr_out = p_out->lockWriteRaw(); *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); + p_out->setNumRows(p_in->getNumRows() - numToSkip); p_out->unlock(); p_in->unlock(); diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0b72c2d6d..cb9373335 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -210,19 +210,19 @@ namespace tuplex { for (size_t i = 0; i < limit;) { // all exhausted - if(_currentNormalPartitions.empty()) + if (_currentNormalPartitions.empty()) break; // get number of rows in first partition Partition *first = _currentNormalPartitions.front(); auto num_rows = first->getNumRows(); - assert(num_rows >= _curRowCounter); + assert(num_rows >= _curNormalRowCounter); assert(limit >= i); // how many left to retrieve? 
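             // (the smaller of the rows still wanted and the rows left in this partition)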
auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curNormalRowCounter); - if(num_to_retrieve_from_partition <= 0) + if (num_to_retrieve_from_partition <= 0) break; // make sure partition schema matches stored schema diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index e21012edb..bd468d67e 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -146,10 +146,6 @@ namespace tuplex { } // put ALL partitions to result set - _rs = std::make_shared(schema, partitions, - generalCase, partitionToExceptionsMap, interpreterRows, - maxRows); - _rs = std::make_shared(schema, normalPartitions, generalPartitions, fallbackPartitions, partitionGroups, maxRows); } } diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index 263c63a5f..314c6e21c 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -2573,7 +2573,14 @@ TEST_F(WrapperTest, PartitionRelease) { TEST_F(WrapperTest, ResultWithLimitMerge) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + auto ctx_opts = "{\"webui.enable\": false," + " \"driverMemory\": \"8MB\"," + " \"partitionSize\": \"256KB\"," + "\"executorCount\": 0," + "\"tuplex.scratchDir\": \"file://" + scratchDir + "\"," + "\"resolveWithInterpreterOnly\": true}"; + + PythonContext c("c", "", ctx_opts); PyObject *listObj = PyList_New(4); PyObject *tupleObj1 = PyTuple_New(2); From 71e0fe508e7dc0ac26f95895c6a544c6c81b88a1 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 27 May 2022 18:36:49 -0400 Subject: [PATCH 55/56] Resolve merge conflict --- tuplex/core/src/PartitionUtils.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc index 52f1ffab7..7f2aeedfc 100644 --- a/tuplex/core/src/PartitionUtils.cc +++ b/tuplex/core/src/PartitionUtils.cc @@ -15,7 +15,7 @@ namespace tuplex { void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage* tstage, + TransformStage *tstage, Executor *exec) { std::vector limitedPartitions, limitedTailPartitions; @@ -99,6 +99,21 @@ namespace tuplex { } } + if (partitions.size() != limitedPartitions.size() + limitedTailPartitions.size()) { + // partition is changed, we need to change the partition grouping too + std::vector oldGrouping = tstage->partitionGroups(); + std::vector newGrouping; + size_t new_normal_num = limitedPartitions.size() + limitedTailPartitions.size(); + // remove all normal partition, put new one at the front + newGrouping.push_back(PartitionGroup(new_normal_num, 0, 0, 0, 0, 0)); + for (auto gp: oldGrouping) { + gp.numNormalPartitions = 0; + newGrouping.push_back(gp); + } + + tstage->setPartitionGroups(newGrouping); + } + // merge the head and tail partitions partitions.clear(); partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); From ee5ff30a9209b626ded55ea7f3fe0c50c60b6352 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 27 May 2022 18:37:35 -0400 Subject: [PATCH 56/56] update partition grouping when trim partitions --- tuplex/core/src/PartitionUtils.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc index 7f2aeedfc..349f41048 100644 --- a/tuplex/core/src/PartitionUtils.cc +++ b/tuplex/core/src/PartitionUtils.cc @@ -105,7 +105,7 @@ namespace tuplex { 
 std::vector<PartitionGroup> newGrouping;
             size_t new_normal_num = limitedPartitions.size() + limitedTailPartitions.size();
             // remove all normal partition, put new one at the front
-            newGrouping.push_back(PartitionGroup(new_normal_num, 0, 0, 0, 0, 0));
+            newGrouping.emplace_back(new_normal_num, 0, 0, 0, 0, 0);
             for (auto gp: oldGrouping) {
                 gp.numNormalPartitions = 0;
                 newGrouping.push_back(gp);
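A closing note on patch 53 ("Fix missing completed work issue"): the Executor.cc reordering matters because decrementing _numPendingTasks before the task is appended to the completed list opens a window in which a driver polling for "pending == 0" can observe an incomplete completed-task list. A hedged pure-Python rendering of the corrected ordering (class and attribute names are illustrative, not Tuplex API):

    import threading

    class ExecutorModel:
        # models the accounting fixed in patch 53: a finished task must be
        # visible in `completed` before the pending counter drops, so that a
        # waiter spinning on pending == 0 never misses a result
        def __init__(self, num_tasks):
            self._lock = threading.Lock()
            self.pending = num_tasks
            self.completed = []

        def finish(self, task):
            with self._lock:
                self.completed.append(task)  # publish the result first...
                self.pending -= 1            # ...then signal completion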