diff --git a/benchmarks/incremental/.gitignore b/benchmarks/incremental/.gitignore
new file mode 100644
index 000000000..be21f6805
--- /dev/null
+++ b/benchmarks/incremental/.gitignore
@@ -0,0 +1,6 @@
+data/*.csv
+tuplex_output/
+python_code_pipeline_stage_*.py
+transform_stage_*.txt
+results_dirty_zillow@10G/
+tuplex_config.json
\ No newline at end of file
diff --git a/benchmarks/incremental/README.md b/benchmarks/incremental/README.md
new file mode 100644
index 000000000..e98368ccc
--- /dev/null
+++ b/benchmarks/incremental/README.md
@@ -0,0 +1,25 @@
+
+## Incremental Exception Resolution Experiment
+
+In this experiment, we run a sequence of seven pipelines over the dirty Zillow dataset:
+- The first pipeline contains no ignore or resolver operations.
+- The final pipeline contains six unique ignore and resolver operations.
+- Each pipeline in between incrementally adds one more resolver, until all are present in the final pipeline.
+
+We compare the following conditions, for a total of 2 x 2 x 2 = 8 experimental configurations:
+- Plain vs. incremental resolution
+- Single- vs. multi-threaded execution
+- Merge in order vs. merge without order
+
+To get 10GB of input data, replicate the dirty Zillow data 1460x (or use 1500x for simplicity).
+
+### Setup
+To replicate the original data, create the 10G file with the following settings:
+```
+python3 replicate-data.py -s 1460 -o data/zillow_dirty@10G.csv
+```
+Note that the replicated file and the synthetic dataset (see `synthesize-data.py`) have the same number of rows, but the synthetic version is slightly larger.
+
+### Running the benchmark
+Use
+`nohup perflock ./benchmark.sh -hwloc &`
diff --git a/benchmarks/incremental/benchmark-synthetic.sh b/benchmarks/incremental/benchmark-synthetic.sh
new file mode 100755
index 000000000..54217275a
--- /dev/null
+++ b/benchmarks/incremental/benchmark-synthetic.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+
+# Parse HWLOC settings
+HWLOC=""
+if [ $# -ne 0 ] && [ $# -ne 1 ]; then # check number of inputs
+    echo "usage: ./benchmark-synthetic.sh [-hwloc]"
+    exit 1
+fi
+
+if [ $# -eq 1 ]; then # check if hwloc
+    if [ "$1" != "-hwloc" ]; then # check flag
+        echo -e "invalid flag: $1\nusage: ./benchmark-synthetic.sh [-hwloc]"
+        exit 1
+    fi
+    HWLOC="hwloc-bind --cpubind node:1 --membind node:1 --cpubind node:2 --membind node:2"
+fi
+
+# run NUM_RUNS trials over NUM_STEPS exception fractions, with a 4h (14400s) timeout per job
+NUM_RUNS=1
+NUM_STEPS=10
+TIMEOUT=14400
+
+RESDIR='results_synthetic'
+DATA_PATH='/hot/scratch/bgivertz/data/synthetic/synth'
+INCREMENTAL_OUT_PATH='/hot/scratch/bgivertz/output/incremental'
+COMMIT_OUT_PATH='/hot/scratch/bgivertz/output/commit'
+PLAIN_OUT_PATH='/hot/scratch/bgivertz/output/plain'
+
+rm -rf $RESDIR
+rm -rf $INCREMENTAL_OUT_PATH
+rm -rf $PLAIN_OUT_PATH
+rm -rf $COMMIT_OUT_PATH
+
+mkdir -p ${RESDIR}
+
+# create tuplex_config.json
+python3 create_conf.py --opt-pushdown --opt-filter --opt-llvm --executor-count 63 --executor-memory "6G" > tuplex_config.json
+
+echo "running out of order experiments"
+for ((r = 1; r <= NUM_RUNS; r++)); do
+    echo "trial ($r/$NUM_RUNS)"
+
+    echo "running plain (0/$NUM_STEPS)"
+    LOG="${RESDIR}/plain-out-of-order-e0-t$r.txt"
+    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --input-path "$DATA_PATH""0.csv" --output-path $PLAIN_OUT_PATH >$LOG 2>$LOG.stderr
+
+    echo "running incremental (0/$NUM_STEPS)"
+    LOG="${RESDIR}/incremental-out-of-order-e0-t$r.txt"
+    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --incremental-resolution --input-path "$DATA_PATH""0.csv" --output-path $INCREMENTAL_OUT_PATH >$LOG 2>$LOG.stderr
+
+    for ((s = 1; s <= NUM_STEPS; s++)) do
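+        # step s reads synth$s.csv, in which a fraction s/10 of the rows trigger exceptions (see create-synthetic.sh)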
+        echo "running plain ($s/$NUM_STEPS)"
+        LOG="${RESDIR}/plain-out-of-order-e$s-t$r.txt"
+        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --input-path "$DATA_PATH$s.csv" --output-path $PLAIN_OUT_PATH >$LOG 2>$LOG.stderr
+
+        echo "running incremental ($s/$NUM_STEPS)"
+        LOG="${RESDIR}/incremental-out-of-order-e$s-t$r.txt"
+        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --incremental-resolution --input-path "$DATA_PATH$s.csv" --output-path $INCREMENTAL_OUT_PATH >$LOG 2>$LOG.stderr
+    done
+done
+
+echo "running in order experiments"
+for ((r = 1; r <= NUM_RUNS; r++)); do
+    echo "trial ($r/$NUM_RUNS)"
+
+    echo "running plain (0/$NUM_STEPS)"
+    LOG="${RESDIR}/plain-in-order-e0-t$r.txt"
+    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --resolve-in-order --input-path "$DATA_PATH""0.csv" --output-path $PLAIN_OUT_PATH >$LOG 2>$LOG.stderr
+
+    echo "running incremental (0/$NUM_STEPS)"
+    LOG="${RESDIR}/incremental-in-order-e0-t$r.txt"
+    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --resolve-in-order --incremental-resolution --input-path "$DATA_PATH""0.csv" --output-path $INCREMENTAL_OUT_PATH >$LOG 2>$LOG.stderr
+
+    echo "running commit (0/$NUM_STEPS)"
+    LOG="${RESDIR}/commit-in-order-e0-t$r.txt"
+    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --resolve-in-order --incremental-resolution --commit --input-path "$DATA_PATH""0.csv" --output-path $COMMIT_OUT_PATH >$LOG 2>$LOG.stderr
+
+    for ((s = 1; s <= NUM_STEPS; s++)) do
+        echo "running plain ($s/$NUM_STEPS)"
+        LOG="${RESDIR}/plain-in-order-e$s-t$r.txt"
+        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --resolve-in-order --input-path "$DATA_PATH$s.csv" --output-path $PLAIN_OUT_PATH >$LOG 2>$LOG.stderr
+
+        echo "running incremental ($s/$NUM_STEPS)"
+        LOG="${RESDIR}/incremental-in-order-e$s-t$r.txt"
+        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --resolve-in-order --incremental-resolution --input-path "$DATA_PATH$s.csv" --output-path $INCREMENTAL_OUT_PATH >$LOG 2>$LOG.stderr
+
+        echo "running commit ($s/$NUM_STEPS)"
+        LOG="${RESDIR}/commit-in-order-e$s-t$r.txt"
+        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --resolve-in-order --incremental-resolution --commit --input-path "$DATA_PATH$s.csv" --output-path $COMMIT_OUT_PATH >$LOG 2>$LOG.stderr
+    done
+done
+
+
+rm -rf $INCREMENTAL_OUT_PATH
+rm -rf $PLAIN_OUT_PATH
+rm -rf $COMMIT_OUT_PATH
diff --git a/benchmarks/incremental/benchmark.sh b/benchmarks/incremental/benchmark.sh
new file mode 100755
index 000000000..1691a9a6c
--- /dev/null
+++ b/benchmarks/incremental/benchmark.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+
+# Parse HWLOC settings
+HWLOC=""
+if [ $# -ne 0 ] && [ $# -ne 1 ]; then # check number of inputs
+    echo "usage: ./benchmark.sh [-hwloc]"
+    exit 1
+fi
+
+if [ $# -eq 1 ]; then # check if hwloc
+    if [ "$1" != "-hwloc" ]; then # check flag
+        echo -e "invalid flag: $1\nusage: ./benchmark.sh [-hwloc]"
+        exit 1
+    fi
+    HWLOC="hwloc-bind --cpubind node:1 --membind node:1 --cpubind node:2 --membind node:2"
+fi
+
+# use 3 runs and a timeout after 240min/4h per job
+NUM_RUNS=3
+TIMEOUT=14400
+
+RESDIR='results_dirty_zillow@100G'
+DATA_PATH_SSD='/hot/scratch/bgivertz/data/zillow_dirty@100G.csv'
+INCREMENTAL_OUT_PATH_SSD='/hot/scratch/bgivertz/output/incremental'
+INCREMENTAL_COMMIT_OUT_PATH_SSD='/hot/scratch/bgivertz/output/commit'
+PLAIN_OUT_PATH_SSD='/hot/scratch/bgivertz/output/plain'
+
+rm -rf $RESDIR
+rm -rf
$INCREMENTAL_OUT_PATH_SSD +rm -rf $PLAIN_OUT_PATH_SSD +rm -rf $INCREMENTAL_COMMIT_OUT_PATH_SSD + +# does file exist? +if [[ ! -f "$DATA_PATH_SSD" ]]; then + echo "file $DATA_PATH_SSD not found, abort." + exit 1 +fi + +mkdir -p ${RESDIR} + +# create tuplex_config.json +python3 create_conf.py --opt-pushdown --opt-filter --opt-llvm --executor-count 63 --executor-memory "6G" > tuplex_config.json + +echo "running out-of-order ssd experiments" +for ((r = 1; r <= NUM_RUNS; r++)); do + echo "trial ($r/$NUM_RUNS)" + + echo "running plain" + LOG="${RESDIR}/tuplex-plain-out-of-order-ssd-$r.txt" + timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --path $DATA_PATH_SSD --output-path $PLAIN_OUT_PATH_SSD >$LOG 2>$LOG.stderr + + echo "running incremental" + LOG="${RESDIR}/tuplex-incremental-out-of-order-ssd-$r.txt" + timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --incremental-resolution --path $DATA_PATH_SSD --output-path $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr + +# echo "validating results" +# LOG="${RESDIR}/tuplex-compare-out-of-order-ssd-$r.txt" +# timeout $TIMEOUT ${HWLOC} python3 compare_folders.py $PLAIN_OUT_PATH_SSD $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr +done + +echo "running in-order ssd experiments" +for ((r = 1; r <= NUM_RUNS; r++)); do + echo "trial ($r/$NUM_RUNS)" + + echo "running plain" + LOG="${RESDIR}/tuplex-plain-in-order-ssd-$r.txt" + timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --resolve-in-order --path $DATA_PATH_SSD --output-path $PLAIN_OUT_PATH_SSD >$LOG 2>$LOG.stderr + + echo "running incremental" + LOG="${RESDIR}/tuplex-incremental-in-order-ssd-$r.txt" + timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --resolve-in-order --incremental-resolution --path $DATA_PATH_SSD --output-path $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr + + echo "running commit" + LOG="${RESDIR}/tuplex-incremental-in-order-commit-ssd-$r.txt" + timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --resolve-in-order --incremental-resolution --commit --path $DATA_PATH_SSD --output-path $INCREMENTAL_COMMIT_OUT_PATH_SSD >$LOG 2>$LOG.stderr + +# echo "validating results" +# LOG="${RESDIR}/tuplex-compare-in-order-ssd-$r.txt" +# timeout $TIMEOUT ${HWLOC} python3 compare_folders.py --in-order $PLAIN_OUT_PATH_SSD $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr +# +# LOG="${RESDIR}/tuplex-compare-in-order-commit-ssd-$r.txt" +# timeout $TIMEOUT ${HWLOC} python3 compare_folders.py --in-order $INCREMENTAL_COMMIT_OUT_PATH_SSD $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr +done + +echo "graphing results" +python3 graph.py --results-path $RESDIR --num-trials $NUM_RUNS --num-steps 7 + +rm -rf $INCREMENTAL_OUT_PATH_SSD +rm -rf $PLAIN_OUT_PATH_SSD +rm -rf $INCREMENTAL_COMMIT_OUT_PATH_SSD diff --git a/benchmarks/incremental/compare_folders.py b/benchmarks/incremental/compare_folders.py new file mode 100755 index 000000000..f8daddb6c --- /dev/null +++ b/benchmarks/incremental/compare_folders.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# (c) L.Spiegelberg 2021 +# compare the csv output contents of two folders (ignoring order) + +import os +import sys +import argparse +import glob + + +def wc_files(path): + files = sorted(glob.glob(os.path.join(path, '*.csv'))) + + all_lines = [] + num_rows = 0 + header = None + matching_headers = 0 + for f in files: + with open(f, 'r') as fp: + lines = fp.readlines() + if header is None and len(lines) > 0: + header = lines[0] + num_rows += len(lines) + if len(lines) > 0: + if header == lines[0]: + matching_headers += 1 + 
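# every part file repeats the header, so append only this file's data rows
+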
all_lines += lines[1:] + else: + all_lines += lines + + if matching_headers == len(files): + num_rows -= matching_headers + + print('-- counted {} rows in {} files in folder {}'.format(num_rows, len(files), path)) + return num_rows, len(files), all_lines + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("folderA") + parser.add_argument("folderB") + parser.add_argument("--in-order", help='whether to compare in order', action='store_true') + args = parser.parse_args() + + print('== Dirty Zillow experiment validation ==') + + # count lines in each folder + print('-- loading folder contents...') + rowCountA, filesA, rowsA = wc_files(args.folderA) + rowCountB, filesB, rowsB = wc_files(args.folderB) + + if rowCountA != rowCountB: + print('>>> number of rows does not match') + sys.exit(1) + + # sort lines and compare them + if not args.in_order: + print('-- sorting rows from {}'.format(args.folderA)) + rowsA = sorted(rowsA) + print('-- sorting rows from {}'.format(args.folderB)) + rowsB = sorted(rowsB) + + print('-- computing comparison of rows...') + non_matching_indices = [ind for ind, (i, j) in enumerate(zip(rowsA, rowsB)) if i != j] + + if len(non_matching_indices) > 0: + print('>>> rows do not match up, details:') + + for idx in non_matching_indices: + print('{:5d}: {} != {}'.format(idx, rowsA[idx], rowsB[idx])) + sys.exit(1) + + print('>>> contents of folders match.') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/incremental/create-synthetic.sh b/benchmarks/incremental/create-synthetic.sh new file mode 100644 index 000000000..dc862b661 --- /dev/null +++ b/benchmarks/incremental/create-synthetic.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -x + +python3 synthesize-data.py --dataset-size $1 --output-path /hot/scratch/bgivertz/data/synthetic/synth0.csv --exceptions 0 + +for ((i = 1; i <= 9; i++)) do + python3 synthesize-data.py --dataset-size $1 --output-path /hot/scratch/bgivertz/data/synthetic/synth$i.csv --exceptions 0.$i +done + +python3 synthesize-data.py --dataset-size $1 --output-path /hot/scratch/bgivertz/data/synthetic/synth10.csv --exceptions 1 diff --git a/benchmarks/incremental/create_conf.py b/benchmarks/incremental/create_conf.py new file mode 100644 index 000000000..10420e3c8 --- /dev/null +++ b/benchmarks/incremental/create_conf.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# (c) 2020 L.Spiegelberg +# this script creates Tuplex json configuration files for benchmarks + +import json +import argparse + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--executor-memory', default='10G', help='how much memory each thread gets') + parser.add_argument('--executor-count', default=15, help='how many worker threads') + parser.add_argument('--partition-size', default='32MB', help='task size') + parser.add_argument('--runtime-memory', default='64MB', help='how much maximum runtime memory to use') + parser.add_argument('--input-split-size', default='64MB', help='chunk size of input files') + parser.add_argument('--opt-null', help='enable null value optimization', action='store_true') + parser.add_argument('--opt-pushdown', help='enable projection pushdown', action='store_true') + parser.add_argument('--opt-filter', help='enable filter pushdown', action='store_true') + parser.add_argument('--opt-parser', help='generate CSV parser', action='store_true') + parser.add_argument('--opt-llvm', help='run llvm optimizers', action='store_true') + + args = parser.parse_args() + + conf = 
{'webui.enable' : False, + 'executorMemory' : args.executor_memory, + 'executorCount' : args.executor_count, + 'driverMemory' : args.executor_memory, + 'partitionSize' : args.partition_size, + 'runTimeMemory' : args.runtime_memory, + 'inputSplitSize' : args.input_split_size, + 'useLLVMOptimizer' : args.opt_llvm, + 'optimizer.nullValueOptimization' : args.opt_null, + 'csv.selectionPushdown' : args.opt_pushdown, + 'optimizer.generateParser' : args.opt_parser, + 'optimizer.filterPushdown' : args.opt_filter} + + print(json.dumps(conf)) \ No newline at end of file diff --git a/benchmarks/incremental/export_results.py b/benchmarks/incremental/export_results.py new file mode 100644 index 000000000..60fa7f18d --- /dev/null +++ b/benchmarks/incremental/export_results.py @@ -0,0 +1,121 @@ +import argparse +import os +import json +from enum import Enum + +class Mode(Enum): + OUT_OF_ORDER = 1 + IN_ORDER = 2 + COMMIT = 3 + +def validate_experiment(compare_path): + with open(compare_path) as f: + lines = f.read().splitlines() + return ">>> contents of folders match." in lines + +def get_metric(path, metric, step): + with open(path) as f: + lines = f.read().splitlines() + ind = lines.index("EXPERIMENTAL RESULTS") + 2 + line = lines[ind + step] + metrics = json.loads(line) + if metric == 'jobTime': + return metrics[metric] + else: + return metrics["stages"][0][metric] + +def compare_path(trial, mode): + return "tuplex-compare-{}{}-ssd-{}.txt".format('out-of-order' if mode == Mode.OUT_OF_ORDER else 'in-order', + '-commit' if mode == Mode.COMMIT else '', + trial) + + +def experiment_path(trial, incremental, mode): + return "tuplex-{}-{}{}-ssd-{}.txt".format('incremental' if incremental else 'plain', + 'out-of-order' if mode == Mode.OUT_OF_ORDER else 'in-order', + '-commit' if mode == Mode.COMMIT else '', + trial) + +def write_metric_to_file(f, results_path, num_trials, num_steps, mode, metric): + header = "{},".format("Out of Order" if mode == Mode.OUT_OF_ORDER else "In Order") + \ + "," * num_trials + "Plain," + \ + "," * num_trials + "Incremental" + \ + ("," * (num_trials + 1) + "Commit\n" if mode == Mode.IN_ORDER else "\n") + f.write(header) + + header = "Resolvers," + \ + ','.join(["Trial {}".format(i + 1) for i in range(num_trials)]) + \ + ",Average," + \ + ','.join(["Trial {}".format(i + 1) for i in range(num_trials)]) + \ + ",Average" + \ + ("," + ','.join(["Trial {}".format(i + 1) for i in range(num_trials)]) + ",Average\n" if mode == Mode.IN_ORDER else "\n") + f.write(header) + + for step in range(num_steps): + line = f"{step}," + + plain_total = 0 + for trial in range(num_trials): + plain_path = os.path.join(results_path, experiment_path(trial + 1, False, mode)) + plain_time = get_metric(plain_path, metric, step) + + plain_total += plain_time + line += f"{plain_time}," + line += f"{plain_total / num_trials}," + + incremental_total = 0 + for trial in range(num_trials): + incremental_path = os.path.join(results_path, experiment_path(trial + 1, True, mode)) + incremental_time = get_metric(incremental_path, metric, step) + + incremental_total += incremental_time + line += f"{incremental_time}," + line += f"{incremental_total / num_trials}" + + if mode == Mode.IN_ORDER: + line += "," + commit_total = 0 + for trial in range(num_trials): + commit_path = os.path.join(results_path, experiment_path(trial + 1, True, Mode.COMMIT)) + commit_time = get_metric(commit_path, metric, step) + + commit_total += commit_time + line += f"{commit_time}," + line += f"{commit_total / num_trials}\n" + else: + line += "\n" + 
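# flush one CSV row per resolver step: per-trial values followed by the trial average
+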
+ f.write(line) + +def export_experiments(results_path, num_trials, num_steps): + # Validate all experiments + # for i in range(num_trials): + # for mode in Mode: + # validate_path = os.path.join(results_path, compare_path(i + 1, mode)) + # assert validate_experiment(validate_path) + + metrics = ['jobTime', 'fast_path_time_s', 'slow_path_time_s', 'write_output_wall_time_s'] + + file_path = "experiments.csv" + with open(file_path, 'w') as f: + for metric in metrics: + write_metric_to_file(f, results_path, num_trials, num_steps, Mode.OUT_OF_ORDER, metric) + write_metric_to_file(f, results_path, num_trials, num_steps, Mode.IN_ORDER, metric) + +def main(): + parser = argparse.ArgumentParser(description='Parse results of experiment') + parser.add_argument('--results-path', type=str, dest='results_path', default='results_dirty_zillow@10G') + parser.add_argument('--num-trials', type=int, dest='num_trials', default=1,) + parser.add_argument('--num-steps', type=int, dest='num_steps', default=7) + args = parser.parse_args() + + results_path = args.results_path + num_trials = args.num_trials + num_steps = args.num_steps + + assert os.path.isdir(results_path) + + export_experiments(results_path, num_trials, num_steps) + +if __name__ == '__main__': + main() diff --git a/benchmarks/incremental/graph-synthetic.py b/benchmarks/incremental/graph-synthetic.py new file mode 100644 index 000000000..e74f9a0a6 --- /dev/null +++ b/benchmarks/incremental/graph-synthetic.py @@ -0,0 +1,120 @@ +import argparse +import os.path +import json +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches + +PLAIN_COLOR = "#4285F4" +INCREMENTAL_COLOR = '#DB4437' +COMMIT_COLOR = "#F4B400" + +class Experiment: + def __init__(self, results_path, num_trials, num_steps, save_path): + self.results_path = results_path + self.num_trials = num_trials + self.num_steps = num_steps + self.save_path = save_path + + def graph_in_order(self): + plain_results = self.get_results(False, 'plain') + inc_results = self.get_results(False, 'incremental') + commit_results = self.get_results(False, 'commit') + + fig = plt.figure(figsize=(6, 4)) + + plt.plot(plain_results, marker='o', color=PLAIN_COLOR) + plt.plot(inc_results, marker='o', color=INCREMENTAL_COLOR) + plt.plot(commit_results, marker='o', color=COMMIT_COLOR) + # plt.ylim(0, 110) + + plt.ylabel('Total Execution Time (s)') + plt.xlabel('Amount of Exceptions') + labels = ['0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '1.0'] + x = np.arange(len(labels)) + plt.xticks(x, labels) + + plt.title('In Order | Synthetic') + plt.legend(handles=[ + mpatches.Patch(color=PLAIN_COLOR, label='Plain'), + mpatches.Patch(color=INCREMENTAL_COLOR, label='Incremental'), + mpatches.Patch(color=COMMIT_COLOR, label='Commit') + ], loc='lower right') + + fig.savefig(os.path.join(self.save_path, 'in-order-synth.png'), dpi=400, bbox_inches='tight') + + def graph_out_of_order(self): + plain_results = self.get_results(True, 'plain') + inc_results = self.get_results(True, 'incremental') + + fig = plt.figure(figsize=(6, 4)) + + plt.plot(plain_results, marker='o', color=PLAIN_COLOR) + plt.plot(inc_results, marker='o', color=INCREMENTAL_COLOR) + # plt.ylim(0, 110) + + plt.ylabel('Total Execution Time (s)') + plt.xlabel('Amount of Exceptions') + labels = ['0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '1.0'] + x = np.arange(len(labels)) + plt.xticks(x, labels) + + plt.title('Out of Order | Synthetic') + plt.legend(handles=[ + 
mpatches.Patch(color=PLAIN_COLOR, label='Plain'), + mpatches.Patch(color=INCREMENTAL_COLOR, label='Incremental'), + ], loc='upper right') + + fig.savefig(os.path.join(self.save_path, 'out-of-order-synth.png'), dpi=400, bbox_inches='tight') + + def get_path(self, out_of_order, mode, step, trial): + filename = f"{mode}-{'out-of-order' if out_of_order else 'in-order'}-e{step}-t{trial}.txt" + return os.path.join(self.results_path, filename) + + def get_results(self, out_of_order, mode): + results = [] + for step in range(self.num_steps): + step_results = [] + for trial in range(self.num_trials): + path = self.get_path(out_of_order, mode, step, trial + 1) + step_results.append(self.get_metric(path)) + results.append(sum(step_results) / len(step_results)) + return np.array(results) + + def get_metric(self, path): + with open(path, 'r') as fp: + lines = fp.read().splitlines() + ind = lines.index("EXPERIMENTAL RESULTS") + 1 + line = lines[ind] + metrics = json.loads(line) + return metrics['totalRunTime'] + +def main(): + parser = argparse.ArgumentParser(description='Graph results of synthetic experiment') + parser.add_argument('--results-path', type=str, dest='results_path', default='results_synthetic') + parser.add_argument('--num-trials', type=int, dest='num_trials', default=1) + parser.add_argument('--num-steps', type=int, dest='num_steps', default=11) + parser.add_argument('--save-path', type=str, dest='save_path', default='graphs-synthetic') + args = parser.parse_args() + + results_path = args.results_path + num_trials = args.num_trials + num_steps = args.num_steps + save_path = args.save_path + + if not os.path.isdir(save_path): + os.makedirs(save_path) + assert os.path.isdir(results_path) + + params = {'font.family': 'Times', + 'legend.fontsize': 'medium', + 'axes.labelsize': 'medium', + 'axes.titlesize': 'medium'} + plt.rcParams.update(params) + + e = Experiment(results_path, num_trials, num_steps, save_path) + e.graph_out_of_order() + e.graph_in_order() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/benchmarks/incremental/graph.py b/benchmarks/incremental/graph.py new file mode 100644 index 000000000..13ddaefa5 --- /dev/null +++ b/benchmarks/incremental/graph.py @@ -0,0 +1,279 @@ +import matplotlib.pyplot as plt +import numpy as np +import matplotlib.patches as mpatches +from brokenaxes import brokenaxes +from enum import Enum +import argparse +import os +import json + +PLAIN_COLOR = "#4285F4" +INCREMENTAL_COLOR = '#DB4437' +COMMIT_COLOR = "#F4B400" + +class Mode(Enum): + OUT_OF_ORDER = 1 + IN_ORDER = 2 + COMMIT = 3 + +def in_order_total(save_path, plain_times, incremental_times, commit_times): + width = 0.7 + separator = 0.02 + + # labels = ['No\nResolvers', 'Bedroom\nResolve', 'Bedroom\nIgnore', 'Bathroom\nResolve', 'Bathroom\nIgnore', 'Price\nResolve', 'Price\nIgnore'] + labels = ['0', '1', '2', '3', '4', '5', '6'] + x = np.arange(len(labels)) + + fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(6, 4), gridspec_kw={'height_ratios': [1, 4]}) + fig.subplots_adjust(hspace=0.05) + + ax1.bar(x - width/3 - separator, plain_times, width/3, color=PLAIN_COLOR) + ax1.bar(x, incremental_times, width/3, color=INCREMENTAL_COLOR) + ax1.bar(x + width/3 + separator, commit_times, width/3, color=COMMIT_COLOR) + + ax2.bar(x - width/3 - separator, plain_times, width/3, color=PLAIN_COLOR) + ax2.bar(x, incremental_times, width/3, color=INCREMENTAL_COLOR) + ax2.bar(x + width/3 + separator, commit_times, width/3, color=COMMIT_COLOR) + + ax1.set_ylim(164.0, 200.0) + 
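# broken y-axis: the top panel isolates the tall plain-run bars, the bottom panel zooms in on the faster runs
+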
ax2.set_ylim(0.0, 38.0) + + ax1.spines.bottom.set_visible(False) + ax2.spines.top.set_visible(False) + ax1.xaxis.tick_top() + ax1.tick_params(labeltop=False) + ax2.xaxis.tick_bottom() + d = 0.5 + kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12, + linestyle="none", color='k', mec='k', mew=1, clip_on=False) + ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs) + ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs) + + ax1.set_title('In Order') + plt.ylabel('Execution Time (s)') + plt.xlabel('Exception Resolution Step') + fig.legend(handles=[ + mpatches.Patch(color=PLAIN_COLOR, label='Plain'), + mpatches.Patch(color=INCREMENTAL_COLOR, label='Incremental'), + mpatches.Patch(color=COMMIT_COLOR, label='Commit') + ], loc=(0.727, 0.748)) + + fig.savefig(os.path.join(save_path, 'in-order-total.png'), dpi=400, bbox_inches='tight') + +def out_of_order_total(save_path, plain_times, incremental_times): + width = 0.35 + separator = 0.02 + + labels = ['0', '1', '2', '3', '4', '5', '6'] + x = np.arange(len(labels)) + + # Use 6x4 for size, use latex text from paper script + fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(6, 4), gridspec_kw={'height_ratios': [1, 5]}) + fig.subplots_adjust(hspace=0.05) + + ax1.bar(x - width/2 - separator, plain_times, width + separator, color=PLAIN_COLOR) + ax1.bar(x + width/2 + separator, incremental_times, width + separator, color=INCREMENTAL_COLOR) + + ax2.bar(x - width/2 - separator, plain_times, width + separator, color=PLAIN_COLOR) + ax2.bar(x + width/2 + separator, incremental_times, width + separator, color=INCREMENTAL_COLOR) + + ax1.set_ylim(184.0, 200.0) + ax2.set_ylim(0.0, 38.0) + + ax1.set_title('Out of Order') + plt.xlabel('Exception Resolution Step') + plt.ylabel('Execution Time (s)') + ax1.legend(handles=[ + mpatches.Patch(color=PLAIN_COLOR, label='Plain'), + mpatches.Patch(color=INCREMENTAL_COLOR, label='Incremental') + ], loc='upper right') + + ax1.spines.bottom.set_visible(False) + ax2.spines.top.set_visible(False) + ax1.xaxis.tick_top() + ax1.tick_params(labeltop=False) + ax2.xaxis.tick_bottom() + d = 0.5 + kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12, + linestyle="none", color='k', mec='k', mew=1, clip_on=False) + ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs) + ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs) + + fig.savefig(os.path.join(save_path, 'out-of-order-total.png'), dpi=400, bbox_inches='tight') + +def time_breakdown(save_path, title, save_name, fast_path, slow_path, write): + width = 0.6 + + # labels = ['No\nResolvers', 'Bedroom\nResolve', 'Bedroom\nIgnore', 'Bathroom\nResolve', 'Bathroom\nIgnore', 'Price\nResolve', 'Price\nIgnore'] + labels = ['0', '1', '2', '3', '4', '5', '6'] + x = np.arange(len(labels)) + + fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(6, 4), gridspec_kw={'height_ratios': [1, 5]}) + fig.subplots_adjust(hspace=0.05) + + + ax1.bar(x, fast_path, width, color=PLAIN_COLOR) + ax1.bar(x, slow_path, width, bottom=fast_path, color=INCREMENTAL_COLOR) + ax1.bar(x, write, width, bottom=fast_path + slow_path, color=COMMIT_COLOR) + + ax2.bar(x, fast_path, width, color=PLAIN_COLOR) + ax2.bar(x, slow_path, width, bottom=fast_path, color=INCREMENTAL_COLOR) + ax2.bar(x, write, width, bottom=fast_path + slow_path, color=COMMIT_COLOR) + + ax1.set_ylim(184.0, 200.0) + ax2.set_ylim(0.0, 38.0) + + ax1.spines.bottom.set_visible(False) + ax2.spines.top.set_visible(False) + ax1.xaxis.tick_top() + ax1.tick_params(labeltop=False) + ax2.xaxis.tick_bottom() + d = 0.5 + 
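# diagonal cut marks for the axis break (the standard matplotlib broken-axis recipe)
+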
kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12, + linestyle="none", color='k', mec='k', mew=1, clip_on=False) + ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs) + ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs) + + ax1.set_title(title) + plt.ylabel('Execution Time (s)') + plt.xlabel('Exception Resolution Step') + + fig.legend(handles=[ + mpatches.Patch(color=PLAIN_COLOR, label='Fast Path'), + mpatches.Patch(color=INCREMENTAL_COLOR, label='Slow Path'), + mpatches.Patch(color=COMMIT_COLOR, label='Write') + ], loc=(0.745, 0.748)) + + fig.savefig(os.path.join(save_path, save_name), dpi=400, bbox_inches='tight') + +# def out_of_order_total(save_path, plain_times, incremental_times): +# width = 0.35 +# separator = 0.02 +# +# # labels = ['No\nResolvers', 'Bedroom\nResolve', 'Bedroom\nIgnore', 'Bathroom\nResolve', 'Bathroom\nIgnore', 'Price\nResolve', 'Price\nIgnore'] +# labels = ['0', '1', '2', '3', '4', '5', '6'] +# x = np.arange(len(labels)) +# +# fig = plt.figure(figsize=(10, 6)) +# +# plt.bar(x - width/2 - separator, plain_times, width + separator, color=PLAIN_COLOR) +# plt.bar(x + width/2 + separator, incremental_times, width + separator, color=INCREMENTAL_COLOR) +# +# plt.title('Out of Order') +# plt.xticks(x, labels) +# plt.ylabel('Execution Time (s)') +# plt.xlabel('Exception Resolution Step') +# plt.legend(handles=[ +# mpatches.Patch(color=PLAIN_COLOR, label='Plain'), +# mpatches.Patch(color=INCREMENTAL_COLOR, label='Incremental') +# ], loc='upper right') +# +# fig.savefig(os.path.join(save_path, 'out-of-order-total.png'), dpi=400) + +def validate_experiment(compare_path): + with open(compare_path) as f: + lines = f.read().splitlines() + return ">>> contents of folders match." in lines + +def get_metric(path, metric, step): + with open(path) as f: + lines = f.read().splitlines() + ind = lines.index("EXPERIMENTAL RESULTS") + 2 + line = lines[ind + step] + metrics = json.loads(line) + if metric == 'jobTime': + return metrics[metric] + else: + return metrics["stages"][0][metric] + +def compare_path(trial, mode): + return "tuplex-compare-{}{}-ssd-{}.txt".format('out-of-order' if mode == Mode.OUT_OF_ORDER else 'in-order', + '-commit' if mode == Mode.COMMIT else '', + trial) + + +def experiment_path(trial, incremental, mode): + return "tuplex-{}-{}{}-ssd-{}.txt".format('incremental' if incremental else 'plain', + 'out-of-order' if mode == Mode.OUT_OF_ORDER else 'in-order', + '-commit' if mode == Mode.COMMIT else '', + trial) + +def get_average_times(results_path, metric, num_trials, num_steps, incremental, mode): + times = [] + for i in range(num_steps): + total = 0 + for j in range(num_trials): + total += get_metric(os.path.join(results_path, experiment_path(j + 1, incremental, mode)), metric, i) + total /= num_trials + times.append(total) + return np.array(times) + +def main(): + parser = argparse.ArgumentParser(description='Parse results of experiment') + parser.add_argument('--results-path', type=str, dest='results_path', default='results_dirty_zillow@10G') + parser.add_argument('--num-trials', type=int, dest='num_trials', default=1) + parser.add_argument('--num-steps', type=int, dest='num_steps', default=7) + parser.add_argument('--save-path', type=str, dest='save_path', default='graphs') + args = parser.parse_args() + + results_path = args.results_path + num_trials = args.num_trials + num_steps = args.num_steps + save_path = args.save_path + + if not os.path.isdir(save_path): + os.makedirs(save_path) + assert os.path.isdir(results_path) + + params = 
{'font.family': 'Times', + 'legend.fontsize': 'medium', + 'axes.labelsize': 'medium', + 'axes.titlesize': 'medium'} + plt.rcParams.update(params) + + # for i in range(num_trials): + # for mode in Mode: + # validate_path = os.path.join(results_path, compare_path(i + 1, mode)) + # assert validate_experiment(validate_path) + + + # Total Times + plain_times = get_average_times(results_path, 'jobTime', num_trials, num_steps, False, Mode.OUT_OF_ORDER) + inc_times = get_average_times(results_path, 'jobTime', num_trials, num_steps, True, Mode.OUT_OF_ORDER) + out_of_order_total(save_path, plain_times, inc_times) + + plain_times = get_average_times(results_path, 'jobTime', num_trials, num_steps, False, Mode.IN_ORDER) + inc_times = get_average_times(results_path, 'jobTime', num_trials, num_steps, True, Mode.IN_ORDER) + commit_times = get_average_times(results_path, 'jobTime', num_trials, num_steps, True, Mode.COMMIT) + in_order_total(save_path, plain_times, inc_times, commit_times) + + # Time Break Down + plain_fast = get_average_times(results_path, 'fast_path_time_s', num_trials, num_steps, False, Mode.OUT_OF_ORDER) + plain_slow = get_average_times(results_path, 'slow_path_time_s', num_trials, num_steps, False, Mode.OUT_OF_ORDER) + plain_write = get_average_times(results_path, 'write_output_wall_time_s', num_trials, num_steps, False, Mode.OUT_OF_ORDER) + time_breakdown(save_path, 'Out of Order | Plain', 'out-of-order-plain-breakdown.png', plain_fast, plain_slow, plain_write) + + inc_fast = get_average_times(results_path, 'fast_path_time_s', num_trials, num_steps, True, Mode.OUT_OF_ORDER) + inc_slow = get_average_times(results_path, 'slow_path_time_s', num_trials, num_steps, True, Mode.OUT_OF_ORDER) + inc_write = get_average_times(results_path, 'write_output_wall_time_s', num_trials, num_steps, True, Mode.OUT_OF_ORDER) + time_breakdown(save_path, 'Out of Order | Incremental', 'out-of-order-incremental-breakdown.png', inc_fast, inc_slow, inc_write) + + plain_fast = get_average_times(results_path, 'fast_path_time_s', num_trials, num_steps, False, Mode.IN_ORDER) + plain_slow = get_average_times(results_path, 'slow_path_time_s', num_trials, num_steps, False, Mode.IN_ORDER) + plain_write = get_average_times(results_path, 'write_output_wall_time_s', num_trials, num_steps, False, Mode.IN_ORDER) + time_breakdown(save_path, 'In Order | Plain', 'in-order-plain-breakdown.png', plain_fast, plain_slow, plain_write) + + inc_fast = get_average_times(results_path, 'fast_path_time_s', num_trials, num_steps, True, Mode.IN_ORDER) + inc_slow = get_average_times(results_path, 'slow_path_time_s', num_trials, num_steps, True, Mode.IN_ORDER) + inc_write = get_average_times(results_path, 'write_output_wall_time_s', num_trials, num_steps, True, Mode.IN_ORDER) + time_breakdown(save_path, 'In Order | Incremental', 'in-order-incremental-breakdown.png', inc_fast, inc_slow, inc_write) + + commit_fast = get_average_times(results_path, 'fast_path_time_s', num_trials, num_steps, True, Mode.COMMIT) + commit_slow = get_average_times(results_path, 'slow_path_time_s', num_trials, num_steps, True, Mode.COMMIT) + commit_write = get_average_times(results_path, 'write_output_wall_time_s', num_trials, num_steps, True, Mode.COMMIT) + time_breakdown(save_path, 'In Order | Commit', 'in-order-commit-breakdown.png', commit_fast, commit_slow, commit_write) + + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/benchmarks/incremental/piechart.py b/benchmarks/incremental/piechart.py new file mode 100644 index 
000000000..ee9fafc5d --- /dev/null +++ b/benchmarks/incremental/piechart.py @@ -0,0 +1,123 @@ +import matplotlib.pyplot as plt +from matplotlib import gridspec +import math +import matplotlib.patches as mpatches +import numpy as np + +class Graph: + def __init__(self, data): + self.data = data + self.num_cols = len(data[0]) + self.num_rows = len(data) + self.num_grid_cols = 1000 + + def plot(self): + fig = plt.figure(figsize=(6, 4)) + for i in range(self.num_rows): + row = self.data[i] + cur_col = 0 + for j in range(self.num_cols): + col = row[j] + colspan = self.get_num_cols(i, j) + print(colspan) + ax = plt.subplot2grid((self.num_rows, self.num_grid_cols), (i, cur_col), colspan=colspan) + cur_col += colspan + ax.pie(col.get_data()) + plt.show() + + def get_num_cols(self, row, col): + total = 0 + for step in self.data[row]: + total += sum(step.get_data()) + val = int((sum(self.data[row][col].get_data()) / total) * self.num_grid_cols) + if val == 0: + return val + 1 + else: + return val + +class Step: + def __init__(self, fast_path, slow_path, write): + self.fast_path = fast_path + self.slow_path = slow_path + self.write = write + + def get_data(self): + return [self.fast_path, self.slow_path, self.write] + + def get_ratio(self): + return math.sqrt(sum(self.get_data())) + +def main(): + # data = [ + # [Step(190.9079, 0, 2.381094), Step(17.8558, 6.106002, 6.650906), Step(15.79836, 2.799532, 6.771469), Step(16.50858, 2.214784, 6.747705), Step(15.70112, 1.991375, 6.759192), Step(15.06146, 0.7536615, 6.796239), Step(14.98719, 0.8343657, 6.780956)], + # [Step(191.0442, 0, 2.409159), Step(0, 9.969823, 0.06659347), Step(0, 3.194658, 0.001719711), Step(0, 2.255742, 0.00131141), Step(0, 2.273456, 0.001078326), Step(0, 1.590703, 0.0008960344), Step(0, 0.1210075, 0.0004500517)] + # ] + + data = [Step(190.9079, 0, 2.381094), Step(17.8558, 6.106002, 6.650906), Step(15.79836, 2.799532, 6.771469), Step(16.50858, 2.214784, 6.747705), Step(15.70112, 1.991375, 6.759192), Step(15.06146, 0.7536615, 6.796239), Step(14.98719, 0.8343657, 6.780956)] + # data = [Step(191.0442, 0, 2.409159), Step(0, 9.969823, 0.06659347), Step(0, 3.194658, 0.001719711), Step(0, 2.255742, 0.00131141), Step(0, 2.273456, 0.001078326), Step(0, 1.590703, 0.0008960344), Step(0, 0.1210075, 0.0004500517)] + # data = [Step(0, 9.969823, 0.06659347), Step(0, 3.194658, 0.001719711), Step(0, 2.255742, 0.00131141), Step(0, 2.273456, 0.001078326), Step(0, 1.590703, 0.0008960344), Step(0, 0.1210075, 0.0004500517)] + + # graph = Graph(data) + # graph.plot() + # for i in range(num_rows): + # row = data[i] + # for j in range(num_cols): + # col = row[j] + # ax = plt.subplot2grid((num_rows, num_cols), (i, j)) + # ax.pie(col.data()) + # # ax.axis('equal') + # + # plt.show() + + fig, axes = plt.subplots(1, len(data), figsize=(6, 4), sharex=True, gridspec_kw={'width_ratios': [d.get_ratio() for d in data]}) + for i, step in enumerate(data): + axes[i].pie(step.get_data()) + + plt.suptitle('Time breakdown') + labels = ['0', '1', '2', '3', '4', '5', '6'] + plt.xticks(np.arange(len(labels)), labels) + # plt.legend(handles=[ + # # mpatches.Patch(label='Fast Path'), + # # mpatches.Patch(label='Slow Path'), + # # mpatches.Patch(label='Write') + # ]) + + plt.show() + + # p1 = Step(190.9079, 0, 2.381094) + # p2 = Step(17.8558, 6.106002, 6.650906) + + # gs1 = gridspec.GridSpec(1, 2, width_ratios=[p1.ratio(), p2.ratio()]) + + # i1 = Step(191.0442, 0, 2.409159) + # i2 = Step(0, 9.969823, 0.09659347) + + # gs2 = gridspec.GridSpec(1, 2, width_ratios=[i1.ratio(), 
i2.ratio()]) + + # fig = plt.figure(figsize=(6, 4)) + # fig, axes = plt.subplots(1, 2, figsize=(6, 4), gridspec_kw={'width_ratios': [p1.ratio(), p2.ratio()]}) + # ax1 = plt.subplot2grid((2, 100), (0, 0), colspan=86, rowspan=1) + # ax2 = plt.subplot2grid((2, 100), (0, 86), colspan=14, rowspan=1) + # ax3 = plt.subplot2grid((2, 100), (1, 0), colspan=95, rowspan=1) + # ax4 = plt.subplot2grid((2, 100), (1, 95), colspan=5, rowspan=1) + + # axes[0].pie([p1.fast_path, p1.slow_path, p1.write]) + # axes[1].pie([p2.fast_path, p2.slow_path, p2.write]) + + # axes.pie([i1.fast_path, i1.slow_path, i1.write]) + # ax4.pie([i2.fast_path, i2.slow_path, i2.write]) + + # plt.tight_layout() + + +# fig, axes = plt.subplots(rows, cols) + # fig.suptitle('Out of Order Time Breakdown') + # + # for row in range(rows): + # for col in range(cols): + # axes[row, col].pie([10, 20, 70]) + + # plt.show() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/benchmarks/incremental/replicate-data.py b/benchmarks/incremental/replicate-data.py new file mode 100755 index 000000000..705f2e8ce --- /dev/null +++ b/benchmarks/incremental/replicate-data.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +import argparse +import os +from tqdm import tqdm + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Zillow cleaning') + parser.add_argument('-i', '--in', type=str, dest='input_path', default='data/zillow_dirty.csv', + help='path or pattern to zillow data') + parser.add_argument('-o', '--output-path', type=str, dest='output_path', default='data/zillow_dirty@10G.csv', + help='specify path where to save output data files') + parser.add_argument('-s', '--scale-factor', type=int, dest='scale_factor', default=1460, help='how many times to replicate file') + parser.add_argument('--include-header', action='store_true', dest='include_header', help='whether to explicitly include the first line or not when replicating') + args = parser.parse_args() + + assert args.input_path, 'need to set input data path!' + assert args.output_path, 'need to set output data path!' 
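+    # note: the output holds the original rows plus scale_factor replicated copies;
+    # the header line is written only once unless --include-header is set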
+
+    args.scale_factor = int(max(1, args.scale_factor)) # no fractional support yet
+
+    print('>>> reading input file')
+    with open(args.input_path, 'r') as fp:
+        lines = fp.readlines()
+
+    print('>>> replicating data {}x'.format(args.scale_factor))
+    with open(args.output_path, 'w') as fp:
+        # write lines as is
+        fp.writelines(lines)
+
+        if not args.include_header:
+            lines = lines[1:]
+        for n in tqdm(range(args.scale_factor)):
+            fp.writelines(lines)
+
+    print('done.')
\ No newline at end of file
diff --git a/benchmarks/incremental/runsynthetic.py b/benchmarks/incremental/runsynthetic.py
new file mode 100644
index 000000000..9ed1fa2f6
--- /dev/null
+++ b/benchmarks/incremental/runsynthetic.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+# (c) L.Spiegelberg 2021
+# conduct the synthetic-data incremental resolution experiment as described in README.md
+
+import tuplex
+import time
+import sys
+import json
+import os
+import glob
+import argparse
+import math
+import re
+import shutil
+import subprocess
+import random
+
+def synthetic_pipeline(ctx, path, output_path, resolve, commit):
+    ds = ctx.csv(path, header=True)
+    # rows with a == 0 evaluate 1 // 0 and raise a ZeroDivisionError; all other rows pass through
+    ds = ds.withColumn("c", lambda x: 1 // x["a"] if x["a"] == 0 else x["a"])
+    if resolve:
+        ds = ds.resolve(ZeroDivisionError, lambda x: x["a"])
+    ds.tocsv(output_path, commit=commit)
+    return ctx.metrics
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Incremental resolution')
+    parser.add_argument('--input-path', type=str, dest='data_path', default='synth0.csv', help='path or pattern to synthetic data')
+    parser.add_argument('--output-path', type=str, dest='output_path', default='synthetic-output/', help='specify path where to save output data files')
+    parser.add_argument('--use-resolve-step', dest='use_resolve_step', action='store_true')
+    parser.add_argument('--incremental-resolution', dest='incremental_resolution', action="store_true", help="whether to use incremental resolution")
+    parser.add_argument('--commit-mode', dest='commit_mode', action='store_true', help='whether to use commit mode')
+    parser.add_argument('--resolve-in-order', dest='resolve_in_order', action="store_true", help="whether to resolve exceptions in order")
+    parser.add_argument('--clear-cache', dest='clear_cache', action='store_true', help='whether to clear the cache or not')
+    args = parser.parse_args()
+
+    assert args.data_path, 'need to set data path!'
+
+    # config vars
+    path = args.data_path
+    output_path = args.output_path
+
+    if not path:
+        print('found no synthetic data to process, abort.')
+        sys.exit(1)
+
+    print('>>> running {} on {}'.format('tuplex', path))
+
+    # load data
+    tstart = time.time()
+
+    # configuration, make sure to give enough runtime memory to the executors!
+    conf = {"webui.enable" : False,
+            "executorCount" : 16,
+            "executorMemory" : "6G",
+            "driverMemory" : "6G",
+            "partitionSize" : "32MB",
+            "runTimeMemory" : "128MB",
+            "useLLVMOptimizer" : True,
+            "optimizer.nullValueOptimization" : False,
+            "csv.selectionPushdown" : True,
+            "optimizer.generateParser" : False} # bug when using generated parser. Need to fix that.
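+    # if a tuplex_config.json is present (benchmark-synthetic.sh writes one via create_conf.py),
+    # it replaces the defaults above wholesale rather than merging them key by key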
+ + if os.path.exists('tuplex_config.json'): + with open('tuplex_config.json') as fp: + conf = json.load(fp) + + if args.incremental_resolution: + conf["optimizer.incrementalResolution"] = True + else: + conf["optimizer.incrementalResolution"] = False + + if args.resolve_in_order: + conf['optimizer.mergeExceptionsInOrder'] = True + else: + conf['optimizer.mergeExceptionsInOrder'] = False + + # Note: there's a bug in the merge in order mode here -.- + # force to false version + conf["optimizer.generateParser"] = False + conf["tuplex.optimizer.sharedObjectPropagation"] = False + conf["resolveWithInterpreterOnly"] = False + + tstart = time.time() + import tuplex + ctx = tuplex.Context(conf) + + print(json.dumps(ctx.options())) + + startup_time = time.time() - tstart + print('Tuplex startup time: {}'.format(startup_time)) + + shutil.rmtree(output_path, ignore_errors=True) + + if args.clear_cache: + subprocess.run(["clearcache"]) + + tstart = time.time() + metrics = [] + + use_resolve_step = args.use_resolve_step + + if use_resolve_step: + jobstart = time.time() + m = synthetic_pipeline(ctx, path, output_path, False, not args.commit_mode) + m = m.as_dict() + m["jobTime"] = time.time() - jobstart + metrics.append(m) + + jobstart = time.time() + m = synthetic_pipeline(ctx, path, output_path, True, True) + m = m.as_dict() + m["jobTime"] = time.time() - jobstart + metrics.append(m) + else: + jobstart = time.time() + m = synthetic_pipeline(ctx, path, output_path, False, True) + m = m.as_dict() + m['jobTime'] = time.time() - jobstart + metrics.append(m) + + runtime = time.time() - tstart + + print("EXPERIMENTAL RESULTS") + print(json.dumps({"startupTime": startup_time, + "totalRunTime": runtime, + "mergeExceptionsInOrder": conf["optimizer.mergeExceptionsInOrder"], + "incrementalResolution": conf["optimizer.incrementalResolution"]})) + + for metric in metrics: + print(json.dumps(metric)) diff --git a/benchmarks/incremental/runtuplex.py b/benchmarks/incremental/runtuplex.py new file mode 100644 index 000000000..5e94fe67c --- /dev/null +++ b/benchmarks/incremental/runtuplex.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# (c) L.Spiegelberg 2021 +# conduct dirty Zillow data experiment as described in README.md + +import tuplex +import time +import sys +import json +import os +import glob +import argparse +import math +import re +import shutil +import subprocess +import random + +# UDFs for pipeline +def extractBd(x): + val = x['facts and features'] + max_idx = val.find(' bd') + if max_idx < 0: + max_idx = len(val) + s = val[:max_idx] + + # find comma before + split_idx = s.rfind(',') + if split_idx < 0: + split_idx = 0 + else: + split_idx += 2 + r = s[split_idx:] + return int(r) + +def extractBa(x): + val = x['facts and features'] + max_idx = val.find(' ba') + if max_idx < 0: + max_idx = len(val) + s = val[:max_idx] + + # find comma before + split_idx = s.rfind(',') + if split_idx < 0: + split_idx = 0 + else: + split_idx += 2 + r = s[split_idx:] + ba = math.ceil(2.0 * float(r)) / 2.0 + return ba + +def extractSqft(x): + val = x['facts and features'] + max_idx = val.find(' sqft') + if max_idx < 0: + max_idx = len(val) + s = val[:max_idx] + + split_idx = s.rfind('ba ,') + if split_idx < 0: + split_idx = 0 + else: + split_idx += 5 + r = s[split_idx:] + r = r.replace(',', '') + return int(r) + +def extractOffer(x): + offer = x['title'].lower() + if 'sale' in offer: + return 'sale' + if 'rent' in offer: + return 'rent' + if 'sold' in offer: + return 'sold' + if 'foreclose' in offer.lower(): + return 
'foreclosed' + return offer + +def extractType(x): + t = x['title'].lower() + type = 'unknown' + if 'condo' in t or 'apartment' in t: + type = 'condo' + if 'house' in t: + type = 'house' + return type + +def extractPrice(x): + price = x['price'] + p = 0 + if x['offer'] == 'sold': + # price is to be calculated using price/sqft * sqft + val = x['facts and features'] + s = val[val.find('Price/sqft:') + len('Price/sqft:') + 1:] + r = s[s.find('$')+1:s.find(', ') - 1] + price_per_sqft = int(r) + p = price_per_sqft * x['sqft'] + elif x['offer'] == 'rent': + max_idx = price.rfind('/') + p = int(price[1:max_idx].replace(',', '')) + else: + # take price from price column + p = int(price[1:].replace(',', '')) + + return p + +def resolveBd(x): + if 'Studio' in x['facts and features']: + return 1 + raise ValueError + +#compare types and contents +def dirty_zillow_pipeline(ctx, path, output_path, step, commit): + + # Increases write times to highlight differences + + # ds = ctx.csv(path) + # + # ds = ds.withColumn('bedrooms', extractBd) + # if step > 0: + # ds = ds.resolve(ValueError, resolveBd) + # if step > 1: + # ds = ds.ignore(ValueError) + # + # ds = ds.withColumn('bathrooms', extractBa) + # if step > 2: + # ds = ds.resolve(ValueError, resolveBa) + # if step > 3: + # ds = ds.ignore(ValueError) + # + # ds = ds.withColumn('sqft', extractSqft) + # if step > 3: + # ds = ds.ignore(ValueError) + # + # ds = ds.withColumn('offer', extractOffer) + # ds = ds.withColumn('price', extractPrice) + # if step > 4: + # ds = ds.resolve(ValueError, lambda x: int(re.sub('[^0-9.]*', '', x['price']))) + # if step > 5: + # ds = ds.ignore(TypeError) + # ds = ds.ignore(ValueError) + # ds = ds.selectColumns(["address", "bedrooms", "bathrooms", "sqft", "price"]) + +# Original pipeline, most realistic, taken from previous paper to run benchmark on + + # ds = ds.withColumn("bedrooms", extractBd) + # if step > 0: + # ds = ds.resolve(ValueError, resolveBd) + # if step > 1: + # ds = ds.ignore(ValueError) + # ds = ds.withColumn("type", extractType) + # ds = ds.withColumn("zipcode", lambda x: '%05d' % int(x['postal_code'])) + # if step > 2: + # ds = ds.ignore(TypeError) + # ds = ds.mapColumn("city", lambda x: x[0].upper() + x[1:].lower()) + # ds = ds.withColumn("bathrooms", extractBa) + # if step > 3: + # ds = ds.ignore(ValueError) + # ds = ds.withColumn("sqft", extractSqft) + # if step > 4: + # ds = ds.ignore(ValueError) + # ds = ds.withColumn("offer", extractOffer) + # ds = ds.withColumn("price", extractPrice) + # if step > 5: + # ds = ds.resolve(ValueError, lambda x: int(re.sub('[^0-9.]*', '', x['price']))) + # ds = ds.filter(lambda x: 100000 < x['price'] < 2e7 and x['offer'] == 'sale') + # ds = ds.selectColumns(["url", "zipcode", "address", "city", "state", + # "bedrooms", "bathrooms", "sqft", "offer", "type", "price"]) + # ds.tocsv(output_path, commit=commit) + + ds = ctx.csv(path) + ds = ds.withColumn('bedrooms', extractBd) + if step > 0: + ds = ds.resolve(ValueError, resolveBd) + if step > 1: + ds = ds.ignore(ValueError) + ds = ds.filter(lambda x: x['bedrooms'] < 10) + ds = ds.withColumn('type', extractType) + ds = ds.filter(lambda x: x['type'] == 'condo') + ds = ds.withColumn('zipcode', lambda x: '%05d' % int(x['postal_code'])) + if step > 2: + ds = ds.ignore(TypeError) + ds = ds.mapColumn("city", lambda x: x[0].upper() + x[1:].lower()) + ds = ds.withColumn("bathrooms", extractBa) + if step > 3: + ds = ds.ignore(ValueError) + ds = ds.withColumn('sqft', extractSqft) + if step > 4: + ds = ds.ignore(ValueError) + ds = 
ds.withColumn('offer', extractOffer) + ds = ds.withColumn('price', extractPrice) + if step > 5: + ds = ds.resolve(ValueError, lambda x: int(re.sub('[^0-9.]*', '', x['price']))) + ds = ds.filter(lambda x: 100000 < x['price'] < 2e7 and x['offer'] == 'sale') + ds = ds.selectColumns(["url", "zipcode", "address", "city", "state", + "bedrooms", "bathrooms", "sqft", "offer", "type", "price"]) + ds.tocsv(output_path, commit=commit) + return ctx.metrics + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Incremental resolution') + parser.add_argument('--path', type=str, dest='data_path', default='/hot/scratch/bgivertz/data/zillow_dirty.csv', help='path or pattern to zillow data') + parser.add_argument('--output-path', type=str, dest='output_path', default='/hot/scratch/bgivertz/output/', help='specify path where to save output data files') + parser.add_argument('--resolve-in-order', dest='resolve_in_order', action="store_true", help="whether to resolve exceptions in order") + parser.add_argument('--num-steps', dest='num_steps', type=int, default=7) + parser.add_argument('--incremental-resolution', dest='incremental_resolution', action="store_true", help="whether to use incremental resolution") + parser.add_argument('--commit-mode', dest='commit_mode', action='store_true', help='whether to use commit mode') + parser.add_argument('--clear-cache', dest='clear_cache', action='store_true', help='whether to clear the cache or not') + args = parser.parse_args() + + assert args.data_path, 'need to set data path!' + + # config vars + path = args.data_path + output_path = args.output_path + + if not path: + print('found no zillow data to process, abort.') + sys.exit(1) + + print('>>> running {} on {}'.format('tuplex', path)) + + # load data + tstart = time.time() + + # configuration, make sure to give enough runtime memory to the executors! + conf = {"webui.enable" : False, + "executorCount" : 16, + "executorMemory" : "6G", + "driverMemory" : "6G", + "partitionSize" : "32MB", + "runTimeMemory" : "128MB", + "useLLVMOptimizer" : True, + "optimizer.nullValueOptimization" : False, + "csv.selectionPushdown" : True, + "optimizer.generateParser" : False} # bug when using generated parser. Need to fix that. + + if os.path.exists('tuplex_config.json'): + with open('tuplex_config.json') as fp: + conf = json.load(fp) + + if args.incremental_resolution: + conf["optimizer.incrementalResolution"] = True + else: + conf["optimizer.incrementalResolution"] = False + + if args.resolve_in_order: + conf['optimizer.mergeExceptionsInOrder'] = True + else: + conf['optimizer.mergeExceptionsInOrder'] = False + + # Note: there's a bug in the merge in order mode here -.- + # force to false version + conf["optimizer.generateParser"] = False + conf["tuplex.optimizer.sharedObjectPropagation"] = False + + tstart = time.time() + import tuplex + ctx = tuplex.Context(conf) + + print(json.dumps(ctx.options())) + + startup_time = time.time() - tstart + print('Tuplex startup time: {}'.format(startup_time)) + + shutil.rmtree(output_path, ignore_errors=True) + + if args.clear_cache: + subprocess.run(["clearcache"]) + + tstart = time.time() + # decide which pipeline to run based on argparse arg! 
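+    # step k enables the first k resolve()/ignore() operations in dirty_zillow_pipeline;
+    # with --commit-mode, intermediate steps skip the commit and only the final step writes output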
+ num_steps = args.num_steps + metrics = [] + for step in range(num_steps): + print(f'>>> running pipeline with {step} resolver(s) enabled...') + jobstart = time.time() + m = dirty_zillow_pipeline(ctx, path, output_path, step, not args.commit_mode or step == num_steps - 1) + m = m.as_dict() + m["numResolvers"] = step + m["jobTime"] = time.time() - jobstart + metrics.append(m) + + runtime = time.time() - tstart + + print("EXPERIMENTAL RESULTS") + print(json.dumps({"startupTime": startup_time, + "totalRunTime": runtime, + "mergeExceptionsInOrder": conf["optimizer.mergeExceptionsInOrder"], + "incrementalResolution": conf["optimizer.incrementalResolution"]})) + + for metric in metrics: + print(json.dumps(metric)) diff --git a/benchmarks/incremental/synthesize-data.py b/benchmarks/incremental/synthesize-data.py new file mode 100644 index 000000000..7c50d329e --- /dev/null +++ b/benchmarks/incremental/synthesize-data.py @@ -0,0 +1,49 @@ +import argparse +import math +import random +from tqdm import tqdm + +def generate_data(num_rows, row_size, exceptions): + num_exceptions = int(num_rows * exceptions) + exps = set(random.sample(range(num_rows), num_exceptions)) + + data = [] + + padding = 'a' * (row_size - 3) + normal_row = '1,' + padding + "\n" + exp_row = '0,' + padding + "\n" + for i in range(num_rows): + if i in exps: + data.append(exp_row) + else: + data.append(normal_row) + + return data + +def main(): + parser = argparse.ArgumentParser(description='Synthesize data') + parser.add_argument('--row-size', type=int, dest='row_size', default=200, help='number of bytes per row') + parser.add_argument('--exceptions', type=float, dest='exceptions', default=0.25, help='amount of exception rows in dataset') + parser.add_argument('--dataset-size', type=int, dest='dataset_size', default=10, help='number of megabytes in dataset') + parser.add_argument('--output-path', type=str, dest='output_path', default='synthetic.csv', help='path to output the file') + args = parser.parse_args() + + row_size = args.row_size + exceptions = args.exceptions + dataset_size = args.dataset_size * 1000000 + output_path = args.output_path + + num_rows = dataset_size // row_size + num_sample_rows = min(num_rows, 100000) + + data = generate_data(num_sample_rows, row_size, exceptions) + + with open(output_path, 'w') as fp: + header = "a,b\n" + fp.write(header) + + for _ in tqdm(range(math.ceil(num_rows // num_sample_rows))): + fp.writelines(data) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tuplex/core/include/Context.h b/tuplex/core/include/Context.h index b75409533..6f070d2ff 100644 --- a/tuplex/core/include/Context.h +++ b/tuplex/core/include/Context.h @@ -15,6 +15,7 @@ #include #include #include "Partition.h" +#include #include "Row.h" #include "HistoryServerClasses.h" #include @@ -27,6 +28,7 @@ #include #include #include "JobMetrics.h" +#include namespace tuplex { @@ -37,7 +39,8 @@ namespace tuplex { class Executor; class Partition; class IBackend; - class ExceptionInfo; + class PartitionGroup; + class IncrementalCache; class Context { private: @@ -59,24 +62,7 @@ namespace tuplex { // needed because of C++ template issues void addPartition(DataSet* ds, Partition *partition); - void addParallelizeNode(DataSet *ds, const std::vector> &badParallelizeObjects=std::vector>(), const std::vector &numExceptionsInPartition=std::vector()); //! adds a paralellize node to the computation graph - - /*! - * serialize python objects as pickled objects into exception partitions. 
Set the python objects map to - * map all normalPartitions to the exceptions that occured within them. - * @param pythonObjects normal case schema violations and their initial row numbers - * @param numExceptionsInPartition number of exceptions in each normal partition - * @param normalPartitions normal partitions created - * @param opID parallelize operator ID - * @param serializedPythonObjects output vector for partitions - * @param pythonObjectsMap output for mapping - */ - void serializePythonObjects(const std::vector>& pythonObjects, - const std::vector &numExceptionsInPartition, - const std::vector &normalPartitions, - const int64_t opID, - std::vector &serializedPythonObjects, - std::unordered_map &pythonObjectsMap); + void addParallelizeNode(DataSet *ds, const std::vector& fallbackPartitions=std::vector{}, const std::vector& partitionGroups=std::vector{}); //! adds a paralellize node to the computation graph Partition* requestNewPartition(const Schema& schema, const int dataSetID, size_t minBytesRequired); uint8_t* partitionLockRaw(Partition *partition); @@ -93,6 +79,8 @@ namespace tuplex { std::shared_ptr _lastJobMetrics; + std::shared_ptr _incrementalCache; + codegen::CompilePolicy _compilePolicy; codegen::CompilePolicy compilePolicyFromOptions(const ContextOptions& options); @@ -264,6 +252,14 @@ namespace tuplex { return _lastJobMetrics; } + /*! + * gets an IncrementalCache object + * @return ptr to IncrementalCache object + */ + std::shared_ptr getIncrementalCache() const { + return _incrementalCache; + } + /*! * construct a dataset using customly allocated partitions. * @param schema schema of the data within this dataset. @@ -276,7 +272,7 @@ namespace tuplex { * @param numExceptionsInPartition number of schema violations that occured in each of the partitions * @return reference to newly created dataset. */ - DataSet& fromPartitions(const Schema& schema, const std::vector& partitions, const std::vector& columns, const std::vector> &badParallelizeObjects, const std::vector &numExceptionsInPartition); + DataSet& fromPartitions(const Schema& schema, const std::vector& partitions, const std::vector& fallbackPartitions, const std::vector& partitionGroups, const std::vector& columns); }; // needed for template mechanism to work #include diff --git a/tuplex/core/include/ContextOptions.h b/tuplex/core/include/ContextOptions.h index 51912124f..c776b188c 100644 --- a/tuplex/core/include/ContextOptions.h +++ b/tuplex/core/include/ContextOptions.h @@ -53,6 +53,7 @@ namespace tuplex { bool OPT_FILTER_PUSHDOWN() const { return stringToBool(_store.at("tuplex.optimizer.filterPushdown")); } bool OPT_OPERATOR_REORDERING() const { return stringToBool(_store.at("tuplex.optimizer.operatorReordering")); } bool OPT_MERGE_EXCEPTIONS_INORDER() const { return stringToBool(_store.at("tuplex.optimizer.mergeExceptionsInOrder")); } + bool OPT_INCREMENTAL_RESOLUTION() const { return stringToBool(_store.at("tuplex.optimizer.incrementalResolution")); } bool CSV_PARSER_SELECTION_PUSHDOWN() const; //! whether to use selection pushdown in the parser. If false, then full data will be serialized. bool INTERLEAVE_IO() const { return stringToBool(_store.at("tuplex.interleaveIO")); } //! whether to first load, compute, then write or use IO thread to interleave IO work with compute work for faster speeds. 
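Reviewer note on the ContextOptions change: the new `tuplex.optimizer.incrementalResolution` flag follows the store's existing pattern of string-valued settings surfaced through typed getters. A minimal, self-contained sketch of that pattern (the `ToyOptions` class is hypothetical, not Tuplex's actual `ContextOptions`):

```cpp
#include <cassert>
#include <iostream>
#include <string>
#include <unordered_map>

// Toy illustration of the string-keyed option store behind typed getters
// such as OPT_INCREMENTAL_RESOLUTION(); the class itself is hypothetical.
class ToyOptions {
    std::unordered_map<std::string, std::string> _store;

    static bool stringToBool(const std::string& s) {
        return s == "true" || s == "True" || s == "1";
    }
public:
    ToyOptions() {
        // same default this diff registers for the real option
        _store["tuplex.optimizer.incrementalResolution"] = "false";
    }
    void set(const std::string& k, const std::string& v) { _store[k] = v; }

    // analogous to OPT_INCREMENTAL_RESOLUTION(): parse the string on access
    bool incrementalResolution() const {
        return stringToBool(_store.at("tuplex.optimizer.incrementalResolution"));
    }
};

int main() {
    ToyOptions opt;
    assert(!opt.incrementalResolution());  // off by default
    opt.set("tuplex.optimizer.incrementalResolution", "true");
    assert(opt.incrementalResolution());   // enabled for incremental runs
    std::cout << "incremental resolution enabled\n";
}
```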
bool RESOLVE_WITH_INTERPRETER_ONLY() const { return stringToBool(_store.at("tuplex.resolveWithInterpreterOnly")); } diff --git a/tuplex/core/include/ExceptionInfo.h b/tuplex/core/include/ExceptionInfo.h deleted file mode 100644 index d6ee35886..000000000 --- a/tuplex/core/include/ExceptionInfo.h +++ /dev/null @@ -1,47 +0,0 @@ -//--------------------------------------------------------------------------------------------------------------------// -// // -// Tuplex: Blazing Fast Python Data Science // -// // -// // -// (c) 2017 - 2021, Tuplex team // -// Created by Benjamin Givertz first on 1/1/2022 // -// License: Apache 2.0 // -//--------------------------------------------------------------------------------------------------------------------// - -#ifndef TUPLEX_EXCEPTIONINFO_H -#define TUPLEX_EXCEPTIONINFO_H - -namespace tuplex { - /*! - * Struct to hold information that maps input partitions to input exceptions that occur within them. - * - * Explanation: - * Each input partition is passed the same vector of all input exceptions that occured during data parallelization - * or caching. Thus, each input partition must know how many input exceptions occur in its partition, the index - * of the input exception partition where its first exception occurs, and the offset into that partition where the - * first exception occurs. These values are held in this struct and each input partition is mapped to an ExceptionInfo. - */ - struct ExceptionInfo { - size_t numExceptions; //! number of exception rows that occur within a single input partition - size_t exceptionIndex; //! index into a vector of input exception partitions that holds the first input exception - size_t exceptionRowOffset; //! offset in rows into the first input exception partition where the first exception occurs. - size_t exceptionByteOffset; //! offset in bytes into the first input exception partition where the first exception occurs - - ExceptionInfo() : - numExceptions(0), - exceptionIndex(0), - exceptionRowOffset(0), - exceptionByteOffset(0) {} - - ExceptionInfo(size_t numExps, - size_t expIndex, - size_t expRowOffset, - size_t expByteOffset) : - numExceptions(numExps), - exceptionIndex(expIndex), - exceptionRowOffset(expRowOffset), - exceptionByteOffset(expByteOffset) {} - }; -} - -#endif //TUPLEX_EXCEPTIONINFO_H \ No newline at end of file diff --git a/tuplex/core/include/IncrementalCache.h b/tuplex/core/include/IncrementalCache.h new file mode 100644 index 000000000..a3fc41fb7 --- /dev/null +++ b/tuplex/core/include/IncrementalCache.h @@ -0,0 +1,134 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#ifndef TUPLEX_INCREMENTALCACHE_H +#define TUPLEX_INCREMENTALCACHE_H + +#include +#include + +namespace tuplex { + + /*! + * Holds information about pipeline execution to use for incremental exception resolution + */ + class IncrementalCacheEntry { + private: + LogicalOperator* _pipeline; + std::vector _normalPartitions; + std::vector _exceptionPartitions; + std::vector _generalPartitions; + std::vector _fallbackPartitions; + std::vector _partitionGroups; + size_t _startFileNumber; + public: + /*! 
+ * Incremental cache entry for merge out of order + * @param pipeline original logical plan + * @param exceptionPartitions exception rows + * @param generalPartitions general rows + * @param fallbackPartitions fallback rows + * @param startFileNumber next available file number + */ + IncrementalCacheEntry(LogicalOperator* pipeline, + const std::vector& exceptionPartitions, + const std::vector& generalPartitions, + const std::vector& fallbackPartitions, + size_t startFileNumber); + + /*! + * Incremental cache entry for merge in order + * @param pipeline original logical plan + * @param normalPartitions normal rows + * @param exceptionPartitions exception rows + * @param partitionGroups mapping of normal rows to exception rows + */ + IncrementalCacheEntry(LogicalOperator *pipeline, + const std::vector& normalPartitions, + const std::vector& exceptionPartitions, + const std::vector& partitionGroups); + + ~IncrementalCacheEntry(); + + LogicalOperator* pipeline() const { + return _pipeline; + } + + void setExceptionPartitions(const std::vector& exceptionPartitions) { _exceptionPartitions = exceptionPartitions; } + + std::vector partitionGroups() const { return _partitionGroups; } + + std::vector normalPartitions() const { return _normalPartitions; } + + std::vector exceptionPartitions() const { + return _exceptionPartitions; + } + + std::vector generalPartitions() const { + return _generalPartitions; + } + + std::vector fallbackPartitions() const { + return _fallbackPartitions; + } + + size_t startFileNumber() const { + return _startFileNumber; + } + }; + + /*! + * Maps pipelines to their cached results + */ + class IncrementalCache { + private: + std::unordered_map _cache; + public: + ~IncrementalCache() { + clear(); + } + + /*! + * Add entry to the cache + * @param key string representation of the pipeline + * @param entry entry to store + */ + void addEntry(const std::string& key, IncrementalCacheEntry* entry); + + /*! + * Retrieve entry from the cache + * @param key string representation of the pipeline + * @return cache entry + */ + IncrementalCacheEntry* getEntry(const std::string& key) const { + auto elt = _cache.find(key); + if (elt == _cache.end()) + return nullptr; + return elt->second; + } + + /*! + * Reset incremental cache entries + */ + void clear() { + _cache.clear(); + } + + /*! + * convert pipeline to unique string + * @param pipeline original logical plan + * @return + */ + static std::string newKey(LogicalOperator* pipeline); + }; + +} + +#endif //TUPLEX_INCREMENTALCACHE_H \ No newline at end of file diff --git a/tuplex/core/include/JobMetrics.h b/tuplex/core/include/JobMetrics.h index 54a9cabec..071e561db 100644 --- a/tuplex/core/include/JobMetrics.h +++ b/tuplex/core/include/JobMetrics.h @@ -38,10 +38,13 @@ namespace tuplex { double slow_path_time_s = 0.0; double fast_path_per_row_time_ns = 0.0; double slow_path_per_row_time_ns = 0.0; - // size_t fast_path_input_row_count; - // size_t fast_path_output_row_count; - // size_t slow_path_input_row_count; - // size_t slow_path_output_row_count; + + size_t fast_path_input_row_count = 0; + size_t fast_path_output_row_count = 0; + size_t slow_path_input_row_count = 0; + size_t slow_path_output_row_count = 0; + + double write_output_time_s = 0.0; // disk spilling metrics int partitions_swapin_count = 0; @@ -161,6 +164,40 @@ namespace tuplex { it->slow_path_per_row_time_ns = slow_path_per_row_time_ns; } + /*! 
+ * set fast path row count info + * @param stageNo + * @param inputRows + * @param outputRows + */ + void setFastPathRowCount(int stageNo, size_t inputRows, size_t outputRows) { + auto it = get_or_create_stage_metrics(stageNo); + it->fast_path_input_row_count = inputRows; + it->fast_path_output_row_count = outputRows; + } + + /*! + * set slow path row count info + * @param stageNo + * @param inputRows + * @param outputRows + */ + void setSlowPathRowCount(int stageNo, size_t inputRows, size_t outputRows) { + auto it = get_or_create_stage_metrics(stageNo); + it->slow_path_input_row_count = inputRows; + it->slow_path_output_row_count = outputRows; + } + + /*! + * set write timing info + * @param stageNo + * @param wallTime + */ + void setWriteOutputTimes(int stageNo, double wallTime) { + auto it = get_or_create_stage_metrics(stageNo); + it->write_output_time_s = wallTime; + } + /*! * set fast path timing info * @param stageNo @@ -234,7 +271,12 @@ namespace tuplex { ss<<"\"partitions_swapin_count\":"< createLoadAndTransformToMemoryTasks(TransformStage* tstage, const ContextOptions& options, const std::shared_ptr& syms); void executeTransformStage(TransformStage* tstage); + std::vector createIncrementalTasks(TransformStage* tstage, const ContextOptions& options, const std::shared_ptr& syms); + void executeIncrementalStage(TransformStage* tstage); /*! * Create the final hashmap from all of the input [tasks] (e.g. either merge them (join) or combine them (aggregate) @@ -89,7 +91,7 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } // write output (may be already in correct format!) - void writeOutput(TransformStage* tstage, std::vector& sortedTasks); + size_t writeOutput(TransformStage* tstage, std::vector& sortedTasks, size_t startFileNumber=0); std::vector performTasks(std::vector& tasks, std::function driverCallback=[](){}); @@ -99,7 +101,7 @@ namespace tuplex { return std::accumulate(counts.begin(), counts.end(), 0, [](size_t acc, std::pair, size_t> val) { return acc + val.second; }); } - inline std::vector getOutputPartitions(IExecutorTask* task) { + inline std::vector getNormalPartitions(IExecutorTask* task) { if(!task) return std::vector(); @@ -113,7 +115,7 @@ namespace tuplex { return std::vector(); } - inline std::vector getRemainingExceptions(IExecutorTask* task) { + inline std::vector getExceptionPartitions(IExecutorTask* task) { if(!task) return std::vector(); @@ -127,7 +129,7 @@ namespace tuplex { return std::vector(); } - inline std::vector generalCasePartitions(IExecutorTask* task) { + inline std::vector getGeneralPartitions(IExecutorTask* task) { if(!task) return std::vector(); @@ -155,18 +157,18 @@ namespace tuplex { return std::unordered_map, size_t>(); } - inline std::vector> getNonConformingRows(IExecutorTask* task) { + inline std::vector getFallbackPartitions(IExecutorTask* task) { if(!task) - return std::vector>(); + return std::vector(); if(task->type() == TaskType::UDFTRAFOTASK) - return std::vector>(); // none here, can be only result from ResolveTask. + return std::vector(); // none here, can be only result from ResolveTask. 
if(task->type() == TaskType::RESOLVE) - return dynamic_cast(task)->getNonConformingRows(); + return dynamic_cast(task)->getOutputFallbackPartitions(); throw std::runtime_error("unknown task type seen in " + std::string(__FILE_NAME__) + ":" + std::to_string(__LINE__)); - return std::vector>(); + return std::vector(); } std::vector resolveViaSlowPath(std::vector& tasks, diff --git a/tuplex/core/include/logical/CacheOperator.h b/tuplex/core/include/logical/CacheOperator.h index 563aa8f0b..2ca184a1d 100644 --- a/tuplex/core/include/logical/CacheOperator.h +++ b/tuplex/core/include/logical/CacheOperator.h @@ -79,9 +79,10 @@ namespace tuplex { * @return */ bool isCached() const { return _cached; } - std::vector cachedPartitions() const { return _normalCasePartitions; } - std::vector cachedExceptions() const { return _generalCasePartitions; } - std::unordered_map partitionToExceptionsMap() const { return _partitionToExceptionsMap; } + std::vector cachedNormalPartitions() const { return _normalPartitions; } + std::vector cachedGeneralPartitions() const { return _generalPartitions; } + std::vector cachedFallbackPartitions() const { return _fallbackPartitions; } + std::vector partitionGroups() const { return _partitionGroups; } size_t getTotalCachedRows() const; @@ -107,28 +108,19 @@ namespace tuplex { // or merge them. bool _cached; bool _storeSpecialized; - std::vector _normalCasePartitions; //! holds all data conforming to the normal case schema - std::vector _generalCasePartitions; //! holds all data which is considered to be a normal-case violation, - //! i.e. which does not adhere to the normal case schema, but did not produce - //! an exception while being processed through the pipeline before - std::unordered_map _partitionToExceptionsMap; //! maps normal case partitions to corresponding general case ones - std::vector _py_objects; //! all python objects who do not adhere to the general case schema + std::vector _normalPartitions; //! holds all data conforming to the normal case schema + std::vector _generalPartitions; //! holds all data which is considered to be a normal-case violation, + std::vector _fallbackPartitions; //! holds all data which is output as a python object from interpreter processing + std::vector _partitionGroups; //! groups together partitions for correct row ordering std::vector _columns; // internal sample of normal case rows, used for tracing & Co. std::vector _sample; // number of rows need to be stored for cost estimates - size_t _normalCaseRowCount; - size_t _generalCaseRowCount; - - // @TODO: there should be 3 things stored - // 1.) common case => i.e. - // 2.) general case => i.e. what in general can be done (null-values & Co, wide integers, ...) - // 3.) python case => i.e. things that don't fit into either case (interpreter objects serialized via pickle) - - // Note: the pickling could be parallelized by simply matching python types & Co... - // ==> store python data as tuple of elements! 
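The renamed CacheOperator members encode a three-tier row model: normal rows match the fast-path schema, general rows violate it but still fit the wider slow-path schema, and fallback rows exist only as pickled Python objects for the interpreter. A toy classifier illustrating the split (hypothetical code, not the operator's actual logic):

```cpp
#include <iostream>
#include <string>

// Hypothetical illustration of the three row classes tracked below:
// normal (fast-path schema), general (slow-path schema), fallback
// (interpreter-only Python objects). Not Tuplex code.
enum class RowClass { Normal, General, Fallback };

// Toy rule: integers are "normal", other non-empty cells "general",
// everything else must fall back to the interpreter.
RowClass classify(const std::string& cell) {
    try {
        std::size_t pos = 0;
        std::stoll(cell, &pos);
        if (pos == cell.size()) return RowClass::Normal;  // fits fast-path type
    } catch (...) {}
    if (!cell.empty()) return RowClass::General;          // wider schema still works
    return RowClass::Fallback;                            // interpreter-only
}

int main() {
    for (const std::string cell : {"42", "3.14", ""}) {
        switch (classify(cell)) {
            case RowClass::Normal:   std::cout << "'" << cell << "' -> normal\n";   break;
            case RowClass::General:  std::cout << "'" << cell << "' -> general\n";  break;
            case RowClass::Fallback: std::cout << "'" << cell << "' -> fallback\n"; break;
        }
    }
}
```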
+ size_t _normalRowCount; + size_t _generalRowCount; + size_t _fallbackRowCount; }; } diff --git a/tuplex/core/include/logical/LogicalPlan.h b/tuplex/core/include/logical/LogicalPlan.h index fa6e9ea30..a5bc09596 100644 --- a/tuplex/core/include/logical/LogicalPlan.h +++ b/tuplex/core/include/logical/LogicalPlan.h @@ -32,7 +32,7 @@ namespace tuplex { void optimizeFilters(); void emitPartialFilters(); void reorderDataProcessingOperators(); - + void incrementalResolution(const Context &context); public: LogicalPlan() = delete; diff --git a/tuplex/core/include/logical/ParallelizeOperator.h b/tuplex/core/include/logical/ParallelizeOperator.h index 0960baf89..87666950c 100644 --- a/tuplex/core/include/logical/ParallelizeOperator.h +++ b/tuplex/core/include/logical/ParallelizeOperator.h @@ -17,10 +17,9 @@ namespace tuplex { class ParallelizeOperator : public LogicalOperator { - std::vector _partitions; // data, conforming to majority type - std::vector _pythonObjects; // schema violations stored for interpreter processing as python objects - // maps partitions to their corresponding python objects - std::unordered_map _inputPartitionToPythonObjectsMap; + std::vector _normalPartitions; // data, conforming to majority type + std::vector _fallbackPartitions; // schema violations stored for interpreter processing as python objects + std::vector _partitionGroups; // maps normal partitions to their corresponding fallback partitions std::vector _columnNames; std::vector _sample; // sample, not necessary conforming to one type @@ -31,7 +30,7 @@ namespace tuplex { // this a root node ParallelizeOperator(const Schema& schema, - const std::vector& partitions, + const std::vector& normalPartitions, const std::vector& columns); std::string name() override { return "parallelize"; } @@ -47,13 +46,13 @@ namespace tuplex { * get the partitions where the parallelized data is stored. * @return vector of partitions. 
*/ - std::vector getPartitions(); + std::vector getNormalPartitions(); - void setPythonObjects(const std::vector &pythonObjects) { _pythonObjects = pythonObjects; } - std::vector getPythonObjects() { return _pythonObjects; } + void setFallbackPartitions(const std::vector &fallbackPartitions) { _fallbackPartitions = fallbackPartitions; } + std::vector getFallbackPartitions() { return _fallbackPartitions; } - void setInputPartitionToPythonObjectsMap(const std::unordered_map& pythonObjectsMap) { _inputPartitionToPythonObjectsMap = pythonObjectsMap; } - std::unordered_map getInputPartitionToPythonObjectsMap() { return _inputPartitionToPythonObjectsMap; } + void setPartitionGroups(const std::vector& partitionGroups) { _partitionGroups = partitionGroups; } + std::vector getPartitionGroups() { return _partitionGroups; } Schema getInputSchema() const override { return getOutputSchema(); } diff --git a/tuplex/core/include/physical/CodeDefs.h b/tuplex/core/include/physical/CodeDefs.h index b8c3cd76b..c185b2d54 100644 --- a/tuplex/core/include/physical/CodeDefs.h +++ b/tuplex/core/include/physical/CodeDefs.h @@ -48,8 +48,14 @@ namespace tuplex { typedef int64_t(*read_block_f)(void*, const uint8_t*, int64_t, int64_t*, int64_t*, int8_t); // protoype of the function generated by the below builder - // parameters are userData, block, blocksize, expPtrs, expPtrSizes, numExceptions, normalrowsout, badrowsout, lastRow - typedef int64_t(*read_block_exp_f)(void*, const uint8_t*, int64_t, uint8_t **, int64_t *, int64_t, int64_t*, int64_t*, bool); + // parameters are userData, block, blocksize, normalrowsout, badrowsout, lastRow, + // totalFilterCounter, totalNormalRowCounter, totalGeneralRowCounter, totalFallbackRowCounter, + // generalPartitions, numGeneralPartitions, generalIndexOffset, generalRowOffset, generalByteOffset + // fallbackPartitions, numFallbackPartitions, fallbackIndexOffset, fallbackRowOffset, fallbackByteOffset + typedef int64_t(*read_block_exp_f)(void*, const uint8_t*, int64_t, int64_t*, int64_t*, bool, + int64_t*, int64_t*, int64_t*, int64_t*, + uint8_t **, int64_t, int64_t*, int64_t*, int64_t*, + uint8_t **, int64_t, int64_t*, int64_t*, int64_t*); /*! * prototype for processing a single row (with callbacks etc.). Returns how many bytes were processed diff --git a/tuplex/core/include/physical/PhysicalPlan.h b/tuplex/core/include/physical/PhysicalPlan.h index 560867b64..fe02bcc92 100644 --- a/tuplex/core/include/physical/PhysicalPlan.h +++ b/tuplex/core/include/physical/PhysicalPlan.h @@ -101,6 +101,8 @@ namespace tuplex { const LogicalPlan* logicalPlan() const { return _lp; } + const LogicalPlan* originalLogicalPlan() const { return _lpOriginal; } + nlohmann::json getStagedRepresentationAsJSON() const; /*! 
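`PartitionGroup` replaces the old per-UUID `ExceptionInfo` map: each group records a start index and count for the normal, general, and fallback partitions that belong together, so tasks can slice the three vectors consistently. Its definition is not shown in this diff, so the struct below is an inferred sketch; the slicing loop mirrors how `createLoadAndTransformToMemoryTasks` consumes groups further down.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Sketch of what a PartitionGroup plausibly carries, inferred from how the
// scheduler slices partition vectors with it; the real definition lives in
// the Partition headers and may differ.
struct ToyPartitionGroup {
    int numNormalPartitions = 0,   normalPartitionStartInd = 0;
    int numGeneralPartitions = 0,  generalPartitionStartInd = 0;
    int numFallbackPartitions = 0, fallbackPartitionStartInd = 0;
};

int main() {
    std::vector<std::string> normal  {"n0", "n1", "n2"};
    std::vector<std::string> general {"g0", "g1"};

    // one task gets normal[1..2] and general[0..1]; no fallback partitions
    ToyPartitionGroup g;
    g.numNormalPartitions = 2;  g.normalPartitionStartInd = 1;
    g.numGeneralPartitions = 2; g.generalPartitionStartInd = 0;

    // same index arithmetic the task-creation loop uses
    for (int i = g.normalPartitionStartInd;
         i < g.normalPartitionStartInd + g.numNormalPartitions; ++i)
        std::printf("task normal partition: %s\n", normal[i].c_str());
    for (int i = g.generalPartitionStartInd;
         i < g.generalPartitionStartInd + g.numGeneralPartitions; ++i)
        std::printf("task general partition: %s\n", general[i].c_str());
}
```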
diff --git a/tuplex/core/include/physical/PhysicalStage.h b/tuplex/core/include/physical/PhysicalStage.h index 34e5b2c51..764ac3f85 100644 --- a/tuplex/core/include/physical/PhysicalStage.h +++ b/tuplex/core/include/physical/PhysicalStage.h @@ -22,7 +22,7 @@ namespace tuplex { - class IBackend;\ + class IBackend; class PhysicalStage; class PhysicalPlan; class LogicalPlan; diff --git a/tuplex/core/include/physical/ResolveTask.h b/tuplex/core/include/physical/ResolveTask.h index 2044a5699..83d3d8bfd 100644 --- a/tuplex/core/include/physical/ResolveTask.h +++ b/tuplex/core/include/physical/ResolveTask.h @@ -61,9 +61,9 @@ namespace tuplex { ResolveTask(int64_t stageID, int64_t contextID, const std::vector& partitions, - const std::vector& runtimeExceptions, - const std::vector& inputExceptions, - ExceptionInfo inputExceptionInfo, + const std::vector& exceptionPartitions, + const std::vector& generalPartitions, + const std::vector& fallbackPartitions, const std::vector& operatorIDsAffectedByResolvers, //! used to identify which exceptions DO require reprocessing because there might be a resolver in the slow path for them. Schema exceptionInputSchema, //! schema of the input rows in which both user exceptions and normal-case violations are stored in. This is also the schema in which rows which on the slow path produce again an exception will be stored in. Schema resolverOutputSchema, //! schema of rows that the resolve function outputs if it doesn't rethrow exceptions @@ -75,15 +75,16 @@ namespace tuplex { char csvDelimiter, char csvQuotechar, codegen::resolve_f functor=nullptr, - PyObject* interpreterFunctor=nullptr) : IExceptionableTask::IExceptionableTask(exceptionInputSchema, contextID), + PyObject* interpreterFunctor=nullptr, + bool isIncremental=false) : IExceptionableTask::IExceptionableTask(exceptionInputSchema, contextID), _stageID(stageID), _partitions(partitions), - _runtimeExceptions(runtimeExceptions), - _inputExceptions(inputExceptions), - _numInputExceptions(inputExceptionInfo.numExceptions), - _inputExceptionIndex(inputExceptionInfo.exceptionIndex), - _inputExceptionRowOffset(inputExceptionInfo.exceptionRowOffset), - _inputExceptionByteOffset(inputExceptionInfo.exceptionByteOffset), + _exceptionPartitions(exceptionPartitions), + _generalPartitions(generalPartitions), + _fallbackPartitions(fallbackPartitions), + _exceptionCounter(0), + _generalCounter(0), + _fallbackCounter(0), _resolverOutputSchema(resolverOutputSchema), _targetOutputSchema(targetNormalCaseOutputSchema), _mergeRows(mergeRows), @@ -100,7 +101,9 @@ namespace tuplex { _outputRowNumber(0), _wallTime(0.0), _numInputRowsRead(0), - _numUnresolved(0) { + _numUnresolved(0), + _numResolved(0), + _isIncremental(isIncremental) { // copy the IDs and sort them so binary search can be used. std::sort(_operatorIDsAffectedByResolvers.begin(), _operatorIDsAffectedByResolvers.end()); _normalPtrBytesRemaining = 0; @@ -170,7 +173,7 @@ namespace tuplex { std::vector getOutputPartitions() const override { return _partitions; } - std::vector> getNonConformingRows() const { return _py_nonconfirming; } + std::vector getOutputFallbackPartitions() const { return _fallbackSink.partitions; } /// very important to override this because of the special two exceptions fields of ResolveTask /// i.e. _generalCasePartitions store what exceptions to resolve, IExceptionableTask::_generalCasePartitions exceptions that occurred @@ -214,12 +217,16 @@ namespace tuplex { private: int64_t _stageID; /// to which stage does this task belong to. 
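The ResolveTask changes below swap the old exception-info bookkeeping for three input partition vectors with matching counters and add a dedicated `_fallbackSink` for rows only the interpreter can produce. As a rough stand-in for such a sink (the real `MemorySink` manages Tuplex partitions; `ToyFallbackSink` is hypothetical):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical stand-in for the fallback sink a ResolveTask writes to:
// interpreter-produced rows are appended as length-prefixed byte blobs
// and handed back as partitions later.
struct ToyFallbackSink {
    std::vector<uint8_t> bytes;
    std::size_t rows = 0;

    void appendRow(const void* data, uint64_t len) {
        auto off = bytes.size();
        bytes.resize(off + sizeof(len) + len);
        std::memcpy(bytes.data() + off, &len, sizeof(len));        // length prefix
        std::memcpy(bytes.data() + off + sizeof(len), data, len);  // row payload
        ++rows;
    }
};

int main() {
    ToyFallbackSink sink;
    const char row[] = "pickled-row";
    sink.appendRow(row, sizeof(row));
    return sink.rows == 1 ? 0 : 1;
}
```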
std::vector _partitions; - std::vector _runtimeExceptions; - std::vector _inputExceptions; - size_t _numInputExceptions; - size_t _inputExceptionIndex; - size_t _inputExceptionRowOffset; - size_t _inputExceptionByteOffset; + std::vector _exceptionPartitions; + std::vector _generalPartitions; + std::vector _fallbackPartitions; + + size_t _exceptionCounter; + size_t _generalCounter; + size_t _fallbackCounter; + + bool _isIncremental; + inline Schema commonCaseInputSchema() const { return _deserializerGeneralCaseOutput->getSchema(); } Schema _resolverOutputSchema; //! what the resolve functor produces Schema _targetOutputSchema; //! which schema the final rows should be in... @@ -234,6 +241,7 @@ namespace tuplex { char _csvQuotechar; size_t _numUnresolved; + size_t _numResolved; int64_t _currentRowNumber; // std::vector _mergedPartitions; @@ -258,6 +266,8 @@ namespace tuplex { // sink for type violation rows MemorySink _generalCaseSink; + MemorySink _fallbackSink; + // hash table sink // -> hash to be a hybrid because sometimes incompatible python objects have to be hashed here. HashTableSink _htable; @@ -271,7 +281,6 @@ namespace tuplex { // python output which can't be consolidated, saved as separate list void writePythonObject(PyObject* out_row); - std::vector> _py_nonconfirming; int64_t _outputRowNumber; diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h index e94b8f1ae..6a97be70c 100644 --- a/tuplex/core/include/physical/ResultSet.h +++ b/tuplex/core/include/physical/ResultSet.h @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include @@ -25,23 +25,61 @@ namespace tuplex { class ResultSet { private: - std::list _partitions; - std::vector _exceptions; // unresolved exceptions - std::unordered_map _partitionToExceptionsMap; - // @TODO: use here rows instead? would make it potentially cleaner... - std::deque> _pyobjects; // python objects remaining whose type - // did not confirm to the one of partitions. Maybe use Row here instead? - size_t _curRowCounter; //! row counter for the current partition - size_t _byteCounter; //! byte offset for the current partition - size_t _rowsRetrieved; - size_t _totalRowCounter; // used for merging in rows! - size_t _maxRows; - Schema _schema; - - void removeFirstPartition(); + std::list _currentNormalPartitions; //! normal partitions in current group + std::list _currentGeneralPartitions; //! general partitions in current group + std::list _currentFallbackPartitions; //! fallback partitions in current group + std::list _remainingNormalPartitions; //! remaining normal partitions in other groups + std::list _remainingGeneralPartitions; //! remaining general partitions in other groups + std::list _remainingFallbackPartitions; //! remaining fallback partitions in other groups + std::list _partitionGroups; //! groups together normal, general, and fallback partitions for merging + + size_t _totalRowCounter; //! total rows emitted across all groups + size_t _maxRows; //! max number of rows to emit + Schema _schema; //! 
normal case schema + + size_t _curNormalRowCounter; + size_t _curNormalByteCounter; + size_t _curGeneralRowCounter; + size_t _curGeneralByteCounter; + size_t _curFallbackRowCounter; + size_t _curFallbackByteCounter; + size_t _normalRowCounter; + size_t _generalRowCounter; + size_t _fallbackRowCounter; + + int64_t currentGeneralRowInd(); + int64_t currentFallbackRowInd(); + + Row getNextNormalRow(); + bool hasNextNormalRow(); + Row getNextFallbackRow(); + bool hasNextFallbackRow(); + Row getNextGeneralRow(); + bool hasNextGeneralRow(); + + void removeFirstGeneralPartition(); + void removeFirstFallbackPartition(); + void removeFirstNormalPartition(); public: - ResultSet() : _curRowCounter(0), _byteCounter(0), _rowsRetrieved(0), - _totalRowCounter(0), _maxRows(0), _schema(Schema::UNKNOWN) {} + /*! + * Create new result set with normal, general, and fallback rows + * @param schema normal case schema + * @param normalPartitions normal case rows + * @param generalPartitions general case rows + * @param fallbackPartitions fallback case rows + * @param partitionGroups information to merge row numbers correctly + * @param maxRows limit on rows to emit + */ + ResultSet(const Schema& schema, + const std::vector& normalPartitions, + const std::vector& generalPartitions=std::vector{}, + const std::vector& fallbackPartitions=std::vector{}, + const std::vector& partitionGroups=std::vector{}, + int64_t maxRows=std::numeric_limits::max()); + + ResultSet() : _curNormalRowCounter(0), _curNormalByteCounter(0), _curGeneralRowCounter(0), _curGeneralByteCounter(0), + _curFallbackRowCounter(0), _curFallbackByteCounter(0), _totalRowCounter(0), _maxRows(0), _schema(Schema::UNKNOWN), + _normalRowCounter(0), _generalRowCounter(0), _fallbackRowCounter(0) {} ~ResultSet() = default; // Non copyable @@ -51,13 +89,6 @@ namespace tuplex { ResultSet(const ResultSet&) = delete; ResultSet& operator = (const ResultSet&) = delete; - ResultSet(const Schema& _schema, - const std::vector& partitions, - const std::vector& exceptions=std::vector{}, - const std::unordered_map& partitionToExceptionsMap=std::unordered_map(), - const std::vector> pyobjects=std::vector>{}, - int64_t maxRows=std::numeric_limits::max()); - /*! * check whether result contains one more row */ @@ -75,52 +106,107 @@ namespace tuplex { */ std::vector getRows(size_t limit); - bool hasNextPartition() const; + /*! + * check whether general partitions remain + * @return + */ + bool hasNextGeneralPartition() const; + + /*! + * get next general partition but does not invalidate it + * @return + */ + Partition* getNextGeneralPartition(); + + /*! + * check whether fallback partitions remain + * @return + */ + bool hasNextFallbackPartition() const; + + /*! + * get next fallback partition but does not invalidate it + * @return + */ + Partition* getNextFallbackPartition(); + + /*! + * check whether normal partitions remain + * @return + */ + bool hasNextNormalPartition() const; /*! user needs to invalidate then! - * * @return */ - Partition* getNextPartition(); + Partition* getNextNormalPartition(); + + /*! + * number of rows across all cases of partitions + * @return + */ size_t rowCount() const; + /*! + * normal case schema + * @return + */ Schema schema() const { return _schema; } /*! - * removes and invalidates all partitions! + * removes and invalidates all normalPartitions! */ void clear(); + /*! + * number of rows in fallback partitions + * @return + */ + size_t fallbackRowCount() const; + /*! * retrieve all good rows in bulk, removes them from this result set. 
* @return */ - std::vector partitions() { + std::vector normalPartitions() { std::vector p; - while(hasNextPartition()) - p.push_back(getNextPartition()); + while(hasNextNormalPartition()) + p.push_back(getNextNormalPartition()); return p; } /*! - * retrieve all unresolved rows (should be only called internally). DOES NOT REMOVE THEM FROM result set. + * returns/removes all general partitions * @return */ - std::vector exceptions() const { return _exceptions; } - - std::unordered_map partitionToExceptionsMap() const { return _partitionToExceptionsMap; } + std::vector generalPartitions() { + std::vector p; + while(hasNextGeneralPartition()) + p.push_back(getNextGeneralPartition()); + return p; + } /*! - * returns/removes all objects + * returns/removes all fallback partitions * @return */ - std::deque> pyobjects() { - return std::move(_pyobjects); + std::vector fallbackPartitions() { + std::vector p; + while(hasNextFallbackPartition()) + p.push_back(getNextFallbackPartition()); + return p; } - size_t pyobject_count() const { return _pyobjects.size(); } - - size_t numPartitions() const { return _partitions.size(); } + /*! + * returns/removes all partition groups + * @return + */ + std::vector partitionGroups() { + std::vector g; + for (const auto& group : _partitionGroups) + g.push_back(group); + return g; + } }; } #endif //TUPLEX_RESULTSET_H \ No newline at end of file diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index 63b94bd57..551878f3a 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -35,6 +35,7 @@ namespace tuplex { * @param sharedObjectPropagation whether to use shared object propogation * @param nullValueOptimization whether to use null value optimization * @param updateInputExceptions whether input exceptions indices need to be updated + * @param incrementalResolution whether to execute with incremental resolution */ StageBuilder(int64_t stage_number, bool rootStage, @@ -43,7 +44,8 @@ namespace tuplex { double normalCaseThreshold, bool sharedObjectPropagation, bool nullValueOptimization, - bool updateInputExceptions); + bool updateInputExceptions, + bool incrementalResolution); // builder functions void addMemoryInput(const Schema& schema, LogicalOperator* node); @@ -91,6 +93,7 @@ namespace tuplex { bool _sharedObjectPropagation; bool _nullValueOptimization; bool _updateInputExceptions; + bool _incrementalResolution; std::vector _operators; // codegen strings diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index 22d7f5fb4..ec64c9dea 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -13,7 +13,7 @@ #include #include -#include +#include #include "PhysicalStage.h" #include "LLVMOptimizer.h" #include @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef BUILD_WITH_AWS // include protobuf serialization of TrafoStage for Lambda executor @@ -95,16 +96,59 @@ namespace tuplex { } /*! - * set input exceptions, i.e. rows that could come from a parallelize or csv operator. - * @param pythonObjects + * set stage's general case normalPartitions + * @param generalPartitions */ - void setInputExceptions(const std::vector& inputExceptions) { _inputExceptions = inputExceptions; } + void setGeneralPartitions(const std::vector& generalPartitions) { _generalPartitions = generalPartitions; } - std::vector inputExceptions() { return _inputExceptions; } + /*! 
+ * get stage's general case normalPartitions + * @return + */ + std::vector generalPartitions() const { return _generalPartitions; } - void setPartitionToExceptionsMap(const std::unordered_map& partitionToExceptionsMap) { _partitionToExceptionsMap = partitionToExceptionsMap; } + /*! + * set stage's fallback normalPartitions as serialized python objects + * @param fallbackPartitions + */ + void setFallbackPartitions(const std::vector& fallbackPartitions) { _fallbackPartitions = fallbackPartitions; } - std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } + /*! + * get fallback normalPartitions as serialized python objects + * @return + */ + std::vector fallbackPartitions() const { return _fallbackPartitions; } + + /*! + * set merge information for each set of normal, fallback, and general partitions + * @param partitionGroups + */ + void setPartitionGroups(const std::vector& partitionGroups) { + _partitionGroups = partitionGroups; + } + + /*! + * get partition groups for all sets of partitions + */ + std::vector partitionGroups() const { return _partitionGroups; } + + /*! + * set cache entry of previous execution to be used by the incremental resolution + * @param entry + */ + void setIncrementalCacheEntry(IncrementalCacheEntry* entry) { _incrementalCacheEntry = entry; } + + /*! + * get cache entry of previous execution + * @return + */ + IncrementalCacheEntry* incrementalCacheEntry() const { return _incrementalCacheEntry; } + + /*! + * whether or not to use incremental resolution during stage execution + * @return + */ + bool incrementalResolution() const { return _incrementalResolution; } /*! * sets maximum number of rows this pipeline will produce @@ -157,12 +201,34 @@ namespace tuplex { */ std::shared_ptr resultSet() const override { return _rs;} - void setMemoryResult(const std::vector& partitions, - const std::vector& generalCase=std::vector{}, - const std::unordered_map& parttionToExceptionsMap=std::unordered_map(), - const std::vector>& interpreterRows=std::vector>{}, - const std::vector& remainingExceptions=std::vector{}, - const std::unordered_map, size_t>& ecounts=std::unordered_map, size_t>()); // creates local result set? + /*! + * Cache pipeline execution for merge in order + * @param normalPartitions normal rows + * @param exceptionPartitions exception rows + * @param partitionGroups mapping of normal to exception rows + */ + void setIncrementalResult(const std::vector& normalPartitions, + const std::vector& exceptionPartitions, + const std::vector& partitionGroups); + + /*! + * Cache pipeline execution for merge out of order + * @param exceptionPartitions exception rows + * @param generalPartitions general rows + * @param fallbackPartitions fallback rows + * @param startFileNumber next file number to output rows to + */ + void setIncrementalResult(const std::vector& exceptionPartitions, + const std::vector& generalPartitions, + const std::vector& fallbackPartitions, + size_t startFileNumber); + + void setMemoryResult(const std::vector& normalPartitions=std::vector{}, + const std::vector& generalPartitions=std::vector{}, + const std::vector& fallbackPartitions=std::vector{}, + const std::vector& partitionGroups=std::vector{}, + const std::unordered_map, size_t>& exceptionCounts=std::unordered_map, size_t>()); // creates local result set? 
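The reworked ResultSet interleaves normal, general, and fallback rows so callers see them in original row order. A compact model of that merge, with explicit (row index, case) pairs standing in for the partition cursors and per-case counters the real class keeps:

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Toy model of the in-order merge: three streams of (original row index,
// case label) are interleaved so the output matches input row order.
// Indices are explicit here; the real ResultSet derives them from
// partition metadata.
using Stream = std::vector<std::pair<int, const char*>>;

int main() {
    Stream normal   {{0, "normal"}, {2, "normal"}, {3, "normal"}};
    Stream general  {{1, "general"}};
    Stream fallback {{4, "fallback"}};

    std::size_t ni = 0, gi = 0, fi = 0;
    for (int row = 0; row < 5; ++row) {
        // emit whichever stream owns this row number, mirroring
        // getNextNormalRow()/getNextGeneralRow()/getNextFallbackRow()
        if (ni < normal.size() && normal[ni].first == row) {
            std::printf("row %d from %s case\n", row, normal[ni].second); ++ni;
        } else if (gi < general.size() && general[gi].first == row) {
            std::printf("row %d from %s case\n", row, general[gi].second); ++gi;
        } else if (fi < fallback.size() && fallback[fi].first == row) {
            std::printf("row %d from %s case\n", row, fallback[fi].second); ++fi;
        }
    }
}
```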
+ void setFileResult(const std::unordered_map, size_t>& ecounts); // creates empty result set with exceptions void setEmptyResult() { @@ -173,9 +239,8 @@ namespace tuplex { setMemoryResult( std::vector(), std::vector(), - std::unordered_map(), - std::vector>(), std::vector(), + std::vector(), ecounts); } @@ -443,6 +508,9 @@ namespace tuplex { std::vector _inputPartitions; //! memory input partitions for this task. size_t _inputLimit; //! limit number of input rows (inf per default) size_t _outputLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default) + std::vector _generalPartitions; //! general case input partitions + std::vector _fallbackPartitions; //! fallback case input partitions + std::vector _partitionGroups; //! groups partitions together for correct row indices std::shared_ptr _rs; //! result set @@ -459,7 +527,10 @@ namespace tuplex { std::string _pyCode; std::string _pyPipelineName; std::string _writerFuncName; + bool _updateInputExceptions; + bool _incrementalResolution; + IncrementalCacheEntry* _incrementalCacheEntry; std::shared_ptr emptyResultSet() const; @@ -469,11 +540,6 @@ namespace tuplex { // Todo: move this to physicalplan!!! //void pushDownOutputLimit(); //! enable optimizations for limited pipeline by restricting input read! - // unresolved exceptions. Important i.e. when no IO interleave is used... - std::vector _inputExceptions; - std::unordered_map _partitionToExceptionsMap; - - // for hash output, the key and bucket type python::Type _hashOutputKeyType; python::Type _hashOutputBucketType; diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index 2868ba668..3eb8013dd 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -239,11 +239,14 @@ namespace tuplex { */ std::unordered_map, size_t> exceptionCounts() const { return _exceptionCounts; } - ExceptionInfo inputExceptionInfo() { return _inputExceptionInfo; } - std::vector inputExceptions() { return _inputExceptions; } + std::vector generalPartitions() const { return _generalPartitions; } + + std::vector fallbackPartitions() const { return _fallbackPartitions; } + + void setGeneralPartitions(const std::vector& generalPartitions) { _generalPartitions = generalPartitions; } + + void setFallbackPartitions(const std::vector& fallbackPartitions) { _fallbackPartitions = fallbackPartitions; } - void setInputExceptionInfo(ExceptionInfo info) { _inputExceptionInfo = info; } - void setInputExceptions(const std::vector& inputExceptions) { _inputExceptions = inputExceptions; } void setUpdateInputExceptions(bool updateInputExceptions) { _updateInputExceptions = updateInputExceptions; } double wallTime() const override { return _wallTime; } @@ -292,8 +295,8 @@ namespace tuplex { MemorySink _exceptions; Schema _inputSchema; - ExceptionInfo _inputExceptionInfo; - std::vector _inputExceptions; + std::vector _generalPartitions; + std::vector _fallbackPartitions; bool _updateInputExceptions; // hash table sink diff --git a/tuplex/core/src/Context.cc b/tuplex/core/src/Context.cc index e9a30e902..0fa4faa6c 100644 --- a/tuplex/core/src/Context.cc +++ b/tuplex/core/src/Context.cc @@ -31,6 +31,9 @@ namespace tuplex { Context::Context(const ContextOptions& options) : _datasetIDGenerator(0), _compilePolicy(compilePolicyFromOptions(options)), _id(getNextContextID()) { // init metrics _lastJobMetrics = std::make_unique(); + // init incremental cache + _incrementalCache = std::make_shared(); + // make sure this is 
called without holding the GIL if(python::isInterpreterRunning()) assert(!python::holdsGIL()); @@ -202,7 +205,7 @@ namespace tuplex { } - DataSet& Context::fromPartitions(const Schema& schema, const std::vector& partitions, const std::vector& columns, const std::vector> &badParallelizeObjects, const std::vector &numExceptionsInPartition) { + DataSet& Context::fromPartitions(const Schema& schema, const std::vector& partitions, const std::vector& fallbackPartitions, const std::vector& partitionGroups, const std::vector& columns) { auto dataSetID = getNextDataSetID(); DataSet *dsptr = createDataSet(schema); @@ -214,7 +217,7 @@ namespace tuplex { // empty? if(partitions.empty()) { dsptr->setColumns(columns); - addParallelizeNode(dsptr, badParallelizeObjects, numExceptionsInPartition); + addParallelizeNode(dsptr, fallbackPartitions, partitionGroups); return *dsptr; } else { size_t numRows = 0; @@ -230,7 +233,8 @@ namespace tuplex { // set rows dsptr->setColumns(columns); - addParallelizeNode(dsptr, badParallelizeObjects, numExceptionsInPartition); + addParallelizeNode(dsptr, fallbackPartitions, partitionGroups); + // signal check if(check_and_forward_signals()) { @@ -257,6 +261,7 @@ namespace tuplex { addParallelizeNode(dsptr); return *dsptr; } else { + std::vector partitionGroups; // get row type from first element @TODO: should be inferred from sample, no? auto rtype = rows.front().getRowType(); schema = Schema(Schema::MemoryLayout::ROW, rtype); @@ -303,6 +308,7 @@ namespace tuplex { numWrittenRowsInPartition++; capacityRemaining -= bytesWritten; } else { + partitionGroups.push_back(PartitionGroup(1, dsptr->getPartitions().size())); // partition is full, request new one. // create new partition... partition->unlock(); @@ -319,6 +325,7 @@ namespace tuplex { base_ptr = (uint8_t*)partition->lock(); } } + partitionGroups.push_back(PartitionGroup(1, dsptr->getPartitions().size())); partition->unlock(); partition->setNumRows(numWrittenRowsInPartition); @@ -330,7 +337,7 @@ namespace tuplex { // set rows dsptr->setColumns(columnNames); - addParallelizeNode(dsptr); + addParallelizeNode(dsptr, std::vector{}, partitionGroups); // signal check if(check_and_forward_signals()) { @@ -349,94 +356,7 @@ namespace tuplex { return op; } - void Context::serializePythonObjects(const std::vector>& pythonObjects, - const std::vector &numExceptionsInPartition, - const std::vector &normalPartitions, - const int64_t opID, - std::vector &serializedPythonObjects, - std::unordered_map &pythonObjectsMap) { - if (pythonObjects.empty()) { - for (const auto &p : normalPartitions) { - pythonObjectsMap[uuidToString(p->uuid())] = ExceptionInfo(); - } - return; - } - - Schema schema(Schema::MemoryLayout::ROW, python::Type::makeTupleType({python::Type::STRING})); - const size_t allocMinSize = 1024 * 64; // 64KB - - Partition* partition = requestNewPartition(schema, -1, allocMinSize); - int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); - *rawPtr = 0; - uint8_t* ptr = (uint8_t*)(rawPtr + 1); - size_t numBytesSerialized = 0; - - auto prevExpByteOffset = 0; - auto prevExpRowOffset = 0; - auto prevExpInd = 0; - auto curNormalPartitionInd = 0; - auto numNewExps = 0; - - // Serialize each exception to a partition using the following schema: - // (1) is the field containing rowNum - // (2) is the field containing ecCode - // (3) is the field containing opID - // (4) is the field containing pickledObjectSize - // (5) is the field containing pickledObject - for(auto &exception : pythonObjects) { - auto rowNum = std::get<0>(exception); 
- auto pyObj = std::get<1>(exception); - auto ecCode = ecToI64(ExceptionCode::PYTHON_PARALLELIZE); - auto pickledObject = python::pickleObject(python::getMainModule(), pyObj); - auto pickledObjectSize = pickledObject.size(); - size_t requiredBytes = sizeof(int64_t) * 4 + pickledObjectSize; - - if (partition->capacity() < numBytesSerialized + requiredBytes) { - partition->unlockWrite(); - serializedPythonObjects.push_back(partition); - partition = requestNewPartition(schema, -1, allocMinSize); - rawPtr = (int64_t *) partition->lockWriteRaw(); - *rawPtr = 0; - ptr = (uint8_t * )(rawPtr + 1); - numBytesSerialized = 0; - } - - // Check if we have reached the number of exceptions in the input partition - // Record the current exception index and offset and iterate to next one - auto curNormalPartition = normalPartitions[curNormalPartitionInd]; - auto normalUUID = uuidToString(curNormalPartition->uuid()); - auto numExps = numExceptionsInPartition[curNormalPartitionInd]; - if (numNewExps >= numExps) { - pythonObjectsMap[normalUUID] = ExceptionInfo(numExps, prevExpInd, prevExpRowOffset, prevExpByteOffset); - prevExpRowOffset = *rawPtr; - prevExpByteOffset = numBytesSerialized; - prevExpInd = serializedPythonObjects.size(); - numNewExps = 0; - curNormalPartitionInd++; - } - - *((int64_t*)(ptr)) = rowNum; ptr += sizeof(int64_t); - *((int64_t*)(ptr)) = ecCode; ptr += sizeof(int64_t); - *((int64_t*)(ptr)) = opID; ptr += sizeof(int64_t); - *((int64_t*)(ptr)) = pickledObjectSize; ptr += sizeof(int64_t); - memcpy(ptr, pickledObject.c_str(), pickledObjectSize); ptr += pickledObjectSize; - - *rawPtr = *rawPtr + 1; - numBytesSerialized += requiredBytes; - numNewExps += 1; - } - - // Record mapping for normal last partition - auto curNormalPartition = normalPartitions[curNormalPartitionInd]; - auto normalUUID = uuidToString(curNormalPartition->uuid()); - auto numExceptions = numExceptionsInPartition[curNormalPartitionInd]; - pythonObjectsMap[normalUUID] = ExceptionInfo(numExceptions, prevExpInd, prevExpRowOffset, prevExpByteOffset); - - partition->unlockWrite(); - serializedPythonObjects.push_back(partition); - } - - void Context::addParallelizeNode(DataSet *ds, const std::vector> &badParallelizeObjects, const std::vector &numExceptionsInPartition) { + void Context::addParallelizeNode(DataSet *ds, const std::vector& fallbackPartitions, const std::vector& partitionGroups) { assert(ds); // @TODO: make empty list as special case work. Also true for empty files. 
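For context on the deleted `serializePythonObjects`: it packed each fallback object as four int64 header fields (row number, exception code, operator ID, pickled size) followed by the pickled bytes, then tracked per-partition offsets via `ExceptionInfo`. The replacement keeps fallback rows in dedicated partitions routed through `PartitionGroup`s instead. A stand-alone sketch of the retired record layout, using a plain byte vector rather than a Tuplex Partition:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

// Sketch of the record layout the deleted serializePythonObjects used:
// four int64 header fields followed by the pickled payload.
std::vector<uint8_t> writeRecord(int64_t rowNum, int64_t ecCode, int64_t opID,
                                 const std::string& pickled) {
    std::vector<uint8_t> buf(4 * sizeof(int64_t) + pickled.size());
    uint8_t* p = buf.data();
    int64_t header[4] = {rowNum, ecCode, opID, (int64_t)pickled.size()};
    std::memcpy(p, header, sizeof(header));          p += sizeof(header);
    std::memcpy(p, pickled.data(), pickled.size());  // pickled object bytes
    return buf;
}

int main() {
    auto buf = writeRecord(7, /*ecCode=*/42, /*opID=*/3, "\x80\x04K\x01.");
    int64_t header[4];
    std::memcpy(header, buf.data(), sizeof(header));
    std::printf("row=%lld ec=%lld op=%lld payload=%lld bytes\n",
                (long long)header[0], (long long)header[1],
                (long long)header[2], (long long)header[3]);
}
```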
@@ -446,11 +366,17 @@ namespace tuplex { assert(ds->_schema.getRowType() != python::Type::UNKNOWN); auto op = new ParallelizeOperator(ds->_schema, ds->getPartitions(), ds->columns()); - std::vector serializedPythonObjects; - std::unordered_map pythonObjectsMap; - serializePythonObjects(badParallelizeObjects, numExceptionsInPartition, ds->getPartitions(), op->getID(), serializedPythonObjects, pythonObjectsMap); - op->setPythonObjects(serializedPythonObjects); - op->setInputPartitionToPythonObjectsMap(pythonObjectsMap); + op->setFallbackPartitions(fallbackPartitions); + if (partitionGroups.empty()) { + std::vector defaultPartitionGroups; + for (int i = 0; i < ds->getPartitions().size(); ++i) { + defaultPartitionGroups.push_back(PartitionGroup(1, i)); + } + op->setPartitionGroups(defaultPartitionGroups); + } else { + op->setPartitionGroups(partitionGroups); + } + // add new (root) node ds->_operator = addOperator(op); diff --git a/tuplex/core/src/ContextOptions.cc b/tuplex/core/src/ContextOptions.cc index 49b498969..7879376cc 100644 --- a/tuplex/core/src/ContextOptions.cc +++ b/tuplex/core/src/ContextOptions.cc @@ -232,6 +232,7 @@ namespace tuplex { {"tuplex.optimizer.operatorReordering", "false"}, {"tuplex.optimizer.sharedObjectPropagation", "true"}, {"tuplex.optimizer.mergeExceptionsInOrder", "true"}, + {"tuplex.optimizer.incrementalResolution", "false"}, {"tuplex.interleaveIO", "true"}, {"tuplex.aws.scratchDir", ""}, {"tuplex.aws.requestTimeout", "600"}, @@ -286,6 +287,7 @@ namespace tuplex { {"tuplex.optimizer.operatorReordering", "false"}, {"tuplex.optimizer.sharedObjectPropagation", "true"}, {"tuplex.optimizer.mergeExceptionsInOrder", "false"}, + {"tuplex.optimizer.incrementalResolution", "false"}, {"tuplex.interleaveIO", "true"}, {"tuplex.aws.scratchDir", ""}, {"tuplex.aws.requestTimeout", "600"}, diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index a53a14094..a33925d7f 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -869,9 +869,10 @@ namespace tuplex { // what data source operators are there? 
if(_operator->type() == LogicalOperatorType::FILEINPUT) return static_cast(_operator)->isEmpty(); - else if(_operator->type() == LogicalOperatorType::PARALLELIZE) - return static_cast(_operator)->getPartitions().empty(); - else + else if(_operator->type() == LogicalOperatorType::PARALLELIZE) { + auto pop = static_cast(_operator); assert(pop); + return pop->getNormalPartitions().empty() && pop->getFallbackPartitions().empty(); + } else throw std::runtime_error("unknown data source operator detected"); } else return false; diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 845b78e6a..7aab9c5fa 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -325,7 +325,7 @@ namespace tuplex { // remove from list _storedPartitions.remove(partition); } else { - + return; error("INTERNAL ERROR: Could not find partition " + uuidToString(partition->uuid()) + " belonging to operator " + std::to_string(partition->getDataSetID()) + " and type " + partition->schema().getRowType().desc() + ""); std::abort(); diff --git a/tuplex/core/src/IncrementalCache.cc b/tuplex/core/src/IncrementalCache.cc new file mode 100644 index 000000000..102aaae96 --- /dev/null +++ b/tuplex/core/src/IncrementalCache.cc @@ -0,0 +1,72 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include + +#include + +namespace tuplex { + + IncrementalCacheEntry::IncrementalCacheEntry( + LogicalOperator* pipeline, + const std::vector& exceptionPartitions, + const std::vector& generalPartitions, + const std::vector& fallbackPartitions, + size_t startFileNumber) { + _pipeline = pipeline->clone(); + _exceptionPartitions = exceptionPartitions; + _generalPartitions = generalPartitions; + _fallbackPartitions = fallbackPartitions; + _startFileNumber = startFileNumber; + } + + IncrementalCacheEntry::IncrementalCacheEntry(LogicalOperator *pipeline, + const std::vector& normalPartitions, + const std::vector& exceptionPartitions, + const std::vector& partitionGroups) { + _pipeline = pipeline->clone(); + _normalPartitions = normalPartitions; + for (auto &p : _normalPartitions) + p->makeImmortal(); + _exceptionPartitions = exceptionPartitions; + _partitionGroups = partitionGroups; + } + + void IncrementalCache::addEntry(const std::string& key, IncrementalCacheEntry* entry) { + auto elt = _cache.find(key); + if (elt != _cache.end()) + _cache.erase(key); + + _cache[key] = entry; + } + + IncrementalCacheEntry::~IncrementalCacheEntry() { + delete _pipeline; + } + + std::string IncrementalCache::newKey(LogicalOperator* pipeline) { + assert(pipeline); + std::stringstream ss; + + std::queue q; + q.push(pipeline); + while (!q.empty()) { + auto cur = q.front(); q.pop(); + if (cur->type() != LogicalOperatorType::RESOLVE && cur->type() != LogicalOperatorType::IGNORE) { + ss << std::to_string(static_cast(cur->type())); + } + for (const auto& p : cur->parents()) { + q.push(p); + } + } + + return ss.str(); + } +} \ No newline at end of file diff --git a/tuplex/core/src/Partition.cc b/tuplex/core/src/Partition.cc index a16d1c2eb..c554788cd 100644 --- a/tuplex/core/src/Partition.cc +++ b/tuplex/core/src/Partition.cc @@ -55,7 +55,7 @@ namespace 
tuplex { uint8_t* Partition::lockWriteRaw() { // must be the thread who allocated this - assert(_owner->getThreadID() == std::this_thread::get_id()); +// assert(_owner->getThreadID() == std::this_thread::get_id()); TRACE_LOCK("partition " + uuidToString(_uuid)); std::this_thread::yield(); diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index bed96ec5a..da131be3f 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -147,8 +147,13 @@ namespace tuplex { // check what type of stage it is auto tstage = dynamic_cast(stage); - if(tstage) - executeTransformStage(tstage); + if(tstage) { + if (tstage->incrementalResolution()) { + executeIncrementalStage(tstage); + } else { + executeTransformStage(tstage); + } + } else if(dynamic_cast(stage)) { executeHashJoinStage(dynamic_cast(stage)); } else if(dynamic_cast(stage)) { @@ -266,8 +271,8 @@ namespace tuplex { Timer timer; // BUILD phase // TODO: codegen build phase. I.e. a function should be code generated which hashes a partition to a hashmap. - while(rsRight->hasNextPartition()) { - Partition* p = rsRight->getNextPartition(); + while(rsRight->hasNextNormalPartition()) { + Partition* p = rsRight->getNextNormalPartition(); // lock partition! auto ptr = p->lockRaw(); @@ -435,7 +440,7 @@ namespace tuplex { auto combinedType = hstage->combinedType(); Schema combinedSchema(Schema::MemoryLayout::ROW, combinedType); std::vector probeTasks; - for(auto partition : rsLeft->partitions()) { + for(auto partition : rsLeft->normalPartitions()) { probeTasks.emplace_back(new HashProbeTask(partition, hmap, probeFunction, hstage->combinedType(), hstage->outputDataSetID(), @@ -648,12 +653,35 @@ namespace tuplex { // --> issue for each memory partition a transform task and put it into local workqueue assert(tstage->inputMode() == EndPointMode::MEMORY); - - // restrict after input limit size_t numInputRows = 0; + auto inputPartitions = tstage->inputPartitions(); - for(int i = 0; i < inputPartitions.size(); ++i) { - auto partition = inputPartitions[i]; + auto generalPartitions = tstage->generalPartitions(); + auto fallbackPartitions = tstage->fallbackPartitions(); + auto partitionGroups = tstage->partitionGroups(); + for (const auto &group : partitionGroups) { + std::vector taskNormalPartitions; + bool invalidateAfterUse = false; + for (int i = group.normalPartitionStartInd; i < group.normalPartitionStartInd + group.numNormalPartitions; ++i) { + auto p = inputPartitions[i]; + numInputRows += p->getNumRows(); + if (!p->isImmortal()) + invalidateAfterUse = true; + taskNormalPartitions.push_back(p); + } + std::vector taskGeneralPartitions; + for (int i = group.generalPartitionStartInd; i < group.generalPartitionStartInd + group.numGeneralPartitions; ++i) { + auto p = generalPartitions[i]; + numInputRows += p->getNumRows(); + taskGeneralPartitions.push_back(p); + } + std::vector taskFallbackPartitions; + for (int i = group.fallbackPartitionStartInd; i < group.fallbackPartitionStartInd + group.numFallbackPartitions; ++i) { + auto p = fallbackPartitions[i]; + numInputRows += p->getNumRows(); + taskFallbackPartitions.push_back(p); + } + auto task = new TransformTask(); if (tstage->updateInputExceptions()) { task->setFunctor(syms->functorWithExp); @@ -661,7 +689,9 @@ namespace tuplex { task->setFunctor(syms->functor); } task->setUpdateInputExceptions(tstage->updateInputExceptions()); - task->setInputMemorySource(partition, !partition->isImmortal()); + 
task->setInputMemorySources(taskNormalPartitions, invalidateAfterUse); + task->setGeneralPartitions(taskGeneralPartitions); + task->setFallbackPartitions(taskFallbackPartitions); // hash table or memory output? if(tstage->outputMode() == EndPointMode::HASHTABLE) { if (tstage->hashtableKeyByteWidth() == 8) @@ -676,16 +706,10 @@ namespace tuplex { tstage->outputMode() == EndPointMode::MEMORY); task->sinkOutputToMemory(outputSchema, tstage->outputDataSetID(), tstage->context().id()); } - - auto partitionId = uuidToString(partition->uuid()); - auto info = tstage->partitionToExceptionsMap()[partitionId]; - task->setInputExceptionInfo(info); - task->setInputExceptions(tstage->inputExceptions()); - task->sinkExceptionsToMemory(inputSchema); + task->sinkExceptionsToMemory(tstage->normalCaseInputSchema()); task->setStageID(tstage->getID()); task->setOutputLimit(tstage->outputLimit()); tasks.emplace_back(std::move(task)); - numInputRows += partition->getNumRows(); // input limit exhausted? break! if(numInputRows >= tstage->inputLimit()) @@ -750,90 +774,296 @@ namespace tuplex { return pip_object; } - std::vector> inputExceptionsToPythonObjects(const std::vector& partitions, Schema schema) { - using namespace tuplex; + std::vector LocalBackend::createIncrementalTasks(TransformStage* tstage, const ContextOptions& options, const std::shared_ptr& syms) { + using namespace std; + vector tasks; + assert(tstage); + assert(syms); - std::vector> pyObjects; - for (const auto &partition : partitions) { - auto numRows = partition->getNumRows(); - const uint8_t* ptr = partition->lock(); + auto cacheEntry = tstage->incrementalCacheEntry(); + assert(cacheEntry); + auto cachedExceptionPartitions = cacheEntry->exceptionPartitions(); + auto cachedGeneralPartitions = cacheEntry->generalPartitions(); + auto cachedFallbackPartitions = cacheEntry->fallbackPartitions(); + auto cachedPartitionGroups = cacheEntry->partitionGroups(); + auto cachedNormalPartitions = cacheEntry->normalPartitions(); - python::lockGIL(); - for (int i = 0; i < numRows; ++i) { - int64_t rowNum = *((int64_t*)ptr); - ptr += sizeof(int64_t); - int64_t ecCode = *((int64_t*)ptr); - ptr += 2 * sizeof(int64_t); - int64_t objSize = *((int64_t*)ptr); - ptr += sizeof(int64_t); - - PyObject* pyObj = nullptr; - if (ecCode == ecToI64(ExceptionCode::PYTHON_PARALLELIZE)) { - pyObj = python::deserializePickledObject(python::getMainModule(), (char *) ptr, objSize); - } else { - pyObj = python::rowToPython(Row::fromMemory(schema, ptr, objSize), true); - } + for (auto &p : cachedNormalPartitions) + p->makeMortal(); + for (auto &p : cachedExceptionPartitions) + p->makeMortal(); - ptr += objSize; - pyObjects.emplace_back(rowNum, pyObj); - } - python::unlockGIL(); + auto stageID = tstage->getID(); + auto contextID = tstage->context().id(); + auto operatorIDsWithResolvers = tstage->operatorIDsWithResolvers(); + auto exceptionInputSchema = tstage->inputSchema(); + auto outputSchema = tstage->outputSchema(); + auto normalCaseOutputSchema = tstage->normalCaseOutputSchema(); + auto mergeExceptionsInOrder = options.OPT_MERGE_EXCEPTIONS_INORDER(); + auto autoUpcastNumbers = options.AUTO_UPCAST_NUMBERS(); + auto outputFormat = tstage->outputFormat(); + auto csvOutputDelimiter = tstage->csvOutputDelimiter(); + auto csvOutputQuotechar = tstage->csvOutputQuotechar(); + auto resolveFunctor = options.RESOLVE_WITH_INTERPRETER_ONLY() ? 
nullptr : syms->resolveFunctor; - partition->unlock(); - partition->invalidate(); + + // compile & prep python pipeline for this stage + Timer timer; + auto pipObject = preparePythonPipeline(tstage->purePythonCode(), tstage->pythonPipelineName()); + if(!pipObject) { + logger().error("python pipeline invalid, details: \n" + core::withLineNumbers(tstage->purePythonCode())); + return tasks; } + logger().info("compiled pure python pipeline in " + std::to_string(timer.time()) + "s"); + timer.reset(); + + auto order = 0; + if (mergeExceptionsInOrder) { + for (const auto &partitionGroup : cachedPartitionGroups) { + std::vector taskNormalPartitions; + for (int i = partitionGroup.normalPartitionStartInd; i < partitionGroup.normalPartitionStartInd + partitionGroup.numNormalPartitions; ++i) + taskNormalPartitions.push_back(cachedNormalPartitions[i]); + std::vector taskExceptionPartitions; + for (int i = partitionGroup.exceptionPartitionStartInd; i < partitionGroup.exceptionPartitionStartInd + partitionGroup.numExceptionPartitions; ++i) + taskExceptionPartitions.push_back(cachedExceptionPartitions[i]); + + + auto rtask = new ResolveTask( + stageID, + contextID, + taskNormalPartitions, + taskExceptionPartitions, + vector{}, + vector{}, + operatorIDsWithResolvers, + exceptionInputSchema, + outputSchema, + normalCaseOutputSchema, + outputSchema, + mergeExceptionsInOrder, + autoUpcastNumbers, + outputFormat, + csvOutputDelimiter, + csvOutputQuotechar, + resolveFunctor, + pipObject, + true); + + rtask->setOrder(order); + order++; + tasks.push_back(rtask); + } + } else { + for (const auto &p : cachedExceptionPartitions) { + tasks.push_back(new ResolveTask( + stageID, + contextID, + vector{}, + vector{p}, + vector{}, + vector{}, + operatorIDsWithResolvers, + exceptionInputSchema, + outputSchema, + normalCaseOutputSchema, + outputSchema, + mergeExceptionsInOrder, + autoUpcastNumbers, + outputFormat, + csvOutputDelimiter, + csvOutputQuotechar, + resolveFunctor, + pipObject, + true)); + } + + for (const auto &p : cachedGeneralPartitions) { + tasks.push_back(new ResolveTask( + stageID, + contextID, + vector{}, + vector{}, + vector{p}, + vector{}, + operatorIDsWithResolvers, + exceptionInputSchema, + outputSchema, + normalCaseOutputSchema, + outputSchema, + mergeExceptionsInOrder, + autoUpcastNumbers, + outputFormat, + csvOutputDelimiter, + csvOutputQuotechar, + resolveFunctor, + pipObject, + true)); + } - return pyObjects; + for (const auto &p : cachedFallbackPartitions) { + tasks.push_back(new ResolveTask( + stageID, + contextID, + vector{}, + vector{}, + vector{}, + vector{p}, + operatorIDsWithResolvers, + exceptionInputSchema, + outputSchema, + normalCaseOutputSchema, + outputSchema, + mergeExceptionsInOrder, + autoUpcastNumbers, + outputFormat, + csvOutputDelimiter, + csvOutputQuotechar, + resolveFunctor, + pipObject, + true)); + } + } + return tasks; } - void setExceptionInfo(const std::vector &normalOutput, const std::vector &exceptions, std::unordered_map &partitionToExceptionsMap) { - if (exceptions.empty()) { - for (const auto &p : normalOutput) { - partitionToExceptionsMap[uuidToString(p->uuid())] = ExceptionInfo(); + void LocalBackend::executeIncrementalStage(TransformStage *tstage) { + using namespace std; + + Timer stageTimer; + Timer timer; + + Partition::resetStatistics(); + + assert(tstage); + auto cacheEntry = tstage->incrementalCacheEntry(); + assert(cacheEntry); + auto cachedGeneralPartitions = cacheEntry->generalPartitions(); + auto cachedFallbackPartitions = cacheEntry->fallbackPartitions(); 
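The two `ResolveTask` construction sites below implement the two merge strategies: with merge-in-order, every cached `PartitionGroup` becomes one order-tagged task over its normal and exception partitions; without it, each cached exception, general, and fallback partition gets its own independent task. A condensed, self-contained sketch of that dispatch, using simplified stand-in types rather than the real tuplex classes and eliding the long `ResolveTask` argument lists:

```
#include <cstddef>
#include <utility>
#include <vector>

// Simplified stand-ins for illustration only, not the real tuplex types.
struct Partition {};
struct PartitionGroup {
    std::size_t numExceptionPartitions = 0;
    std::size_t exceptionPartitionStartInd = 0;
};
struct Task {
    std::vector<Partition*> exceptions;
    int order = -1;   // only meaningful when merging in order
};

std::vector<Task> createTasks(bool mergeInOrder,
                              const std::vector<PartitionGroup>& groups,
                              const std::vector<Partition*>& cachedExceptions) {
    std::vector<Task> tasks;
    if (mergeInOrder) {
        // one task per cached group; the order tag lets the backend restore row order later
        int order = 0;
        for (const auto& g : groups) {
            Task t;
            for (std::size_t i = g.exceptionPartitionStartInd;
                 i < g.exceptionPartitionStartInd + g.numExceptionPartitions; ++i)
                t.exceptions.push_back(cachedExceptions[i]);
            t.order = order++;
            tasks.push_back(std::move(t));
        }
    } else {
        // order irrelevant: one independent task per cached exception partition
        for (auto* p : cachedExceptions)
            tasks.push_back(Task{{p}, -1});
    }
    return tasks;
}
```

The order tag is what `sortTasks(completedTasks)` later uses to put task outputs back into the original row order before merging.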
+ auto cachedExceptionPartitions = cacheEntry->exceptionPartitions(); + + // If pipeline does not contain code, or no new exceptions to resolve skip stage and store new cache entry + if (cachedExceptionPartitions.empty() && cachedGeneralPartitions.empty() && cachedFallbackPartitions.empty()) { + switch (tstage->outputMode()) { + case EndPointMode::FILE: { + tstage->setFileResult(std::unordered_map, size_t>()); + break; + } + default: + throw std::runtime_error("output mode not yet supported for incremental resolution"); } + Logger::instance().defaultLogger().info("[Transform Stage] skipped stage " + std::to_string(tstage->number()) + " because there is nothing todo here."); return; } - auto expRowCount = 0; - auto expInd = 0; - auto expRowOff = 0; - auto expByteOff = 0; - - auto expNumRows = exceptions[0]->getNumRows(); - auto expPtr = exceptions[0]->lockWrite(); - auto rowsProcessed = 0; - for (const auto &p : normalOutput) { - auto pNumRows = p->getNumRows(); - auto curNumExps = 0; - auto curExpOff = expRowOff; - auto curExpInd = expInd; - auto curExpByteOff = expByteOff; - - while (*((int64_t *) expPtr) - rowsProcessed <= pNumRows + curNumExps && expRowCount < expNumRows) { - *((int64_t *) expPtr) -= rowsProcessed; - curNumExps++; - expRowOff++; - auto eSize = ((int64_t *)expPtr)[3] + 4*sizeof(int64_t); - expPtr += eSize; - expByteOff += eSize; - expRowCount++; - - if (expRowOff == expNumRows && expInd < exceptions.size() - 1) { - exceptions[expInd]->unlockWrite(); - expInd++; - expPtr = exceptions[expInd]->lockWrite(); - expNumRows = exceptions[expInd]->getNumRows(); - expRowOff = 0; - expByteOff = 0; - expRowCount = 0; + // Compile the pipeline + LLVMOptimizer optimizer; + auto syms = tstage->compile(*_compiler, _options.USE_LLVM_OPTIMIZER() ? 
&optimizer : nullptr, false); + bool combineOutputHashmaps = syms->aggInitFunctor && syms->aggCombineFunctor && syms->aggAggregateFunctor; + JobMetrics& metrics = tstage->PhysicalStage::plan()->getContext().metrics(); + double total_compilation_time = metrics.getTotalCompilationTime() + timer.time(); + metrics.setTotalCompilationTime(total_compilation_time); + { + std::stringstream ss; + ss<<"[Transform Stage] Stage "<number()<<" compiled to x86 in "<getNumInputRows(); + numOutputRows += task->getNumOutputRows(); + totalWallTime += task->wallTime(); + } + { + std::stringstream ss; + double time_per_slow_path_row_in_ms = totalWallTime / numInputRows * 1000.0; + ss<<"[Transform Stage] Stage "<number()<<" total wall clock time: " + <number(), numInputRows, numOutputRows); + // fast path + metrics.setFastPathTimes(tstage->number(), 0, 0, 0); + metrics.setSlowPathTimes(tstage->number(), totalWallTime, timer.time(), time_per_slow_path_row_in_ms * 1000000.0); + } + + sortTasks(completedTasks); + + // fetch partitions & ecounts + vector normalPartitions; + vector generalPartitions; + vector fallbackPartitions; + vector exceptionPartitions; + vector partitionGroups; + unordered_map, size_t> exceptionCounts; + + for (const auto& task : completedTasks) { + auto taskNormalPartitions = getNormalPartitions(task); + auto taskGeneralPartitions = getGeneralPartitions(task); + auto taskFallbackPartitions = getFallbackPartitions(task); + auto taskExceptionPartitions = getExceptionPartitions(task); + auto taskExceptionCounts = getExceptionCounts(task); + + // update exception counts + exceptionCounts = merge_ecounts(exceptionCounts, taskExceptionCounts); + + // debug trace issues + using namespace std; + std::string task_name = "unknown"; + if(task->type() == TaskType::UDFTRAFOTASK) + task_name = "udf trafo task"; + if(task->type() == TaskType::RESOLVE) + task_name = "resolve"; + + partitionGroups.push_back(PartitionGroup( + taskNormalPartitions.size(), normalPartitions.size(), + taskGeneralPartitions.size(), generalPartitions.size(), + taskFallbackPartitions.size(), fallbackPartitions.size(), + taskExceptionPartitions.size(), exceptionPartitions.size())); + std::copy(taskNormalPartitions.begin(), taskNormalPartitions.end(), std::back_inserter(normalPartitions)); + std::copy(taskGeneralPartitions.begin(), taskGeneralPartitions.end(), std::back_inserter(generalPartitions)); + std::copy(taskFallbackPartitions.begin(), taskFallbackPartitions.end(), std::back_inserter(fallbackPartitions)); + std::copy(taskExceptionPartitions.begin(), taskExceptionPartitions.end(), std::back_inserter(exceptionPartitions)); + } + + switch (tstage->outputMode()) { + case EndPointMode::FILE: { + if (_options.OPT_MERGE_EXCEPTIONS_INORDER()) { + tstage->setIncrementalResult(normalPartitions, exceptionPartitions, partitionGroups); + if (stringToBool(tstage->outputOptions()["commit"])) { + timer.reset(); + writeOutput(tstage, completedTasks); + metrics.setWriteOutputTimes(tstage->number(), timer.time()); + } else { + tstage->setFileResult(exceptionCounts); + } + } else { + timer.reset(); + auto partNo = writeOutput(tstage, completedTasks, cacheEntry->startFileNumber()); + metrics.setWriteOutputTimes(tstage->number(), timer.time()); + tstage->setIncrementalResult(exceptionPartitions, generalPartitions, fallbackPartitions, + partNo); } + break; } - - rowsProcessed += curNumExps + pNumRows; - partitionToExceptionsMap[uuidToString(p->uuid())] = ExceptionInfo(curNumExps, curExpInd, curExpOff, curExpByteOff); + default: + throw 
std::runtime_error("output mode not yet supported for incremental resolution"); } - exceptions[expInd]->unlockWrite(); + // call release func for stage globals + if(syms->releaseStageFunctor() != 0) + throw std::runtime_error("releaseStage() failed for stage " + std::to_string(tstage->number())); + + freeTasks(completedTasks); } void LocalBackend::executeTransformStage(tuplex::TransformStage *tstage) { @@ -855,9 +1085,8 @@ namespace tuplex { // special case: skip stage, i.e. empty code and mem2mem if(tstage->code().empty() && !tstage->fileInputMode() && !tstage->fileOutputMode()) { - auto pyObjects = inputExceptionsToPythonObjects(tstage->inputExceptions(), tstage->normalCaseInputSchema()); - tstage->setMemoryResult(tstage->inputPartitions(), std::vector{}, std::unordered_map(), pyObjects); - pyObjects.clear(); + tstage->setMemoryResult(tstage->inputPartitions(), tstage->generalPartitions(), tstage->fallbackPartitions(), + tstage->partitionGroups()); // skip stage Logger::instance().defaultLogger().info("[Transform Stage] skipped stage " + std::to_string(tstage->number()) + " because there is nothing todo here."); return; @@ -949,9 +1178,11 @@ namespace tuplex { // calc number of input rows and total wall clock time size_t numInputRows = 0; + size_t numOutputRows = 0; double totalWallTime = 0.0; for(auto task : completedTasks) { numInputRows += task->getNumInputRows(); + numOutputRows += task->getNumOutputRows(); totalWallTime += task->wallTime(); } @@ -969,6 +1200,7 @@ namespace tuplex { <<", time to process 1 row via fast path: "<number(), numInputRows, numOutputRows); // fast path metrics.setFastPathTimes(tstage->number(), totalWallTime, timer.time(), time_per_fast_path_row_in_ms * 1000000.0); } @@ -985,7 +1217,7 @@ namespace tuplex { bool executeSlowPath = true; //TODO: implement pure python resolution here... // exceptions found or slowpath data given? - if(totalECountsBeforeResolution > 0 || !tstage->inputExceptions().empty()) { + if(totalECountsBeforeResolution > 0 || !tstage->generalPartitions().empty() || !tstage->fallbackPartitions().empty()) { stringstream ss; // log out what exists in a table ss<<"Exception details: "<inputExceptions().empty()) { + if(!tstage->generalPartitions().empty()) { + size_t numExceptions = 0; + for (auto &p : tstage->generalPartitions()) + numExceptions += p->getNumRows(); + lines.push_back(Row("(cache)", exceptionCodeToPythonClass(ExceptionCode::NORMALCASEVIOLATION), (int64_t)numExceptions)); + totalECountsBeforeResolution += numExceptions; + } + + if(!tstage->fallbackPartitions().empty()) { size_t numExceptions = 0; - for (auto &p : tstage->inputExceptions()) + for (auto &p : tstage->fallbackPartitions()) numExceptions += p->getNumRows(); - lines.push_back(Row("(input)", exceptionCodeToPythonClass(ExceptionCode::NORMALCASEVIOLATION), (int64_t)numExceptions)); + lines.push_back(Row("(parallelize)", exceptionCodeToPythonClass(ExceptionCode::NORMALCASEVIOLATION), (int64_t)numExceptions)); totalECountsBeforeResolution += numExceptions; } @@ -1044,7 +1284,7 @@ namespace tuplex { executeSlowPath = true; // input exceptions or py objects? 
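Both of these new exception-table rows, and the `((int64_t*)ptr)[3] + 4*sizeof(int64_t)` arithmetic used throughout this file, rely on the serialized exception-row format that the removed `inputExceptionsToPythonObjects` decoded above: a four-field int64 header followed by the row payload. A hedged reading of that layout (field names here are descriptive, not the actual identifiers; the third field is skipped via `ptr += 2 * sizeof(int64_t)` in the old decoder and is presumably the operator ID):

```
#include <cstdint>
#include <cstring>

// Layout implied by the decode logic above: each exception row is a
// 4 x int64 header followed by `size` payload bytes.
//   [0] rowNumber     - logical row index at which the exception occurred
//   [1] exceptionCode
//   [2] operatorID    - skipped by the old decoder
//   [3] size          - byte length of the serialized row that follows
struct ExceptionRowView {
    int64_t rowNumber, exceptionCode, operatorID, size;
    const uint8_t* payload;
};

// Read one record and advance the cursor to the next one.
inline ExceptionRowView readExceptionRow(const uint8_t*& ptr) {
    ExceptionRowView v{};
    std::memcpy(&v, ptr, 4 * sizeof(int64_t));
    v.payload = ptr + 4 * sizeof(int64_t);
    ptr += 4 * sizeof(int64_t) + v.size;   // matches ((int64_t*)ptr)[3] + 4*sizeof(int64_t)
    return v;
}
```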
- if(!tstage->inputExceptions().empty()) + if(!tstage->generalPartitions().empty() || !tstage->fallbackPartitions().empty()) executeSlowPath = true; if(executeSlowPath) { @@ -1076,10 +1316,12 @@ namespace tuplex { totalWallTime = 0.0; size_t slowPathNumInputRows = 0; + size_t slowPathNumOutputRows = 0; for(auto task : completedTasks) { if(task->type() == TaskType::RESOLVE) { totalWallTime += task->wallTime(); slowPathNumInputRows += task->getNumInputRows(); + slowPathNumOutputRows += task->getNumOutputRows(); } } double time_per_row_slow_path_ms = totalWallTime / slowPathNumInputRows * 1000.0; @@ -1089,6 +1331,7 @@ namespace tuplex { ss<<"slow path for Stage "<number()<<": total wall clock time: "<number(), slowPathNumInputRows, slowPathNumOutputRows); metrics.setSlowPathTimes(tstage->number(), totalWallTime, slow_path_total_time, time_per_row_slow_path_ms * 1000000.0); } @@ -1113,64 +1356,76 @@ namespace tuplex { // sorting only make sense when order is needed sortTasks(completedTasks); + // fetch partitions & ecounts + vector normalPartitions; + vector generalPartitions; + vector fallbackPartitions; + vector exceptionPartitions; + vector partitionGroups; + unordered_map, size_t> exceptionCounts; + + for (const auto& task : completedTasks) { + auto taskNormalPartitions = getNormalPartitions(task); + auto taskGeneralPartitions = getGeneralPartitions(task); + auto taskFallbackPartitions = getFallbackPartitions(task); + auto taskExceptionPartitions = getExceptionPartitions(task); + auto taskExceptionCounts = getExceptionCounts(task); + + // update exception counts + exceptionCounts = merge_ecounts(exceptionCounts, taskExceptionCounts); + + // debug trace issues + using namespace std; + std::string task_name = "unknown"; + if(task->type() == TaskType::UDFTRAFOTASK) + task_name = "udf trafo task"; + if(task->type() == TaskType::RESOLVE) + task_name = "resolve"; + + auto pGroup = PartitionGroup( + taskNormalPartitions.size(), normalPartitions.size(), + taskGeneralPartitions.size(), generalPartitions.size(), + taskFallbackPartitions.size(), fallbackPartitions.size()); + pGroup.numExceptionPartitions = taskExceptionPartitions.size(); + pGroup.exceptionPartitionStartInd = exceptionPartitions.size(); + partitionGroups.push_back(pGroup); + std::copy(taskNormalPartitions.begin(), taskNormalPartitions.end(), std::back_inserter(normalPartitions)); + std::copy(taskGeneralPartitions.begin(), taskGeneralPartitions.end(), std::back_inserter(generalPartitions)); + std::copy(taskFallbackPartitions.begin(), taskFallbackPartitions.end(), std::back_inserter(fallbackPartitions)); + std::copy(taskExceptionPartitions.begin(), taskExceptionPartitions.end(), std::back_inserter(exceptionPartitions)); + } + // set result according to endpoint mode switch(tstage->outputMode()) { case EndPointMode::FILE: { // i.e. if output format is tuplex, then attach special writer! // ==> could maybe codegen avro as output format, and then write to whatever?? 
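Each completed task contributes one `PartitionGroup` that records how many partitions of each class it produced and where they begin in the stage-wide vectors; the start index is simply the destination vector's size before the copy, exactly as in the `PartitionGroup(taskNormalPartitions.size(), normalPartitions.size(), ...)` calls in the collection loops here. A minimal sketch of that accumulation pattern with stand-in types:

```
#include <cstddef>
#include <vector>

struct Partition {};                          // stand-in
struct Group { std::size_t num, start; };     // (count, start index) for one partition class

// Append one task's output to the stage-wide vector and record where it landed.
Group appendTaskOutput(std::vector<Partition*>& all,
                       const std::vector<Partition*>& taskOut) {
    Group g{taskOut.size(), all.size()};      // start index = size before the copy
    all.insert(all.end(), taskOut.begin(), taskOut.end());
    return g;
}
```

Keeping the groups alongside the flat vectors means the next incremental run can slice each task's partitions back out without copying any row data.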
- writeOutput(tstage, completedTasks); + if (_options.OPT_INCREMENTAL_RESOLUTION()) { + if (_options.OPT_MERGE_EXCEPTIONS_INORDER()) { + tstage->setIncrementalResult(normalPartitions, exceptionPartitions, partitionGroups); + if (stringToBool(tstage->outputOptions()["commit"])) { + timer.reset(); + writeOutput(tstage, completedTasks); + metrics.setWriteOutputTimes(tstage->number(), timer.time()); + } else { + tstage->setFileResult(exceptionCounts); + } + } else { + timer.reset(); + auto partNo = writeOutput(tstage, completedTasks); + metrics.setWriteOutputTimes(tstage->number(), timer.time()); + tstage->setIncrementalResult(exceptionPartitions, generalPartitions, fallbackPartitions, + partNo); + } + } else { + timer.reset(); + writeOutput(tstage, completedTasks); + metrics.setWriteOutputTimes(tstage->number(), timer.time()); + } break; } case EndPointMode::MEMORY: { - // memory output, fetch partitions & ecounts - vector output; - vector generalOutput; - unordered_map partitionToExceptionsMap; - vector remainingExceptions; - vector> nonConformingRows; // rows where the output type does not fit, - // need to manually merged. - unordered_map, size_t> ecounts; - size_t rowDelta = 0; - for (const auto& task : completedTasks) { - auto taskOutput = getOutputPartitions(task); - auto taskRemainingExceptions = getRemainingExceptions(task); - auto taskGeneralOutput = generalCasePartitions(task); - auto taskNonConformingRows = getNonConformingRows(task); - auto taskExceptionCounts = getExceptionCounts(task); - - // update exception counts - ecounts = merge_ecounts(ecounts, taskExceptionCounts); - - // update nonConforming with delta - for(int i = 0; i < taskNonConformingRows.size(); ++i) { - auto t = taskNonConformingRows[i]; - t = std::make_tuple(std::get<0>(t) + rowDelta, std::get<1>(t)); - taskNonConformingRows[i] = t; - } - - // debug trace issues - using namespace std; - std::string task_name = "unknown"; - if(task->type() == TaskType::UDFTRAFOTASK) - task_name = "udf trafo task"; - if(task->type() == TaskType::RESOLVE) - task_name = "resolve"; - - setExceptionInfo(taskOutput, taskGeneralOutput, partitionToExceptionsMap); - std::copy(taskOutput.begin(), taskOutput.end(), std::back_inserter(output)); - std::copy(taskRemainingExceptions.begin(), taskRemainingExceptions.end(), std::back_inserter(remainingExceptions)); - std::copy(taskGeneralOutput.begin(), taskGeneralOutput.end(), std::back_inserter(generalOutput)); - std::copy(taskNonConformingRows.begin(), taskNonConformingRows.end(), std::back_inserter(nonConformingRows)); - - // compute the delta used to offset records! 
- for (const auto &p : taskOutput) - rowDelta += p->getNumRows(); - for (const auto &p : taskGeneralOutput) - rowDelta += p->getNumRows(); - rowDelta += taskNonConformingRows.size(); - } - - tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); + tstage->setMemoryResult(normalPartitions, generalPartitions, fallbackPartitions, partitionGroups, exceptionCounts); break; } case EndPointMode::HASHTABLE: { @@ -1247,6 +1502,21 @@ namespace tuplex { Logger::instance().defaultLogger().info(ss.str()); } +// if(_driver) +// _driver->freeAllPartitionsOfContext(&context()); +// for(auto exec : _executors) +// exec->freeAllPartitionsOfContext(&context()); + + +// for (auto task : completedTasks) +// task->freePartitions(); + + if (!_options.OPT_INCREMENTAL_RESOLUTION()) { + for (auto &p : exceptionPartitions) { + p->invalidate(); + } + } + freeTasks(completedTasks); // update metrics @@ -1262,6 +1532,140 @@ namespace tuplex { Logger::instance().defaultLogger().info(ss.str()); } +// void LocalBackend::setPartitionMergeInfo(const std::vector& normalPartitions, +// const std::vector& generalPartitions, const size_t generalStartInd, +// const std::vector& fallbackPartitions, const size_t fallbackStartInd, +// std::vector& partitionMergeInfo) { +// +// +// +// +// auto generalInd = 0; +// auto generalRowOff = 0; +// auto generalByteOff = 0; +// auto generalRowsInPartition = 0; +// const uint8_t *generalPtr = nullptr; +// if (!generalPartitions.empty()) { +// generalRowsInPartition = generalPartitions[0]->getNumRows(); +// generalPtr = generalPartitions[0]->lock(); +// } +// +// auto fallbackInd = 0; +// auto fallbackRowOff = 0; +// auto fallbackByteOff = 0; +// auto fallbackRowsInPartition = 0; +// const uint8_t *fallbackPtr = nullptr; +// if (!fallbackPartitions.empty()) { +// fallbackRowsInPartition = fallbackPartitions[0]->getNumRows(); +// fallbackPtr = fallbackPartitions[0]->lock(); +// } +// +// auto exceptionInd = 0; +// auto exceptionRowOff = 0; +// auto exceptionByteOff = 0; +// auto exceptionRowsInPartition = 0; +// const uint8_t *exceptionPtr = nullptr; +// if (!exceptionPartitions.empty()) { +// exceptionRowsInPartition = exceptionPartitions[0]->getNumRows(); +// exceptionPtr = exceptionPartitions[0]->lock(); +// } +// +// auto totalRowCounter = 0; +// auto rowDelta = 0; +// for (const auto &p : normalPartitions) { +// auto mergeInfo = MergeInfo(); +// mergeInfo.setRowDelta(rowDelta); +// auto numNormalRows = p->getNumRows(); +// +// auto generalRowCounter = 0; +// auto curGeneralStartInd = generalInd + generalStartInd; +// auto curGeneralRowOff = generalRowOff; +// auto curGeneralByteOff = generalByteOff; +// while (generalPtr && *((int64_t*)generalPtr) <= totalRowCounter + numNormalRows) { +// generalRowCounter++; +// totalRowCounter++; +// +// auto dataSize = ((int64_t*)generalPtr)[3] + 4*sizeof(int64_t); +// generalByteOff += dataSize; +// generalPtr += dataSize; +// generalRowOff++; +// +// if (generalRowOff == generalRowsInPartition) { +// generalPartitions[generalInd]->unlock(); +// generalInd++; +// if (generalInd < generalPartitions.size()) { +// generalPtr = generalPartitions[generalInd]->lock(); +// generalRowsInPartition = generalPartitions[generalInd]->getNumRows(); +// generalRowOff = 0; +// generalByteOff = 0; +// } else { +// generalPtr = nullptr; +// } +// } +// } +// mergeInfo.setGeneralInfo(generalRowCounter, curGeneralStartInd, curGeneralRowOff, curGeneralByteOff); +// +// auto fallbackRowCounter = 0; +// 
auto curFallbackStartInd = fallbackInd + fallbackStartInd; +// auto curFallbackRowOff = fallbackRowOff; +// auto curFallbackByteOff = fallbackByteOff; +// while (fallbackPtr && *((int64_t*)fallbackPtr) <= totalRowCounter + numNormalRows + generalRowCounter) { +// fallbackRowCounter++; +// totalRowCounter++; +// +// auto dataSize = ((int64_t*)fallbackPtr)[1] + 2*sizeof(int64_t); +// fallbackByteOff += dataSize; +// fallbackPtr += dataSize; +// fallbackRowOff++; +// +// if (fallbackRowOff == fallbackRowsInPartition) { +// fallbackPartitions[fallbackInd]->unlock(); +// fallbackInd++; +// if (fallbackInd < fallbackPartitions.size()) { +// fallbackPtr = fallbackPartitions[fallbackInd]->lock(); +// fallbackRowsInPartition = fallbackPartitions[fallbackInd]->getNumRows(); +// fallbackRowOff = 0; +// fallbackByteOff = 0; +// } else { +// fallbackPtr = nullptr; +// } +// } +// } +// mergeInfo.setFallbackInfo(fallbackRowCounter, curFallbackStartInd, curFallbackRowOff, curFallbackByteOff); +// +// auto exceptionRowCounter = 0; +// auto curExceptionStartInd = exceptionInd + exceptionStartInd; +// auto curExceptionRowOff = exceptionRowOff; +// auto curExceptionByteOff = exceptionByteOff; +// while (exceptionPtr && *((int64_t*)exceptionPtr) <= totalRowCounter + numNormalRows + generalRowCounter + fallbackRowCounter) { +// exceptionRowCounter++; +// totalRowCounter++; +// +// auto dataSize = ((int64_t*)exceptionPtr)[3] + 4*sizeof(int64_t); +// exceptionByteOff += dataSize; +// exceptionPtr += dataSize; +// exceptionRowOff++; +// +// if (exceptionRowOff == exceptionRowsInPartition) { +// exceptionPartitions[exceptionInd]->unlock(); +// exceptionInd++; +// if (exceptionInd < exceptionPartitions.size()) { +// exceptionPtr = exceptionPartitions[exceptionInd]->lock(); +// exceptionRowsInPartition = exceptionPartitions[exceptionInd]->getNumRows(); +// exceptionRowOff = 0; +// exceptionByteOff = 0; +// } else { +// exceptionPtr = nullptr; +// } +// } +// } +// mergeInfo.setExceptionInfo(exceptionRowCounter, curExceptionStartInd, curExceptionRowOff, curExceptionByteOff); +// +// rowDelta += numNormalRows + generalRowCounter + fallbackRowCounter + exceptionRowCounter; +// partitionMergeInfo.push_back(mergeInfo); +// } +// } + std::vector LocalBackend::resolveViaSlowPath( std::vector &tasks, bool merge_rows_in_order, @@ -1391,7 +1795,7 @@ namespace tuplex { else if(compareOrders(maxOrder, tt->getOrder())) maxOrder = tt->getOrder(); - if (tt->exceptionCounts().size() > 0 || tt->inputExceptionInfo().numExceptions > 0) { + if (tt->exceptionCounts().size() > 0 || !tt->generalPartitions().empty() || !tt->fallbackPartitions().empty()) { // task found with exceptions in it => exception partitions need to be resolved using special functor // hash-table output not yet supported @@ -1407,8 +1811,8 @@ namespace tuplex { tstage->context().id(), tt->getOutputPartitions(), tt->getExceptionPartitions(), - tt->inputExceptions(), - tt->inputExceptionInfo(), + tt->generalPartitions(), + tt->fallbackPartitions(), opsToCheck, exceptionInputSchema, compiledSlowPathOutputSchema, @@ -1498,11 +1902,6 @@ namespace tuplex { // cout<<"*** git "<inputExceptions()) { - p->invalidate(); - } - // cout<<"*** total number of tasks to return is "<hasNextPartition()) { - Partition* p = rs->getNextPartition(); + while(rs->hasNextNormalPartition()) { + Partition* p = rs->getNextNormalPartition(); // lock partition! 
auto ptr = p->lockRaw(); @@ -1946,7 +2345,7 @@ namespace tuplex { } } - void LocalBackend::writeOutput(TransformStage *tstage, std::vector &tasks) { + size_t LocalBackend::writeOutput(TransformStage *tstage, std::vector &tasks, size_t startFileNumber) { using namespace std; Timer timer; @@ -1973,8 +2372,13 @@ namespace tuplex { auto ecounts = calcExceptionCounts(tasks); + if (outputs.empty()) { + tstage->setFileResult(ecounts); + return startFileNumber; + } + // write to one file - int partNo = 0; + int partNo = startFileNumber; auto outputFilePath = outputURI(udf, uri, partNo, fmt); // check that outputFilePath is NOT empty. @@ -2071,7 +2475,6 @@ namespace tuplex { // run using queue! // execute tasks using work queue. auto completedTasks = performTasks(wtasks); - if(header) { delete [] header; header = nullptr; @@ -2079,5 +2482,7 @@ namespace tuplex { Logger::instance().defaultLogger().info("writing output took " + std::to_string(timer.time()) + "s"); tstage->setFileResult(ecounts); + + return partNo; } } // namespace tuplex \ No newline at end of file diff --git a/tuplex/core/src/logical/CacheOperator.cc b/tuplex/core/src/logical/CacheOperator.cc index f71522f21..4a571599c 100644 --- a/tuplex/core/src/logical/CacheOperator.cc +++ b/tuplex/core/src/logical/CacheOperator.cc @@ -22,19 +22,16 @@ namespace tuplex { LogicalOperator::copyMembers(other); auto cop = (CacheOperator*)other; setSchema(other->getOutputSchema()); - _normalCasePartitions = cop->cachedPartitions(); - _generalCasePartitions = cop->cachedExceptions(); - _partitionToExceptionsMap = cop->partitionToExceptionsMap(); - // copy python objects and incref for each! - _py_objects = cop->_py_objects; - python::lockGIL(); - for(auto obj : _py_objects) - Py_XINCREF(obj); - python::unlockGIL(); + _normalPartitions = cop->cachedNormalPartitions(); + _generalPartitions = cop->cachedGeneralPartitions(); + _fallbackPartitions = cop->cachedFallbackPartitions(); + _partitionGroups = cop->partitionGroups(); + _optimizedSchema = cop->_optimizedSchema; _cached = cop->_cached; - _normalCaseRowCount = cop->_normalCaseRowCount; - _generalCaseRowCount = cop->_generalCaseRowCount; + _normalRowCount = cop->_normalRowCount; + _generalRowCount = cop->_generalRowCount; + _fallbackRowCount = cop->_fallbackRowCount; _columns = cop->_columns; _sample = cop->_sample; _storeSpecialized = cop->_storeSpecialized; @@ -60,7 +57,7 @@ namespace tuplex { // is operator cached? => return combined cost! // @NOTE: could make exceptions more expensive than normal rows if(isCached()) { - return _generalCaseRowCount + _normalCaseRowCount; + return _generalRowCount + _fallbackRowCount + _normalRowCount; } else { // return parent cost return parent()->cost(); @@ -73,30 +70,29 @@ namespace tuplex { _cached = true; // fetch both partitions (consume) from resultset + any unresolved exceptions - _normalCasePartitions = rs->partitions(); - for(auto p : _normalCasePartitions) + _normalPartitions = rs->normalPartitions(); + for(auto p : _normalPartitions) p->makeImmortal(); - // @TODO: there are two sorts of exceptions here... - // i.e. separate normal-case violations out from the rest - // => these can be stored separately for faster processing! - // @TODO: right now, everything just gets cached... 
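Together with the `startFileNumber` parameter added to `writeOutput` above, making the cached partitions immortal here is what lets a later incremental run pick up where the previous one committed: part numbering resumes at the cached start file number, so newly resolved rows land in fresh part files instead of overwriting already-written output. A rough sketch of that numbering contract (the part-file naming is illustrative only; the real path comes from `outputURI(udf, uri, partNo, fmt)`):

```
#include <cstddef>
#include <string>
#include <vector>

// Illustrative name pattern, not the actual outputURI format.
std::string partFileName(std::size_t partNo) {
    return "output/part" + std::to_string(partNo) + ".csv";
}

// Mirrors writeOutput's contract: start at the cached file number and
// return the next free one, which the cache entry stores for the next run.
std::size_t writeParts(const std::vector<std::string>& outputs, std::size_t startFileNumber) {
    std::size_t partNo = startFileNumber;
    for (const auto& out : outputs) {
        (void)out;                 // write `out` to partFileName(partNo) here
        ++partNo;
    }
    return partNo;                 // becomes startFileNumber of the following run
}
```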
+ _generalPartitions = rs->generalPartitions(); + for(auto p : _generalPartitions) + p->makeImmortal(); - _generalCasePartitions = rs->exceptions(); - for(auto p : _generalCasePartitions) + _fallbackPartitions = rs->fallbackPartitions(); + for(auto p : _fallbackPartitions) p->makeImmortal(); - _partitionToExceptionsMap = rs->partitionToExceptionsMap(); + _partitionGroups = rs->partitionGroups(); // check whether partitions have different schema than the currently set one // => i.e. they have been specialized. - if(!_normalCasePartitions.empty()) { - _optimizedSchema = _normalCasePartitions.front()->schema(); + if(!_normalPartitions.empty()) { + _optimizedSchema = _normalPartitions.front()->schema(); assert(_optimizedSchema != Schema::UNKNOWN); } // if exceptions are empty, then force output schema to be the optimized one as well! - if(_generalCasePartitions.empty()) + if(_generalPartitions.empty()) setSchema(_optimizedSchema); // because the schema might have changed due to the result, need to update the dataset! @@ -104,36 +100,46 @@ namespace tuplex { getDataSet()->setSchema(getOutputSchema()); // print out some statistics about cached data - size_t cachedPartitionsMemory = 0; - size_t totalCachedPartitionsMemory = 0; - size_t totalCachedRows = 0; - size_t cachedExceptionsMemory = 0; - size_t totalCachedExceptionsMemory = 0; - size_t totalCachedExceptions = 0; - - int pos = 0; - for(auto p : _normalCasePartitions) { - totalCachedRows += p->getNumRows(); - cachedPartitionsMemory += p->bytesWritten(); - totalCachedPartitionsMemory += p->size(); - pos++; + size_t normalBytesWritten = 0; + size_t normalCapacity = 0; + size_t normalRows = 0; + size_t generalBytesWritten = 0; + size_t generalCapacity = 0; + size_t generalRows = 0; + size_t fallbackBytesWritten = 0; + size_t fallbackCapacity = 0; + size_t fallbackRows = 0; + + for(const auto &p : _normalPartitions) { + normalRows += p->getNumRows(); + normalBytesWritten += p->bytesWritten(); + normalCapacity += p->size(); } - for(auto p : _generalCasePartitions) { - totalCachedExceptions += p->getNumRows(); - cachedExceptionsMemory += p->bytesWritten(); - totalCachedExceptionsMemory += p->size(); + for(const auto &p : _generalPartitions) { + generalRows += p->getNumRows(); + generalBytesWritten += p->bytesWritten(); + generalCapacity += p->size(); } + for(const auto &p : _fallbackPartitions) { + fallbackRows += p->getNumRows(); + fallbackBytesWritten += p->bytesWritten(); + fallbackCapacity += p->size(); + } + - _normalCaseRowCount = totalCachedRows; - _generalCaseRowCount = totalCachedExceptions; + _normalRowCount = normalRows; + _generalRowCount = generalRows; + _fallbackRowCount = fallbackRows; stringstream ss; - ss<<"Cached "<getNumRows(); + } + for(const auto &p : _generalPartitions) { totalCachedRows += p->getNumRows(); } - for(auto p : _generalCasePartitions) { + for (const auto &p : _fallbackPartitions) { totalCachedRows += p->getNumRows(); } return totalCachedRows; diff --git a/tuplex/core/src/logical/LogicalPlan.cc b/tuplex/core/src/logical/LogicalPlan.cc index f9322a203..25449dc28 100644 --- a/tuplex/core/src/logical/LogicalPlan.cc +++ b/tuplex/core/src/logical/LogicalPlan.cc @@ -58,6 +58,8 @@ namespace tuplex { // optimize first if desired (context options object) // ==> optimize creates a copy if required + incrementalResolution(context); + auto optimized_plan = optimize(context, !copy_required); // overwrite double logical_optimization_time = timer.time(); @@ -68,6 +70,40 @@ namespace tuplex { return new 
PhysicalPlan(optimized_plan, this, context); } + void updateIDs(LogicalOperator *previous, LogicalOperator *current) { + std::queue<LogicalOperator*> currentQ; + std::queue<LogicalOperator*> previousQ; + currentQ.push(current); + previousQ.push(previous); + bool updated = false; + while(!currentQ.empty() && !previousQ.empty()) { + auto curNode = currentQ.front(); currentQ.pop(); + auto prevNode = previousQ.front(); previousQ.pop(); + + if (!updated && (curNode->type() == LogicalOperatorType::RESOLVE || curNode->type() == LogicalOperatorType::IGNORE)) { + curNode = curNode->parent(); + updated = true; + } + + curNode->setID(prevNode->getID()); + for (auto parent : curNode->parents()) { + currentQ.push(parent); + } + for (auto parent : prevNode->parents()) { + previousQ.push(parent); + } + } + } + + void LogicalPlan::incrementalResolution(const Context& context) { + // If a cache entry exists, we need to copy the operator IDs from the previous pipeline over to the current one, + // because the cached exceptions are already encoded with the previous pipeline's operator IDs. + auto cache = context.getIncrementalCache(); + auto cacheEntry = cache->getEntry(IncrementalCache::newKey(_action)); + if (cacheEntry && context.getOptions().OPT_INCREMENTAL_RESOLUTION()) { + updateIDs(cacheEntry->pipeline(), _action); + } + } void rewriteAllFollowingResolvers(LogicalOperator* op, const std::unordered_map<int64_t, int64_t>& rewriteMap) { // go over children (single!) diff --git a/tuplex/core/src/logical/ParallelizeOperator.cc b/tuplex/core/src/logical/ParallelizeOperator.cc index 770ac2d4f..3ea6916e6 100644 --- a/tuplex/core/src/logical/ParallelizeOperator.cc +++ b/tuplex/core/src/logical/ParallelizeOperator.cc @@ -12,15 +12,15 @@ namespace tuplex { ParallelizeOperator::ParallelizeOperator(const Schema& schema, - const std::vector<Partition*>& partitions, - const std::vector<std::string>& columns) : _partitions(partitions), - _columnNames(columns) { + const std::vector<Partition*>& normalPartitions, + const std::vector<std::string>& columns) : _normalPartitions(normalPartitions), + _columnNames(columns) { setSchema(schema); // parallelize operator holds data in memory for infinite lifetime. // => make partitions immortal - for(auto& partition : _partitions) + for(auto& partition : _normalPartitions) partition->makeImmortal(); // get sample @@ -31,15 +31,15 @@ namespace tuplex { _sample.clear(); // todo: general python objects from parallelize... - if(!_partitions.empty()) { + if(!_normalPartitions.empty()) { auto maxRows = getDataSet() ? getDataSet()->getContext()->getOptions().CSV_MAX_DETECTION_ROWS() : MAX_TYPE_SAMPLING_ROWS; // @TODO: change this variable/config name // fetch up to maxRows from partitions! - auto schema = _partitions.front()->schema(); + auto schema = _normalPartitions.front()->schema(); Deserializer ds(schema); size_t rowCount = 0; size_t numBytesRead = 0; - for(auto p : _partitions) { + for(auto p : _normalPartitions) { const uint8_t* ptr = p->lockRaw(); auto partitionRowCount = *(int64_t*)ptr; ptr += sizeof(int64_t); @@ -59,8 +59,8 @@ namespace tuplex { } } - std::vector<Partition*> ParallelizeOperator::getPartitions() { - return _partitions; + std::vector<Partition*> ParallelizeOperator::getNormalPartitions() { + return _normalPartitions; } bool ParallelizeOperator::good() const { @@ -69,7 +69,7 @@ namespace tuplex { std::vector<Row> ParallelizeOperator::getSample(const size_t num) const { // samples exist?
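`updateIDs` above walks the previous and the re-submitted logical plan in lockstep so that cached exceptions, which embed the previous plan's operator IDs, still match the operators of the new plan; the first RESOLVE/IGNORE node encountered in the new plan is stepped over once so the two traversals stay aligned. A self-contained sketch of the same idea on a simplified linear plan (the real code uses queues over `parents()` to handle multiple parents):

```
// Simplified stand-in: a linear pipeline node (the real walk handles DAGs).
struct Node {
    int id;
    bool isResolver;        // RESOLVE or IGNORE in the real plan
    Node* parent = nullptr; // next operator upstream
};

// Copy IDs from the previous plan onto the current one, skipping the single
// newly inserted resolver so both walks visit corresponding operators.
void updateIDsLinear(Node* previous, Node* current) {
    bool skipped = false;
    while (previous && current) {
        if (!skipped && current->isResolver) {
            current = current->parent;   // step past the new resolver once
            skipped = true;
            continue;
        }
        current->id = previous->id;      // cached exceptions reference this ID
        previous = previous->parent;
        current = current->parent;
    }
}
```

This is also why `newKey` ignores RESOLVE/IGNORE operators when hashing the plan: two pipelines that differ only in resolvers must map to the same cache entry.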
- if(_partitions.empty() || 0 == num) { + if(_normalPartitions.empty() || 0 == num) { return std::vector(); } @@ -109,11 +109,11 @@ namespace tuplex { } LogicalOperator *ParallelizeOperator::clone() { - auto copy = new ParallelizeOperator(getOutputSchema(), _partitions, columns()); + auto copy = new ParallelizeOperator(getOutputSchema(), _normalPartitions, columns()); copy->setDataSet(getDataSet()); copy->copyMembers(this); - copy->setPythonObjects(_pythonObjects); - copy->setInputPartitionToPythonObjectsMap(_inputPartitionToPythonObjectsMap); + copy->setFallbackPartitions(_fallbackPartitions); + copy->setPartitionGroups(_partitionGroups); assert(getID() == copy->getID()); return copy; } @@ -121,7 +121,9 @@ namespace tuplex { int64_t ParallelizeOperator::cost() const { // use #rows stored in partitions int64_t numRows = 0; - for(auto p : _partitions) + for(const auto& p : _normalPartitions) + numRows += p->getNumRows(); + for(const auto& p : _fallbackPartitions) numRows += p->getNumRows(); return numRows; } diff --git a/tuplex/core/src/physical/BlockBasedTaskBuilder.cc b/tuplex/core/src/physical/BlockBasedTaskBuilder.cc index 80e21c0a1..111e97d8f 100644 --- a/tuplex/core/src/physical/BlockBasedTaskBuilder.cc +++ b/tuplex/core/src/physical/BlockBasedTaskBuilder.cc @@ -58,12 +58,23 @@ namespace tuplex { FunctionType* read_block_type = FunctionType::get(env().i64Type(), {env().i8ptrType(), env().i8ptrType(), env().i64Type(), + env().i64Type()->getPointerTo(0), + env().i64Type()->getPointerTo(0), + env().getBooleanType(), + env().i64Type()->getPointerTo(0), + env().i64Type()->getPointerTo(0), + env().i64Type()->getPointerTo(0), + env().i64Type()->getPointerTo(0), env().i8ptrType()->getPointerTo(0), + env().i64Type(), env().i64Type()->getPointerTo(0), + env().i64Type()->getPointerTo(0), + env().i64Type()->getPointerTo(0), + env().i8ptrType()->getPointerTo(0), env().i64Type(), env().i64Type()->getPointerTo(0), env().i64Type()->getPointerTo(0), - env().getBooleanType()}, false); + env().i64Type()->getPointerTo(0)}, false); // create function and set argNames Function* read_block_func = Function::Create(read_block_type, Function::ExternalLinkage, _desiredFuncName, env().getModule().get()); @@ -76,12 +87,24 @@ namespace tuplex { vector argNames{"userData", "inPtr", "inSize", - "expPtrs", - "expPtrSizes", - "numExps", "outNormalRowCount", "outBadRowCount", - "ignoreLastRow"}; + "ignoreLastRow", + "totalFilterCounter", + "totalNormalRowCounter", + "totalGeneralRowCounter", + "totalFallbackRowCounter", + "generalPartitions", + "numGeneralPartitions", + "generalIndexOffset", + "generalRowOffset", + "generalByteOffset", + "fallbackPartitions", + "numFallbackPartitions", + "fallbackIndexOffset", + "fallbackRowOffset", + "fallbackByteOffset"}; + for(int i = 0; i < argNames.size(); ++i) { args[i]->setName(argNames[i]); _args[argNames[i]] = args[i]; diff --git a/tuplex/core/src/physical/ExceptionSourceTaskBuilder.cc b/tuplex/core/src/physical/ExceptionSourceTaskBuilder.cc index b3bd3847f..dd37a1c07 100644 --- a/tuplex/core/src/physical/ExceptionSourceTaskBuilder.cc +++ b/tuplex/core/src/physical/ExceptionSourceTaskBuilder.cc @@ -106,89 +106,115 @@ namespace tuplex { assert(read_block_func); + // Initialize context auto& context = env().getContext(); + // Load function arguments auto argUserData = arg("userData"); auto argInPtr = arg("inPtr"); auto argInSize = arg("inSize"); - auto argExpPtrs = arg("expPtrs"); - auto argExpPtrSizes = arg("expPtrSizes"); - auto argNumExps = arg("numExps"); auto 
argOutNormalRowCount = arg("outNormalRowCount"); auto argOutBadRowCount = arg("outBadRowCount"); auto argIgnoreLastRow = arg("ignoreLastRow"); - + auto totalFilterCounter = arg("totalFilterCounter"); + auto totalNormalRowCounter = arg("totalNormalRowCounter"); + auto totalGeneralRowCounter = arg("totalGeneralRowCounter"); + auto totalFallbackRowCounter = arg("totalFallbackRowCounter"); + auto generalPartitions = arg("generalPartitions"); + auto numGeneralPartitions = arg("numGeneralPartitions"); + auto generalIndexOffset = arg("generalIndexOffset"); + auto generalRowOffset = arg("generalRowOffset"); + auto generalByteOffset = arg("generalByteOffset"); + auto fallbackPartitions = arg("fallbackPartitions"); + auto numFallbackPartitions = arg("numFallbackPartitions"); + auto fallbackIndexOffset = arg("fallbackIndexOffset"); + auto fallbackRowOffset = arg("fallbackRowOffset"); + auto fallbackByteOffset = arg("fallbackByteOffset"); + + // Initialize function body BasicBlock *bbBody = BasicBlock::Create(context, "entry", read_block_func); IRBuilder<> builder(bbBody); - - // there should be a check if argInSize is 0 - // if so -> handle separately, i.e. return immediately -#warning "add here argInSize > 0 check" - - - // compute endptr from args - Value *endPtr = builder.CreateGEP(argInPtr, argInSize, "endPtr"); - Value *currentPtrVar = builder.CreateAlloca(env().i8ptrType(), 0, nullptr, "readPtrVar"); - // later use combi of normal & bad rows - Value *outRowCountVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "outRowCountVar"); // counter for output row number (used for exception resolution) + // Define basic blocks for function + auto bbInitializeGeneral = llvm::BasicBlock::Create(context, "initialize_general", builder.GetInsertBlock()->getParent()); + auto bbDeclareFallback = llvm::BasicBlock::Create(context, "declare_fallback", builder.GetInsertBlock()->getParent()); + auto bbInitializeFallback = llvm::BasicBlock::Create(context, "initialize_fallback", builder.GetInsertBlock()->getParent()); + auto bbUpdateGeneralCond = llvm::BasicBlock::Create(context, "update_general_cond", builder.GetInsertBlock()->getParent()); + auto bbUpdateGeneralBody = llvm::BasicBlock::Create(context, "update_general_body", builder.GetInsertBlock()->getParent()); + auto bbNextGeneralPartition = llvm::BasicBlock::Create(context, "next_general_partition", builder.GetInsertBlock()->getParent()); + auto bbUpdateFallbackCond = llvm::BasicBlock::Create(context, "update_fallback_cond", builder.GetInsertBlock()->getParent()); + auto bbUpdateFallbackBody = llvm::BasicBlock::Create(context, "update_fallback_body", builder.GetInsertBlock()->getParent()); + auto bbNextFallbackPartition = llvm::BasicBlock::Create(context, "next_fallback_partition", builder.GetInsertBlock()->getParent()); + auto bbUpdateDone = llvm::BasicBlock::Create(context, "update_done", builder.GetInsertBlock()->getParent()); + auto bbLoopCondition = BasicBlock::Create(context, "loop_cond", read_block_func); + auto bbLoopBody = BasicBlock::Create(context, "loop_body", read_block_func); + auto bbLoopDone = BasicBlock::Create(context, "loop_done", read_block_func); + + // Initialize values for normal partitions + auto endPtr = builder.CreateGEP(argInPtr, argInSize, "endPtr"); + auto currentPtrVar = builder.CreateAlloca(env().i8ptrType(), 0, nullptr, "readPtrVar"); + auto outRowCountVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "outRowCountVar"); // counter for output row number (used for exception resolution) builder.CreateStore(argInPtr, 
currentPtrVar); - - Value *normalRowCountVar = argOutNormalRowCount; - Value *badRowCountVar = argOutBadRowCount; - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(argOutBadRowCount), - builder.CreateLoad(argOutNormalRowCount)), outRowCountVar); - - // current index into array of exception partitions - auto curExpIndVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "curExpIndVar"); - builder.CreateStore(env().i64Const(0), curExpIndVar); - - // current partition pointer - auto curExpPtrVar = builder.CreateAlloca(env().i8ptrType(), 0, nullptr, "curExpPtrVar"); - builder.CreateStore(builder.CreateLoad(argExpPtrs), curExpPtrVar); - - // number of rows total in current partition - auto curExpNumRowsVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "curExpNumRowsVar"); - builder.CreateStore(builder.CreateLoad(argExpPtrSizes), curExpNumRowsVar); - - // current row number in current partition - auto curExpCurRowVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "curExpCurRowVar"); - builder.CreateStore(env().i64Const(0), curExpCurRowVar); - - // accumulator used to update exception indices when rows are filtered, counts number of previously fitlered rows - auto expAccVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "expAccVar"); - builder.CreateStore(env().i64Const(0), expAccVar); - - // used to see if rows are filtered during pipeline execution - auto prevRowNumVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "prevRowNumVar"); - auto prevBadRowNumVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "prevBadRowNumVar"); - - // current number of exceptions prosessed across all partitions - auto expCurRowVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "expCurRowVar"); - builder.CreateStore(env().i64Const(0), expCurRowVar); - + // Update the arguments at the end + auto normalRowCountVar = argOutNormalRowCount; + auto badRowCountVar = argOutBadRowCount; + builder.CreateStore(builder.CreateAdd(builder.CreateLoad(argOutBadRowCount),builder.CreateLoad(argOutNormalRowCount)), outRowCountVar); // get num rows to read & process in loop - Value *numRowsVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "numRowsVar"); - Value *input_ptr = builder.CreatePointerCast(argInPtr, env().i64Type()->getPointerTo(0)); + auto numRowsVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "numRowsVar"); + auto input_ptr = builder.CreatePointerCast(argInPtr, env().i64Type()->getPointerTo(0)); builder.CreateStore(builder.CreateLoad(input_ptr), numRowsVar); // store current input ptr - Value *currentInputPtrVar = builder.CreateAlloca(env().i8ptrType(), 0, nullptr, "ptr"); + auto currentInputPtrVar = builder.CreateAlloca(env().i8ptrType(), 0, nullptr, "ptr"); builder.CreateStore(builder.CreateGEP(argInPtr, env().i32Const(sizeof(int64_t))), currentInputPtrVar); - - // variable for current row number... 
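Spelled out as a plain C-style declaration, the read-block interface assembled in BlockBasedTaskBuilder above comes out roughly as follows. Parameter names are taken verbatim from the argNames list; the declaration itself does not exist in the codebase, and the LLVM boolean type is written as `bool` for readability:

```
#include <cstdint>

// Reading of the FunctionType built in BlockBasedTaskBuilder: i64 return value,
// i8*/i64* pointers, and i8** arrays holding one buffer pointer per serialized
// general/fallback partition.
extern "C" int64_t read_block(
    void*     userData,
    uint8_t*  inPtr,                  int64_t  inSize,
    int64_t*  outNormalRowCount,      int64_t* outBadRowCount,
    bool      ignoreLastRow,
    int64_t*  totalFilterCounter,     int64_t* totalNormalRowCounter,
    int64_t*  totalGeneralRowCounter, int64_t* totalFallbackRowCounter,
    uint8_t** generalPartitions,      int64_t  numGeneralPartitions,
    int64_t*  generalIndexOffset,     int64_t* generalRowOffset,
    int64_t*  generalByteOffset,
    uint8_t** fallbackPartitions,     int64_t  numFallbackPartitions,
    int64_t*  fallbackIndexOffset,    int64_t* fallbackRowOffset,
    int64_t*  fallbackByteOffset);
```

The offset triples (index, row, byte) are in-out parameters: the generated code resumes scanning the general and fallback partitions where the previous block left off and writes the updated cursor back for the next call.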
- Value *rowVar = builder.CreateAlloca(env().i64Type(), 0, nullptr); + auto rowVar = builder.CreateAlloca(env().i64Type(), 0, nullptr); builder.CreateStore(env().i64Const(0), rowVar); - BasicBlock* bbLoopCondition = BasicBlock::Create(context, "loop_cond", read_block_func); - BasicBlock* bbLoopBody = BasicBlock::Create(context, "loop_body", read_block_func); - BasicBlock* bbLoopDone = BasicBlock::Create(context, "loop_done", read_block_func); + // used to see if rows are filtered during pipeline execution + auto prevRowNumVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "prevRowNumVar"); + auto prevBadRowNumVar = builder.CreateAlloca(env().i64Type(), 0, nullptr, "prevBadRowNumVar"); - // go from entry block to loop body + // Initialize values for index updating + // uint8_t *curGeneralPtr; + // int64_t curGeneralNumRows = 0; + // if (*generalIndexOffset < numGeneralPartitions) { + // curGeneralPtr = generalPartitions[*generalIndexOffset]; + // curGeneralNumRows = *curGeneralPtr; + // curGeneralPtr += sizeof(int64_t) + *generalByteOffset; + // } + auto curGeneralPtr = builder.CreateAlloca(env().i8ptrType(), 0, nullptr, "curGeneralPtr"); + auto curGeneralNumRows = builder.CreateAlloca(env().i64Type(), 0, nullptr, "curGeneralNumRows"); + builder.CreateStore(env().i64Const(0), curGeneralNumRows); + auto shouldInitializeGeneral = builder.CreateICmpSLT(builder.CreateLoad(generalIndexOffset), numGeneralPartitions); + builder.CreateCondBr(shouldInitializeGeneral, bbInitializeGeneral, bbDeclareFallback); + + builder.SetInsertPoint(bbInitializeGeneral); + builder.CreateStore(builder.CreateLoad(builder.CreateGEP(generalPartitions, builder.CreateLoad(generalIndexOffset))), curGeneralPtr); + builder.CreateStore(builder.CreateLoad(builder.CreatePointerCast(builder.CreateLoad(curGeneralPtr), env().i64ptrType())), curGeneralNumRows); + builder.CreateStore(builder.CreateGEP(builder.CreateLoad(curGeneralPtr), builder.CreateAdd(env().i64Const(sizeof(int64_t)), builder.CreateLoad(generalByteOffset))), curGeneralPtr); + builder.CreateBr(bbDeclareFallback); + + // uint8_t *curFallbackPtr; + // int64_t curFallbackNumRows = 0; + // if (*fallbackIndexOffset < numFallbackPartitions) { + // curFallbackPtr = fallbackPartitions[*fallbackIndexOffset]; + // curFallbackNumRows = *curFallbackPtr; + // curFallbackPtr += sizeof(int64_t) + *fallbackByteOffset; + // } + builder.SetInsertPoint(bbDeclareFallback); + auto curFallbackPtr = builder.CreateAlloca(env().i8ptrType(), 0, nullptr, "curFallbackPtr"); + auto curFallbackNumRows = builder.CreateAlloca(env().i64Type(), 0, nullptr, "curFallbackNumRows"); + builder.CreateStore(env().i64Const(0), curFallbackNumRows); + auto shouldInitializeFallback = builder.CreateICmpSLT(builder.CreateLoad(fallbackIndexOffset), numFallbackPartitions); + builder.CreateCondBr(shouldInitializeFallback, bbInitializeFallback, bbLoopBody); + + builder.SetInsertPoint(bbInitializeFallback); + builder.CreateStore(builder.CreateLoad(builder.CreateGEP(fallbackPartitions, builder.CreateLoad(fallbackIndexOffset))), curFallbackPtr); + builder.CreateStore(builder.CreateLoad(builder.CreatePointerCast(builder.CreateLoad(curFallbackPtr), env().i64ptrType())), curFallbackNumRows); + builder.CreateStore(builder.CreateGEP(builder.CreateLoad(curFallbackPtr), builder.CreateAdd(env().i64Const(sizeof(int64_t)), builder.CreateLoad(fallbackByteOffset))), curFallbackPtr); builder.CreateBr(bbLoopBody); - // -------------- // loop condition builder.SetInsertPoint(bbLoopCondition); Value *row = 
builder.CreateLoad(rowVar, "row"); @@ -198,8 +224,6 @@ namespace tuplex { auto cond = builder.CreateICmpSLT(nextRow, numRows); builder.CreateCondBr(cond, bbLoopBody, bbLoopDone); - - // --------- // loop body builder.SetInsertPoint(bbLoopBody); // decode tuple from input ptr @@ -207,9 +231,8 @@ namespace tuplex { ft.init(_inputRowType); Value* oldInputPtr = builder.CreateLoad(currentInputPtrVar, "ptr"); ft.deserializationCode(builder, oldInputPtr); - Value* newInputPtr = builder.CreateGEP(oldInputPtr, ft.getSize(builder)); // @TODO: maybe use inbounds + Value* newInputPtr = builder.CreateGEP(oldInputPtr, ft.getSize(builder)); builder.CreateStore(newInputPtr, currentInputPtrVar); - builder.CreateStore(builder.CreateLoad(outRowCountVar), prevRowNumVar); builder.CreateStore(builder.CreateLoad(badRowCountVar), prevBadRowNumVar); @@ -218,117 +241,113 @@ namespace tuplex { Value *inputRowSize = ft.getSize(builder); processRow(builder, argUserData, ft, normalRowCountVar, badRowCountVar, outRowCountVar, oldInputPtr, inputRowSize, terminateEarlyOnLimitCode, pipeline() ? pipeline()->getFunction() : nullptr); + builder.CreateStore(builder.CreateAdd(env().i64Const(1), builder.CreateLoad(totalNormalRowCounter)), totalNormalRowCounter); + // After row is processed we need to update exceptions if the row was filtered // We check that: outRowCountVar == prevRowCountVar (no new row was emitted) // badRowCountVar == prevBadRowNumVar (it was filtered, not just an exception) - // expCurRowVar < argNumExps (we still have exceptions that need updating) - // if (outRowCountVar == prevRowNumVar && badRowCountVar == prevBadRowNumVar && expCurRowVar < argNumExps) - auto bbExpUpdate = llvm::BasicBlock::Create(context, "exp_update", builder.GetInsertBlock()->getParent()); - auto expCond = builder.CreateICmpEQ(builder.CreateLoad(outRowCountVar), builder.CreateLoad(prevRowNumVar)); - auto badCond = builder.CreateICmpEQ(builder.CreateLoad(badRowCountVar), builder.CreateLoad(prevBadRowNumVar)); - auto remCond = builder.CreateICmpSLT(builder.CreateLoad(expCurRowVar), argNumExps); - builder.CreateCondBr(builder.CreateAnd(remCond, builder.CreateAnd(badCond, expCond)), bbExpUpdate, bbLoopCondition); - - // We have determined a row is filtered so we can iterate through all the input exceptions that occured before this - // row and decrement their row index with the number of previously filtered rows (expAccVar). - // This is a while loop that iterates over all exceptions that occured before this filtered row - // - // while (expCurRowVar < numExps && ((*outNormalRowCount - 1) + expCurRowVar) >= *((int64_t *) curExpPtrVar)) - // - // *outNormalRowCount - 1 changes cardinality of rows into its row index, we add the number of previously processed - // exceptions because the normal rows do not know about the exceptions to obtain the correct index. 
It's then compared - // against the row index of the exception pointed to currently in our partition - builder.SetInsertPoint(bbExpUpdate); - auto bbIncrement = llvm::BasicBlock::Create(context, "increment", builder.GetInsertBlock()->getParent()); - auto bbIncrementDone = llvm::BasicBlock::Create(context, "increment_done", builder.GetInsertBlock()->getParent()); - auto curExpRowIndPtr = builder.CreatePointerCast(builder.CreateLoad(curExpPtrVar), env().i64ptrType()); - auto incCond = builder.CreateICmpSGE(builder.CreateAdd(builder.CreateLoad(badRowCountVar), builder.CreateAdd(builder.CreateSub(builder.CreateLoad(normalRowCountVar), env().i64Const(1)), builder.CreateLoad(expCurRowVar))), builder.CreateLoad(curExpRowIndPtr)); - auto remCond2 = builder.CreateICmpSLT(builder.CreateLoad(expCurRowVar), argNumExps); - builder.CreateCondBr(builder.CreateAnd(remCond2, incCond), bbIncrement, bbIncrementDone); - - // Body of the while loop we need to - // 1. decrement the current exception row index by the expAccVar (all rows previously filtered) - // 2. Increment our partition pointer to next exception - // 3. Change partitions if we've exhausted all exceptions in the current, but still have more remaining in tototal - // - // Increment to the next exception by adding eSize and 4*sizeof(int64_t) to the partition pointer - // *((int64_t *) curExpPtrVar) -= expAccVar; - // curExpPtrVar += 4 * sizeof(int64_t) + ((int64_t *)curExpPtrVar)[3]; - // expCurRowVar += 1; - // curExpCurRowVar += 1; - // - // Finally we check to see if a partition change is required - // if (expCurRowVar < numExps && curExpCurRowVar >= curExpNumRowsVar) - builder.SetInsertPoint(bbIncrement); - // Change row index and go to next exception in partition - auto curExpRowIndPtr2 = builder.CreatePointerCast(builder.CreateLoad(curExpPtrVar), env().i64Type()->getPointerTo(0)); - builder.CreateStore(builder.CreateSub(builder.CreateLoad(curExpRowIndPtr2), builder.CreateLoad(expAccVar)), curExpRowIndPtr2); - auto curOffset = builder.CreateAlloca(env().i64Type(), 0, nullptr, "curOffset"); - builder.CreateStore(builder.CreateLoad(builder.CreateGEP(curExpRowIndPtr2, env().i64Const(3))), curOffset); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(curOffset), env().i64Const(4 * sizeof(int64_t))), curOffset); - builder.CreateStore(builder.CreateGEP(builder.CreateLoad(curExpPtrVar), builder.CreateLoad(curOffset)), curExpPtrVar); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(curExpCurRowVar), env().i64Const(1)), curExpCurRowVar); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(expCurRowVar), env().i64Const(1)), expCurRowVar); - // See if partition change needs to occur - auto bbChange = llvm::BasicBlock::Create(context, "change", builder.GetInsertBlock()->getParent()); - auto changeCond = builder.CreateICmpSGE(builder.CreateLoad(curExpCurRowVar), builder.CreateLoad(curExpNumRowsVar)); - auto leftCond = builder.CreateICmpSLT(builder.CreateLoad(expCurRowVar), argNumExps); - builder.CreateCondBr(builder.CreateAnd(leftCond, changeCond), bbChange, bbExpUpdate); - - // This block changes to the next partition - // curExpCurRowVar = 0; - // curExpIndVar = curExpIndVar + 1; - // curExpPtrVar = expPtrs[curExpIndVar]; - // curExpNumRowsVar = expPtrSizes[curExpIndVar]; - builder.SetInsertPoint(bbChange); - builder.CreateStore(env().i64Const(0), curExpCurRowVar); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(curExpIndVar), env().i64Const(1)), curExpIndVar); - 
builder.CreateStore(builder.CreateLoad(builder.CreateGEP(argExpPtrs, builder.CreateLoad(curExpIndVar))), curExpPtrVar); - builder.CreateStore(builder.CreateLoad(builder.CreateGEP(argExpPtrSizes, builder.CreateLoad(curExpIndVar))), curExpNumRowsVar); - builder.CreateBr(bbExpUpdate); - - // Finally increment the expAccVar by 1 becasue a row was filtered - // expAccVar += 1; - builder.SetInsertPoint(bbIncrementDone); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(expAccVar), env().i64Const(1)), expAccVar); + // if (outRowCountVar == prevRowNumVar && badRowCountVar == prevBadRowNumVar) + auto rowNotEmitted = builder.CreateICmpEQ(builder.CreateLoad(outRowCountVar), builder.CreateLoad(prevRowNumVar)); + auto rowNotException = builder.CreateICmpEQ(builder.CreateLoad(badRowCountVar), builder.CreateLoad(prevBadRowNumVar)); + builder.CreateCondBr(builder.CreateAnd(rowNotEmitted, rowNotException), bbUpdateGeneralCond, bbLoopCondition); + + // Update general cond + // while (*generalRowOffset < curGeneralNumRows && *((int64_t*)curGeneralPtr) < curNormalRowInd + totalGeneralRowCounter) + builder.SetInsertPoint(bbUpdateGeneralCond); + auto generalRowsRemainCond = builder.CreateICmpSLT(builder.CreateLoad(generalRowOffset), builder.CreateLoad(curGeneralNumRows)); + auto curGeneralRowInd = builder.CreateLoad(builder.CreatePointerCast(builder.CreateLoad(curGeneralPtr), env().i64ptrType())); + auto generalIndexLTCond = builder.CreateICmpSLT(curGeneralRowInd, builder.CreateAdd(builder.CreateLoad(totalGeneralRowCounter), builder.CreateLoad(totalNormalRowCounter))); + builder.CreateCondBr(builder.CreateAnd(generalRowsRemainCond, generalIndexLTCond), bbUpdateGeneralBody, bbUpdateFallbackCond); + + // Update general body + // generalNewRowInd = *((int64_t*)curGeneralPtr) - totalFilterCounter; + // *((int64_t*)curGeneralPtr) = generalNewRowInd; + // auto generalRowDelta = 4 * sizeof(int64_t) + ((int64_t*)curGeneralPtr)[3]; + // curGeneralPtr += generalRowDelta; + // *generalByteOffset += generalRowDelta; + // (*generalRowOffset)++; + // (*totalGeneralRowCounter)++; + builder.SetInsertPoint(bbUpdateGeneralBody); + auto generalNewRowInd = builder.CreateSub(builder.CreateLoad(builder.CreatePointerCast(builder.CreateLoad(curGeneralPtr), env().i64ptrType())), builder.CreateLoad(totalFilterCounter)); + builder.CreateStore(generalNewRowInd, builder.CreatePointerCast(builder.CreateLoad(curGeneralPtr), env().i64ptrType())); + auto generalRowDelta = builder.CreateAdd(builder.CreateLoad(builder.CreateGEP(builder.CreatePointerCast(builder.CreateLoad(curGeneralPtr), env().i64ptrType()), env().i64Const(3))), env().i64Const(4 * sizeof(int64_t))); + builder.CreateStore(builder.CreateGEP(builder.CreateLoad(curGeneralPtr), generalRowDelta), curGeneralPtr); + builder.CreateStore(builder.CreateAdd(generalRowDelta, builder.CreateLoad(generalByteOffset)), generalByteOffset); + builder.CreateStore(builder.CreateAdd(env().i64Const(1), builder.CreateLoad(generalRowOffset)), generalRowOffset); + builder.CreateStore(builder.CreateAdd(env().i64Const(1), builder.CreateLoad(totalGeneralRowCounter)), totalGeneralRowCounter); + + // if (*generalRowOffset == curGeneralNumRows && *generalIndexOffset < numGeneralPartitions - 1) + auto generalNoRowsRemain = builder.CreateICmpEQ(builder.CreateLoad(generalRowOffset), builder.CreateLoad(curGeneralNumRows)); + auto generalHasMorePartitions = builder.CreateICmpSLT(builder.CreateLoad(generalIndexOffset), builder.CreateSub(numGeneralPartitions, env().i64Const(1))); +
builder.CreateCondBr(builder.CreateAnd(generalNoRowsRemain, generalHasMorePartitions), bbNextGeneralPartition, bbUpdateGeneralCond); + + // generalIndexOffset += 1; + // *generalRowOffset = 0; + // *generalByteOffset = 0; + // curGeneralPtr = generalPartitions[*generalIndexOffset]; + // curGeneralNumRows = *((int64_t*)curGeneralPtr); + // curGeneralPtr += sizeof(int64_t); + builder.SetInsertPoint(bbNextGeneralPartition); + builder.CreateStore(builder.CreateAdd(builder.CreateLoad(generalIndexOffset), env().i64Const(1)), generalIndexOffset); + builder.CreateStore(env().i64Const(0), generalRowOffset); + builder.CreateStore(env().i64Const(0), generalByteOffset); + builder.CreateStore(builder.CreateLoad(builder.CreateGEP(generalPartitions, builder.CreateLoad(generalIndexOffset))), curGeneralPtr); + builder.CreateStore(builder.CreateLoad(builder.CreatePointerCast(builder.CreateLoad(curGeneralPtr), env().i64ptrType())), curGeneralNumRows); + builder.CreateStore(builder.CreateGEP(builder.CreateLoad(curGeneralPtr), builder.CreateAdd(env().i64Const(sizeof(int64_t)), builder.CreateLoad(generalByteOffset))), curGeneralPtr); + builder.CreateBr(bbUpdateGeneralCond); + + // Update fallback cond + // while (*fallbackRowOffset < curFallbackNumRows && *((int64_t*)curFallbackPtr) < curNormalRowInd + totalGeneralRowCounter + totalFallbackRowCounter) + builder.SetInsertPoint(bbUpdateFallbackCond); + auto fallbackRowsRemainCond = builder.CreateICmpSLT(builder.CreateLoad(fallbackRowOffset), builder.CreateLoad(curFallbackNumRows)); + auto curFallbackRowInd = builder.CreateLoad(builder.CreatePointerCast(builder.CreateLoad(curFallbackPtr), env().i64ptrType())); + auto fallbackIndexLTCond = builder.CreateICmpSLT(curFallbackRowInd, builder.CreateAdd(builder.CreateLoad(totalGeneralRowCounter), builder.CreateAdd(builder.CreateLoad(totalFallbackRowCounter), builder.CreateLoad(totalNormalRowCounter)))); + builder.CreateCondBr(builder.CreateAnd(fallbackRowsRemainCond, fallbackIndexLTCond), bbUpdateFallbackBody, bbUpdateDone); + + // Update fallback body + // fallbackNewRowInd = *((int64_t*)curFallbackPtr) - totalFilterCounter; + // *((int64_t*)curFallbackPtr) = fallbackNewRowInd; + // auto fallbackRowDelta = 4 * sizeof(int64_t) + ((int64_t*)curFallbackPtr)[3]; + // curFallbackPtr += fallbackRowDelta; + // *fallbackByteOffset += fallbackRowDelta; + // (*fallbackRowOffset)++; + // (*totalFallbackRowCounter)++; + builder.SetInsertPoint(bbUpdateFallbackBody); + auto fallbackNewRowInd = builder.CreateSub(builder.CreateLoad(builder.CreatePointerCast(builder.CreateLoad(curFallbackPtr), env().i64ptrType())), builder.CreateLoad(totalFilterCounter)); + builder.CreateStore(fallbackNewRowInd, builder.CreatePointerCast(builder.CreateLoad(curFallbackPtr), env().i64ptrType())); + auto fallbackRowDelta = builder.CreateAdd(builder.CreateLoad(builder.CreateGEP(builder.CreatePointerCast(builder.CreateLoad(curFallbackPtr), env().i64ptrType()), env().i64Const(3))), env().i64Const(4 * sizeof(int64_t))); + builder.CreateStore(builder.CreateGEP(builder.CreateLoad(curFallbackPtr), fallbackRowDelta), curFallbackPtr); + builder.CreateStore(builder.CreateAdd(fallbackRowDelta, builder.CreateLoad(fallbackByteOffset)), fallbackByteOffset); + builder.CreateStore(builder.CreateAdd(env().i64Const(1), builder.CreateLoad(fallbackRowOffset)), fallbackRowOffset); + builder.CreateStore(builder.CreateAdd(env().i64Const(1), builder.CreateLoad(totalFallbackRowCounter)), totalFallbackRowCounter); + + // if (*fallbackRowOffset == curFallbackNumRows &&
*fallbackIndexOffset < numFallbackPartitions - 1) + auto fallbackNoRowsRemain = builder.CreateICmpEQ(builder.CreateLoad(fallbackRowOffset), builder.CreateLoad(curFallbackNumRows)); + auto fallbackHasMorePartitions = builder.CreateICmpSLT(builder.CreateLoad(fallbackIndexOffset), builder.CreateSub(numFallbackPartitions, env().i64Const(1))); + builder.CreateCondBr(builder.CreateAnd(fallbackNoRowsRemain, fallbackHasMorePartitions), bbNextFallbackPartition, bbUpdateFallbackCond); + + // fallbackIndexOffset += 1; + // *fallbackRowOffset = 0; + // *fallbackByteOffset = 0; + // curFallbackPtr = fallbackPartitions[*fallbackIndexOffset]; + // curFallbackNumRows = *((int64_t*)curFallbackPtr); + // curFallbackPtr += sizeof(int64_t); + builder.SetInsertPoint(bbNextFallbackPartition); + builder.CreateStore(builder.CreateAdd(builder.CreateLoad(fallbackIndexOffset), env().i64Const(1)), fallbackIndexOffset); + builder.CreateStore(env().i64Const(0), fallbackRowOffset); + builder.CreateStore(env().i64Const(0), fallbackByteOffset); + builder.CreateStore(builder.CreateLoad(builder.CreateGEP(fallbackPartitions, builder.CreateLoad(fallbackIndexOffset))), curFallbackPtr); + builder.CreateStore(builder.CreateLoad(builder.CreatePointerCast(builder.CreateLoad(curFallbackPtr), env().i64ptrType())), curFallbackNumRows); + builder.CreateStore(builder.CreateGEP(builder.CreateLoad(curFallbackPtr), builder.CreateAdd(env().i64Const(sizeof(int64_t)), builder.CreateLoad(fallbackByteOffset))), curFallbackPtr); + builder.CreateBr(bbUpdateFallbackCond); + + // Update done + // totalFilterCounter += 1; + builder.SetInsertPoint(bbUpdateDone); + builder.CreateStore(builder.CreateAdd(env().i64Const(1), builder.CreateLoad(totalFilterCounter)), totalFilterCounter); builder.CreateBr(bbLoopCondition); - // --------- - // loop done builder.SetInsertPoint(bbLoopDone); - auto bbRemainingExceptions = llvm::BasicBlock::Create(context, "remaining_exceptions", builder.GetInsertBlock()->getParent()); - auto bbRemainingDone = llvm::BasicBlock::Create(context, "remaining_done", builder.GetInsertBlock()->getParent()); - auto expRemaining = builder.CreateICmpSLT(builder.CreateLoad(expCurRowVar), argNumExps); - builder.CreateCondBr(expRemaining, bbRemainingExceptions, bbRemainingDone); - - // We have processed all of the normal rows. If we have not exhausted all of our exceptions - // we just iterate through the remaining exceptions and decrement their row index by the final - // value of expAccVar counting our filtered rows. - // Same code as above, but just don't need to keep updating expAccVar by 1. 
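Assembled from the pseudocode comments in the new bbUpdateGeneral*/bbUpdateFallback* blocks above, the generated IR amounts to the following renumbering pass, rendered here as plain scalar C++ for readability. This is a sketch only: the struct and variable names are illustrative stand-ins, not Tuplex API, and the real logic is emitted through the LLVM builder.
```
#include <cstdint>
#include <vector>

// One cursor per exception tier (general or fallback); partitions store
// [numRows | rows...], each row being a 4x int64 header plus payload.
struct ExceptionCursor {
    std::vector<uint8_t*> partitions;
    int64_t indexOffset = 0;   // which partition we are in
    int64_t rowOffset = 0;     // row within the current partition
    int64_t curNumRows = 0;    // rows in the current partition
    uint8_t* cur = nullptr;    // points at the current row header
};

// Called whenever a normal row is filtered out: every exception whose original
// row index precedes the filtered row has its stored index decremented by the
// number of rows filtered so far (totalFilterCounter), so indices stay valid.
inline void renumberOnFilter(ExceptionCursor& c, int64_t normalRowsSoFar,
                             int64_t& tierRowCounter, int64_t totalFilterCounter) {
    while (c.rowOffset < c.curNumRows &&
           *reinterpret_cast<int64_t*>(c.cur) < normalRowsSoFar + tierRowCounter) {
        *reinterpret_cast<int64_t*>(c.cur) -= totalFilterCounter;      // renumber
        int64_t delta = 4 * sizeof(int64_t) + reinterpret_cast<int64_t*>(c.cur)[3];
        c.cur += delta;                                                // next row
        c.rowOffset++; tierRowCounter++;
        if (c.rowOffset == c.curNumRows &&
            c.indexOffset + 1 < static_cast<int64_t>(c.partitions.size())) {
            c.indexOffset++; c.rowOffset = 0;                          // next partition
            c.curNumRows = *reinterpret_cast<int64_t*>(c.partitions[c.indexOffset]);
            c.cur = c.partitions[c.indexOffset] + sizeof(int64_t);
        }
    }
}
```
Once both tiers have been advanced past the filtered row, the generated code bumps totalFilterCounter by one (bbUpdateDone above) before jumping back to the loop condition.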
- builder.SetInsertPoint(bbRemainingExceptions); - auto curExpRowIndPtr3 = builder.CreatePointerCast(builder.CreateLoad(curExpPtrVar), env().i64Type()->getPointerTo(0)); - builder.CreateStore(builder.CreateSub(builder.CreateLoad(curExpRowIndPtr3), builder.CreateLoad(expAccVar)), curExpRowIndPtr3); - auto curOffset2 = builder.CreateAlloca(env().i64Type(), 0, nullptr, "curOffset2"); - builder.CreateStore(builder.CreateLoad(builder.CreateGEP(curExpRowIndPtr3, env().i64Const(3))), curOffset2); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(curOffset2), env().i64Const(4 * sizeof(int64_t))), curOffset2); - builder.CreateStore(builder.CreateGEP(builder.CreateLoad(curExpPtrVar), builder.CreateLoad(curOffset2)), curExpPtrVar); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(curExpCurRowVar), env().i64Const(1)), curExpCurRowVar); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(expCurRowVar), env().i64Const(1)), expCurRowVar); - - auto bbChange2 = llvm::BasicBlock::Create(context, "change2", builder.GetInsertBlock()->getParent()); - auto changeCond2 = builder.CreateICmpSGE(builder.CreateLoad(curExpCurRowVar), builder.CreateLoad(curExpNumRowsVar)); - auto leftCond2 = builder.CreateICmpSLT(builder.CreateLoad(expCurRowVar), argNumExps); - builder.CreateCondBr(builder.CreateAnd(leftCond2, changeCond2), bbChange2, bbLoopDone); - - builder.SetInsertPoint(bbChange2); - builder.CreateStore(env().i64Const(0), curExpCurRowVar); - builder.CreateStore(builder.CreateAdd(builder.CreateLoad(curExpIndVar), env().i64Const(1)), curExpIndVar); - builder.CreateStore(builder.CreateLoad(builder.CreateGEP(argExpPtrs, builder.CreateLoad(curExpIndVar))), curExpPtrVar); - builder.CreateStore(builder.CreateLoad(builder.CreateGEP(argExpPtrSizes, builder.CreateLoad(curExpIndVar))), curExpNumRowsVar); - builder.CreateBr(bbLoopDone); - - builder.SetInsertPoint(bbRemainingDone); - // if intermediate callback desired, perform! if(_intermediateType != python::Type::UNKNOWN && !_intermediateCallbackName.empty()) { writeIntermediate(builder, argUserData, _intermediateCallbackName); } diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 2399edf6f..c7eb06825 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -199,9 +199,11 @@ namespace tuplex { auto t = ops.front()->type(); assert(t == LogicalOperatorType::PARALLELIZE || t == LogicalOperatorType::CACHE); if (t == LogicalOperatorType::PARALLELIZE) - hasInputExceptions = !((ParallelizeOperator *)ops.front())->getPythonObjects().empty(); - if (t == LogicalOperatorType::CACHE) - hasInputExceptions = !((CacheOperator *)ops.front())->cachedExceptions().empty(); + hasInputExceptions = !((ParallelizeOperator *) ops.front())->getFallbackPartitions().empty(); + if (t == LogicalOperatorType::CACHE) { + auto cop = (CacheOperator *) ops.front(); + hasInputExceptions = !cop->cachedGeneralPartitions().empty() || !cop->cachedFallbackPartitions().empty(); + } } } @@ -239,6 +241,11 @@ namespace tuplex { // user wants to merge exceptions in order. 
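The PhysicalPlan hunk above replaces the single python-object/exception check with a two-tier notion of input exceptions: general partitions (schema-conforming exception rows) and fallback partitions (pickled Python rows). A stand-in sketch of the new predicate, with illustrative types rather than the real operator classes; the flag assembled just below then combines this with hasFilter and the merge-in-order option.
```
#include <vector>

struct Partition; // opaque stand-in

// A source operator now exposes both exception tiers.
struct CachedSource {
    std::vector<Partition*> generalPartitions;   // serialized exception rows
    std::vector<Partition*> fallbackPartitions;  // pickled python objects
};

// Input exceptions exist if either tier is non-empty.
inline bool hasInputExceptions(const CachedSource& src) {
    return !src.generalPartitions.empty() || !src.fallbackPartitions.empty();
}
```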
bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER(); + // Use incremental resolution if pipelines match and user has enabled the option + auto cache = _context.getIncrementalCache(); + auto cacheEntry = cache->getEntry(IncrementalCache::newKey(originalLogicalPlan()->getAction())); + auto incrementalResolution = cacheEntry && _context.getOptions().OPT_INCREMENTAL_RESOLUTION(); + // create trafostage via builder pattern auto builder = codegen::StageBuilder(_num_stages++, isRootStage, @@ -247,7 +254,8 @@ _context.getOptions().NORMALCASE_THRESHOLD(), _context.getOptions().OPT_SHARED_OBJECT_PROPAGATION(), _context.getOptions().OPT_NULLVALUE_OPTIMIZATION(), - updateInputExceptions); + updateInputExceptions, + incrementalResolution); // start code generation // first, add input @@ -401,19 +409,24 @@ namespace tuplex { // fill in data to start processing from operators. if (inputNode->type() == LogicalOperatorType::PARALLELIZE) { auto pop = dynamic_cast<ParallelizeOperator*>(inputNode); assert(inputNode); - stage->setInputPartitions(pop->getPartitions()); - stage->setInputExceptions(pop->getPythonObjects()); - stage->setPartitionToExceptionsMap(pop->getInputPartitionToPythonObjectsMap()); + stage->setInputPartitions(pop->getNormalPartitions()); + stage->setFallbackPartitions(pop->getFallbackPartitions()); + stage->setPartitionGroups(pop->getPartitionGroups()); } else if(inputNode->type() == LogicalOperatorType::CACHE) { auto cop = dynamic_cast<CacheOperator*>(inputNode); assert(inputNode); - stage->setInputPartitions(cop->cachedPartitions()); - stage->setInputExceptions(cop->cachedExceptions()); - stage->setPartitionToExceptionsMap(cop->partitionToExceptionsMap()); + stage->setInputPartitions(cop->cachedNormalPartitions()); + stage->setGeneralPartitions(cop->cachedGeneralPartitions()); + stage->setFallbackPartitions(cop->cachedFallbackPartitions()); + stage->setPartitionGroups(cop->partitionGroups()); } else if(inputNode->type() == LogicalOperatorType::FILEINPUT) { auto csvop = dynamic_cast<FileInputOperator*>(inputNode); stage->setInputFiles(csvop->getURIs(), csvop->getURISizes()); } // else it must be an internal node! => need to set manually based on result + if (incrementalResolution) { + stage->setIncrementalCacheEntry(cacheEntry); + } + return stage; } diff --git a/tuplex/core/src/physical/ResolveTask.cc b/tuplex/core/src/physical/ResolveTask.cc index 6ae6723f0..95a91b934 100644 --- a/tuplex/core/src/physical/ResolveTask.cc +++ b/tuplex/core/src/physical/ResolveTask.cc @@ -228,8 +228,21 @@ namespace tuplex { // needs to be put into separate list of python objects... // save index as well to merge back in order.
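Stepping back to the incremental-resolution gate added above: a cache entry is looked up under a key derived from the pipeline's action, and incremental mode only engages when an entry exists and the option is on. A minimal sketch of that flow, assuming a hypothetical string key and a stand-in entry type (the real IncrementalCache API is only visible here through its call sites); the ResolveTask hunk that follows then reworks how the non-conforming python objects are saved.
```
#include <string>
#include <unordered_map>

struct IncrementalCacheEntry; // holds partitions from the previous run

struct IncrementalCache {
    std::unordered_map<std::string, IncrementalCacheEntry*> entries;
    // hypothetical: key a pipeline by some serialization of its action
    static std::string newKey(const std::string& serializedAction) { return serializedAction; }
    IncrementalCacheEntry* getEntry(const std::string& key) const {
        auto it = entries.find(key);
        return it == entries.end() ? nullptr : it->second;
    }
};

// Mirrors the gate above: both a matching entry and the user option are required.
inline bool useIncrementalResolution(const IncrementalCache& cache,
                                     const std::string& actionKey,
                                     bool optIncrementalResolution) {
    return cache.getEntry(IncrementalCache::newKey(actionKey)) != nullptr
           && optIncrementalResolution;
}
```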
- assert(_rowNumber >= _numUnresolved); - _py_nonconfirming.push_back(std::make_tuple(_rowNumber - _numUnresolved, out_row)); + assert(_currentRowNumber >= _numUnresolved); + auto pickledObject = python::pickleObject(python::getMainModule(), out_row); + auto pyObjectSize = pickledObject.size(); + auto bufSize = 4 * sizeof(int64_t) + pyObjectSize; + + uint8_t *buf = new uint8_t[bufSize]; + auto ptr = buf; + *((int64_t*)ptr) = _currentRowNumber - _numUnresolved; ptr += sizeof(int64_t); + *((int64_t*)ptr) = ecToI64(ExceptionCode::PYTHON_PARALLELIZE); ptr += sizeof(int64_t); + *((int64_t*)ptr) = -1; ptr += sizeof(int64_t); + *((int64_t*)ptr) = pyObjectSize; ptr += sizeof(int64_t); + memcpy(ptr, pickledObject.c_str(), pyObjectSize); + rowToMemorySink(owner(), _fallbackSink, Schema(Schema::MemoryLayout::ROW, python::Type::makeTupleType({python::Type::STRING})), + 0, contextID(), buf, bufSize); + delete[] buf; } int64_t ResolveTask::mergeNormalRow(const uint8_t *buf, int64_t bufSize) { @@ -273,7 +286,7 @@ namespace tuplex { // exceptionCode, exceptionOperatorID, rowNumber, size int64_t ecCode = ecToI64(ExceptionCode::NORMALCASEVIOLATION); int64_t ecOpID = 0; // dummy - int64_t rowNumber = _currentRowNumber; + int64_t rowNumber = _currentRowNumber - _numUnresolved; uint8_t* except_buf = serializeExceptionToMemory(ecCode, ecOpID, rowNumber, buf, bufSize, &except_size); // sink row to type violation exceptions with commonCaseOutputSchema @@ -339,6 +352,7 @@ namespace tuplex { // check if there is a partition left if(_currentNormalPartitionIdx + 1 < _partitions.size()) { _partitions[_currentNormalPartitionIdx]->unlock(); + _partitions[_currentNormalPartitionIdx]->invalidate(); _currentNormalPartitionIdx++; _normalPtr = _partitions[_currentNormalPartitionIdx]->lockRaw(); @@ -406,14 +420,15 @@ namespace tuplex { bool potentiallyHasResolverOnSlowPath = !_operatorIDsAffectedByResolvers.empty() && std::binary_search(_operatorIDsAffectedByResolvers.begin(), _operatorIDsAffectedByResolvers.end(), operatorID); - if(!requiresInterpreterReprocessing(i64ToEC(ecCode)) && !potentiallyHasResolverOnSlowPath) { +// bool potentiallyHasResolverOnSlowPath = true; + if(!_isIncremental && !requiresInterpreterReprocessing(i64ToEC(ecCode)) && !potentiallyHasResolverOnSlowPath) { // TODO: check with resolvers! // i.e., we can directly save this as exception IF code is not an interpreter code // and true exception, i.e. no resolvers available. // => need a list of for which opIds/codes resolvers are available... ///.... _numUnresolved++; - exceptionCallback(ecCode, operatorID, _rowNumber, ebuf, eSize); + exceptionCallback(ecCode, operatorID, _currentRowNumber, ebuf, eSize); return; } @@ -443,7 +458,6 @@ namespace tuplex { // fallback 2: interpreter path // --> only go there if a non-true exception was recorded. Else, it will be dealt with above if(resCode == -1 && _interpreterFunctor) { - // acquire GIL python::lockGIL(); PyCallable_Check(_interpreterFunctor); @@ -648,7 +662,11 @@ namespace tuplex { mergeRow(buf, serialized_length, BUF_FORMAT_GENERAL_OUTPUT); delete [] buf; } else { - writePythonObject(rowObj); + if(PyTuple_Check(rowObj) && PyTuple_Size(rowObj) == 1) { + writePythonObject(PyTuple_GetItem(rowObj, 0)); + } else { + writePythonObject(rowObj); + } } // Py_XDECREF(rowObj); } @@ -676,7 +694,7 @@ namespace tuplex { // fallback 3: still exception? save... 
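The writePythonObject change at the top of this ResolveTask hunk pickles the resolved object and prefixes it with a fixed four-field header before sinking it to the fallback sink. The layout, as a standalone packing sketch (an illustrative helper, not the Tuplex call):
```
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Buffer layout: [rowNumber | exceptionCode | operatorID | payloadSize | pickled bytes]
std::vector<uint8_t> packFallbackRow(int64_t rowNumber, int64_t ecCode,
                                     const std::string& pickled) {
    std::vector<uint8_t> buf(4 * sizeof(int64_t) + pickled.size());
    auto* header = reinterpret_cast<int64_t*>(buf.data());
    header[0] = rowNumber;                            // index used for ordered merging
    header[1] = ecCode;                               // e.g. PYTHON_PARALLELIZE above
    header[2] = -1;                                   // dummy operator ID
    header[3] = static_cast<int64_t>(pickled.size()); // payload size in bytes
    std::memcpy(buf.data() + 4 * sizeof(int64_t), pickled.data(), pickled.size());
    return buf;
}
```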
if(resCode == -1) { _numUnresolved++; - exceptionCallback(ecCode, operatorID, _rowNumber, ebuf, eSize); + exceptionCallback(ecCode, operatorID, _currentRowNumber, ebuf, eSize); } } @@ -711,7 +729,7 @@ namespace tuplex { } // abort if no exceptions! - if(_runtimeExceptions.empty() && _numInputExceptions == 0) + if(_exceptionPartitions.empty() && _generalPartitions.empty() && _fallbackPartitions.empty()) return; // special case: no functor & no python pipeline functor given @@ -724,12 +742,12 @@ namespace tuplex { #endif // copy _generalCasePartitions over to base class - IExceptionableTask::setExceptions(_runtimeExceptions); + IExceptionableTask::setExceptions(_generalPartitions); // clear exceptions, because they have been resolved (or put to new exceptions!) // if task produced exceptions, they are stored in the IExceptionableTask class! // => no need to overwrite them, getter for iexceptionabletask has all info! - _runtimeExceptions.clear(); + _generalPartitions.clear(); _wallTime = timer.time(); return; @@ -742,78 +760,61 @@ namespace tuplex { // merge exceptions with normal rows after calling slow code over them... // basic idea is go over all exception partitions, execute row wise the resolution function // and merge the result back to the partitions - for(auto partition : _runtimeExceptions) { - const uint8_t *ptr = partition->lockRaw(); - int64_t numRows = *((int64_t *) ptr); - ptr += sizeof(int64_t); - - for(int i = 0; i < numRows; ++i) { - // old - // _currentRowNumber = *((int64_t*)ptr); - // ptr += sizeof(int64_t); - // int64_t ecCode = *((int64_t*)ptr); - // ptr += sizeof(int64_t); - // int64_t operatorID = *((int64_t*)ptr); - // ptr += sizeof(int64_t); - // int64_t eSize = *((int64_t*)ptr); - // ptr += sizeof(int64_t); - + for (const auto &partition : _generalPartitions) { + const uint8_t *ptr = partition->lock(); + auto numRows = partition->getNumRows(); + for (int i = 0; i < numRows; ++i) { const uint8_t *ebuf = nullptr; int64_t ecCode = -1, operatorID = -1; size_t eSize = 0; auto delta = deserializeExceptionFromMemory(ptr, &ecCode, &operatorID, &_currentRowNumber, &ebuf, &eSize); + processExceptionRow(ecCode, operatorID, ebuf, eSize); - // call functor over this... - // ==> important to use row number here for continuous exception resolution! - // args are: "userData", "rowNumber", "exceptionCode", "rowBuf", "bufSize" + ptr += delta; + _rowNumber++; + } + partition->unlock(); + partition->invalidate(); + } + + for (const auto &partition : _fallbackPartitions) { + const uint8_t *ptr = partition->lock(); + auto numRows = partition->getNumRows(); + for (int i = 0; i < numRows; ++i) { + const uint8_t *ebuf = nullptr; + int64_t ecCode = -1, operatorID = -1; + size_t eSize = 0; + auto delta = deserializeExceptionFromMemory(ptr, &ecCode, &operatorID, &_currentRowNumber, &ebuf, + &eSize); processExceptionRow(ecCode, operatorID, ebuf, eSize); ptr += delta; - // old - //ptr += eSize; - - // always inc row number _rowNumber++; } partition->unlock(); - - // exception partition is done or exceptions are transferred to new partition... 
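Both partition loops above lean on deserializeExceptionFromMemory to walk a partition row by row. Judging from the call sites, its contract is: fill the out-parameters from one serialized exception row and return the number of bytes consumed, so the caller can advance its cursor by the returned delta. A sketch of a decoder with that assumed contract, matching the 4x int64 header layout used throughout this diff (assumption: the real Tuplex implementation may differ in detail):
```
#include <cstddef>
#include <cstdint>

// Assumed decoder contract, inferred from the call sites above.
size_t decodeExceptionRow(const uint8_t* ptr, int64_t* ecCode, int64_t* operatorID,
                          int64_t* rowNumber, const uint8_t** ebuf, size_t* eSize) {
    auto* header = reinterpret_cast<const int64_t*>(ptr);
    *rowNumber  = header[0];
    *ecCode     = header[1];
    *operatorID = header[2];
    *eSize      = static_cast<size_t>(header[3]);
    *ebuf       = ptr + 4 * sizeof(int64_t);   // payload follows the header
    return 4 * sizeof(int64_t) + *eSize;       // delta to advance the cursor
}
```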
partition->invalidate(); } - // now process all of the input exceptions - if (_numInputExceptions > 0) { - // Initialize input exception to starting index - auto partition = _inputExceptions[_inputExceptionIndex]; - auto rowsLeftInPartition = partition->getNumRows() - _inputExceptionRowOffset; - const uint8_t *ptr = partition->lock() + _inputExceptionByteOffset; - - // Iterate over all input exceptions, may be accross multiple partitions - for (int i = 0; i < _numInputExceptions; ++i) { - // Change partition once exhausted - if (rowsLeftInPartition == 0) { - partition->unlock(); - _inputExceptionIndex++; - partition = _inputExceptions[_inputExceptionIndex]; - rowsLeftInPartition = partition->getNumRows(); - ptr = partition->lock(); - } - + for (const auto &partition : _exceptionPartitions) { + const uint8_t *ptr = partition->lock(); + auto numRows = partition->getNumRows(); + for (int i = 0; i < numRows; ++i) { const uint8_t *ebuf = nullptr; int64_t ecCode = -1, operatorID = -1; size_t eSize = 0; auto delta = deserializeExceptionFromMemory(ptr, &ecCode, &operatorID, &_currentRowNumber, &ebuf, &eSize); + processExceptionRow(ecCode, operatorID, ebuf, eSize); + ptr += delta; _rowNumber++; - rowsLeftInPartition--; } - // Unlock but wait to invalidate until all resolve tasks have finished partition->unlock(); + partition->invalidate(); } // merging is done, unlock the last partition & copy the others over. @@ -832,8 +833,9 @@ namespace tuplex { // clear exceptions, because they have been resolved (or put to new exceptions!) // if task produced exceptions, they are stored in the IExceptionableTask class! // => no need to overwrite them, getter for iexceptionabletask has all info! - _runtimeExceptions.clear(); - _inputExceptions.clear(); + _exceptionPartitions.clear(); + _generalPartitions.clear(); + _fallbackPartitions.clear(); } else { executeInOrder(); } @@ -881,146 +883,323 @@ namespace tuplex { _rowNumber = 0; } - // Initialize runtime exception variables - size_t curRuntimePartitionInd = 0; // current index into vector of runtime exception partitions - int64_t numRuntimeRowsLeftInPartition = 0; // number of rows remaining in partition - const uint8_t *runPtr = nullptr; - if (_runtimeExceptions.size() > 0) { - curRuntimePartitionInd = 0; - numRuntimeRowsLeftInPartition = _runtimeExceptions[curRuntimePartitionInd]->getNumRows(); - runPtr = _runtimeExceptions[curRuntimePartitionInd]->lock(); + size_t curExceptionInd = 0; + size_t exceptionsRemaining = 0; + const uint8_t *expPtr = nullptr; + size_t exceptionNumRows = 0; + for (int i = 0; i < _exceptionPartitions.size(); ++i) { + auto numRows = _exceptionPartitions[i]->getNumRows(); + exceptionNumRows += numRows; + if (i == 0) { + expPtr = _exceptionPartitions[i]->lock(); + exceptionsRemaining = numRows; + } } - // Initialize input exception variables - size_t curInputPartitionInd = 0; // current index into vector of input exception partitions - int64_t numInputRowsLeftInPartition = 0; // number of rows remaining in partition - const uint8_t *inputPtr = nullptr; - if (_numInputExceptions > 0) { - curInputPartitionInd = _inputExceptionIndex; - numInputRowsLeftInPartition = _inputExceptions[curInputPartitionInd]->getNumRows() - _inputExceptionRowOffset; - inputPtr = _inputExceptions[curInputPartitionInd]->lock() + _inputExceptionByteOffset; + size_t curGeneralInd = 0; + size_t generalRemaining = 0; + const uint8_t *generalPtr = nullptr; + size_t generalNumRows = 0; + for (int i = 0; i < _generalPartitions.size(); ++i) { + auto numRows = 
_generalPartitions[i]->getNumRows(); + generalNumRows += numRows; + if (i == 0) { + generalPtr = _generalPartitions[i]->lock(); + generalRemaining = numRows; + } + } + + size_t curFallbackInd = 0; + size_t fallbackRemaining = 0; + const uint8_t *fallPtr = nullptr; + size_t fallbackNumRows = 0; + for (int i = 0; i < _fallbackPartitions.size(); ++i) { + auto numRows = _fallbackPartitions[i]->getNumRows(); + fallbackNumRows += numRows; + if (i == 0) { + fallPtr = _fallbackPartitions[i]->lock(); + fallbackRemaining = numRows; + } } // Merge input and runtime exceptions in order. To do so, we can compare the row indices of the // current runtime and input exception and process the one that occurs first. The saved row indices of // runtime exceptions do not account for the existence of input exceptions, so we need to add the previous // input exceptions to compare the true row number - size_t inputRowsProcessed = 0; - const uint8_t *ptr = nullptr; - while (runPtr && inputPtr) { - auto runRowInd = *((int64_t *) runPtr); // get current runtime row index - auto inputRowInd = *((int64_t *) inputPtr); // get current input row index - bool isRuntimeException = false; - // compare indices with accounting for previous input exceptions - if (runRowInd + inputRowsProcessed < inputRowInd) { - ptr = runPtr; - numRuntimeRowsLeftInPartition--; - isRuntimeException = true; + while (_exceptionCounter < exceptionNumRows && _generalCounter < generalNumRows && _fallbackCounter < fallbackNumRows) { + auto expRowInd = *((int64_t *) expPtr) + _fallbackCounter + _generalCounter; + auto generalRowInd = *((int64_t *) generalPtr) + _fallbackCounter; + auto fallbackRowInd = *((int64_t *) fallPtr); + + const uint8_t *buf = nullptr; + int64_t ecCode = 0, operatorID = -1; + size_t eSize = 0; + if (fallbackRowInd <= expRowInd && fallbackRowInd <= generalRowInd) { + fallbackRemaining--; + _fallbackCounter++; + + auto delta = deserializeExceptionFromMemory(fallPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + fallPtr += delta; + } else if (generalRowInd <= expRowInd && generalRowInd <= fallbackRowInd) { + generalRemaining--; + _generalCounter++; + + auto delta = deserializeExceptionFromMemory(generalPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + _currentRowNumber += _fallbackCounter; + generalPtr += delta; } else { - ptr = inputPtr; - numInputRowsLeftInPartition--; - inputRowsProcessed++; + exceptionsRemaining--; + _exceptionCounter++; + + auto delta = deserializeExceptionFromMemory(expPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + _currentRowNumber += _fallbackCounter + _generalCounter; + expPtr += delta; } - const uint8_t *ebuf = nullptr; - int64_t ecCode = -1, operatorID = -1; + processExceptionRow(ecCode, operatorID, buf, eSize); + _rowNumber++; + + if (exceptionsRemaining == 0) { + _exceptionPartitions[curExceptionInd]->unlock(); + _exceptionPartitions[curExceptionInd]->invalidate(); + curExceptionInd++; + if (curExceptionInd < _exceptionPartitions.size()) { + exceptionsRemaining = _exceptionPartitions[curExceptionInd]->getNumRows(); + expPtr = _exceptionPartitions[curExceptionInd]->lock(); + } + } + + if (generalRemaining == 0) { + _generalPartitions[curGeneralInd]->unlock(); + _generalPartitions[curGeneralInd]->invalidate(); + curGeneralInd++; + if (curGeneralInd < _generalPartitions.size()) { + generalRemaining = _generalPartitions[curGeneralInd]->getNumRows(); + generalPtr = _generalPartitions[curGeneralInd]->lock(); + } + } + + if (fallbackRemaining == 0) { + 
_fallbackPartitions[curFallbackInd]->unlock(); + _fallbackPartitions[curFallbackInd]->invalidate(); + curFallbackInd++; + if (curFallbackInd < _fallbackPartitions.size()) { + fallbackRemaining = _fallbackPartitions[curFallbackInd]->getNumRows(); + fallPtr = _fallbackPartitions[curFallbackInd]->lock(); + } + } + } + + while (_exceptionCounter < exceptionNumRows && _generalCounter < generalNumRows) { + auto expRowInd = *((int64_t *) expPtr) + _fallbackCounter + _generalCounter; + auto generalRowInd = *((int64_t *) generalPtr) + _generalCounter; + + const uint8_t *buf = nullptr; + int64_t ecCode = 0, operatorID = -1; size_t eSize = 0; - auto delta = deserializeExceptionFromMemory(ptr, &ecCode, &operatorID, &_currentRowNumber, &ebuf, - &eSize); + if (generalRowInd <= expRowInd) { + generalRemaining--; + _generalCounter++; - if (isRuntimeException) { - _currentRowNumber += inputRowsProcessed; - runPtr += delta; + auto delta = deserializeExceptionFromMemory(generalPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + _currentRowNumber += _fallbackCounter; + generalPtr += delta; } else { - inputPtr += delta; + exceptionsRemaining--; + _exceptionCounter++; + + auto delta = deserializeExceptionFromMemory(expPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + _currentRowNumber += _fallbackCounter + _generalCounter; + expPtr += delta; } - processExceptionRow(ecCode, operatorID, ebuf, eSize); + processExceptionRow(ecCode, operatorID, buf, eSize); _rowNumber++; - // Exhausted current runtime exceptions, need to switch partitions - if (numRuntimeRowsLeftInPartition == 0) { - _runtimeExceptions[curRuntimePartitionInd]->unlock(); - _runtimeExceptions[curRuntimePartitionInd]->invalidate(); - curRuntimePartitionInd++; - // Still have more exceptions to go through - if (curRuntimePartitionInd < _runtimeExceptions.size()) { - numRuntimeRowsLeftInPartition = _runtimeExceptions[curRuntimePartitionInd]->getNumRows(); - runPtr = _runtimeExceptions[curRuntimePartitionInd]->lock(); - } else { - // processed all exceptions - runPtr = nullptr; + if (exceptionsRemaining == 0) { + _exceptionPartitions[curExceptionInd]->unlock(); + _exceptionPartitions[curExceptionInd]->invalidate(); + curExceptionInd++; + if (curExceptionInd < _exceptionPartitions.size()) { + exceptionsRemaining = _exceptionPartitions[curExceptionInd]->getNumRows(); + expPtr = _exceptionPartitions[curExceptionInd]->lock(); } } - // Exhausted current input exceptions, need to switch partitions - if (numInputRowsLeftInPartition == 0 || inputRowsProcessed == _numInputExceptions) { - _inputExceptions[curInputPartitionInd]->unlock(); - curInputPartitionInd++; - // Still have more exceptions to go through - if (curInputPartitionInd < _inputExceptions.size() && inputRowsProcessed < _numInputExceptions) { - numInputRowsLeftInPartition = _inputExceptions[curInputPartitionInd]->getNumRows(); - inputPtr = _inputExceptions[curInputPartitionInd]->lock(); - } else { - // processed all exceptions - inputPtr = nullptr; + if (generalRemaining == 0) { + _generalPartitions[curGeneralInd]->unlock(); + _generalPartitions[curGeneralInd]->invalidate(); + curGeneralInd++; + if (curGeneralInd < _generalPartitions.size()) { + generalRemaining = _generalPartitions[curGeneralInd]->getNumRows(); + generalPtr = _generalPartitions[curGeneralInd]->lock(); } } } - // Process remaining runtime exceptions if any exist - while (runPtr) { - const uint8_t *ebuf = nullptr; + while (_generalCounter < generalNumRows && _fallbackCounter < fallbackNumRows) { + auto 
generalRowInd = *((int64_t *) generalPtr) + _fallbackCounter; + auto fallbackRowInd = *((int64_t *) fallPtr); + + const uint8_t *buf = nullptr; + int64_t ecCode = 0, operatorID = -1; + size_t eSize = 0; + if (fallbackRowInd <= generalRowInd) { + fallbackRemaining--; + _fallbackCounter++; + + auto delta = deserializeExceptionFromMemory(fallPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + fallPtr += delta; + } else { + generalRemaining--; + _generalCounter++; + + auto delta = deserializeExceptionFromMemory(generalPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + _currentRowNumber += _fallbackCounter; + generalPtr += delta; + } + + processExceptionRow(ecCode, operatorID, buf, eSize); + _rowNumber++; + + if (generalRemaining == 0) { + _generalPartitions[curGeneralInd]->unlock(); + _generalPartitions[curGeneralInd]->invalidate(); + curGeneralInd++; + if (curGeneralInd < _generalPartitions.size()) { + generalRemaining = _generalPartitions[curGeneralInd]->getNumRows(); + generalPtr = _generalPartitions[curGeneralInd]->lock(); + } + } + + if (fallbackRemaining == 0) { + _fallbackPartitions[curFallbackInd]->unlock(); + _fallbackPartitions[curFallbackInd]->invalidate(); + curFallbackInd++; + if (curFallbackInd < _fallbackPartitions.size()) { + fallbackRemaining = _fallbackPartitions[curFallbackInd]->getNumRows(); + fallPtr = _fallbackPartitions[curFallbackInd]->lock(); + } + } + } + + while (_exceptionCounter < exceptionNumRows && _fallbackCounter < fallbackNumRows) { + auto expRowInd = *((int64_t *) expPtr) + _fallbackCounter + _generalCounter; + auto fallbackRowInd = *((int64_t *) fallPtr); + + const uint8_t *buf = nullptr; + int64_t ecCode = 0, operatorID = -1; + size_t eSize = 0; + if (fallbackRowInd <= expRowInd) { + fallbackRemaining--; + _fallbackCounter++; + + auto delta = deserializeExceptionFromMemory(fallPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + fallPtr += delta; + } else { + exceptionsRemaining--; + _exceptionCounter++; + + auto delta = deserializeExceptionFromMemory(expPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + _currentRowNumber += _fallbackCounter + _generalCounter; + expPtr += delta; + } + + processExceptionRow(ecCode, operatorID, buf, eSize); + _rowNumber++; + + if (exceptionsRemaining == 0) { + _exceptionPartitions[curExceptionInd]->unlock(); + _exceptionPartitions[curExceptionInd]->invalidate(); + curExceptionInd++; + if (curExceptionInd < _exceptionPartitions.size()) { + exceptionsRemaining = _exceptionPartitions[curExceptionInd]->getNumRows(); + expPtr = _exceptionPartitions[curExceptionInd]->lock(); + } + } + + if (fallbackRemaining == 0) { + _fallbackPartitions[curFallbackInd]->unlock(); + _fallbackPartitions[curFallbackInd]->invalidate(); + curFallbackInd++; + if (curFallbackInd < _fallbackPartitions.size()) { + fallbackRemaining = _fallbackPartitions[curFallbackInd]->getNumRows(); + fallPtr = _fallbackPartitions[curFallbackInd]->lock(); + } + } + } + + while (_exceptionCounter < exceptionNumRows) { + const uint8_t *buf = nullptr; int64_t ecCode = -1, operatorID = -1; size_t eSize = 0; - auto delta = deserializeExceptionFromMemory(runPtr, &ecCode, &operatorID, &_currentRowNumber, &ebuf, - &eSize); - _currentRowNumber += inputRowsProcessed; - processExceptionRow(ecCode, operatorID, ebuf, eSize); - runPtr += delta; + auto delta = deserializeExceptionFromMemory(expPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + _currentRowNumber += _generalCounter + _fallbackCounter; + expPtr += delta; + + 
processExceptionRow(ecCode, operatorID, buf, eSize); _rowNumber++; - numRuntimeRowsLeftInPartition--; - // Exhausted current runtime exceptions in partitions need to switch partitions or could be done - if (numRuntimeRowsLeftInPartition == 0) { - _runtimeExceptions[curRuntimePartitionInd]->unlock(); - _runtimeExceptions[curRuntimePartitionInd]->invalidate(); - curRuntimePartitionInd++; - // More exceptions to process - if (curRuntimePartitionInd < _runtimeExceptions.size()) { - numRuntimeRowsLeftInPartition = _runtimeExceptions[curRuntimePartitionInd]->getNumRows(); - runPtr = _runtimeExceptions[curRuntimePartitionInd]->lock(); - } else { - // processed all exceptions - runPtr = nullptr; + exceptionsRemaining--; + _exceptionCounter++; + + if (exceptionsRemaining == 0) { + _exceptionPartitions[curExceptionInd]->unlock(); + _exceptionPartitions[curExceptionInd]->invalidate(); + curExceptionInd++; + if (curExceptionInd < _exceptionPartitions.size()) { + exceptionsRemaining = _exceptionPartitions[curExceptionInd]->getNumRows(); + expPtr = _exceptionPartitions[curExceptionInd]->lock(); } } } - // Process remaining input exceptions if any exist - while (inputPtr) { - const uint8_t *ebuf = nullptr; + while (_generalCounter < generalNumRows) { + const uint8_t *buf = nullptr; int64_t ecCode = -1, operatorID = -1; size_t eSize = 0; - auto delta = deserializeExceptionFromMemory(inputPtr, &ecCode, &operatorID, &_currentRowNumber, &ebuf, - &eSize); - processExceptionRow(ecCode, operatorID, ebuf, eSize); - inputPtr += delta; + auto delta = deserializeExceptionFromMemory(generalPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + _currentRowNumber += _fallbackCounter; + + generalPtr += delta; + + processExceptionRow(ecCode, operatorID, buf, eSize); _rowNumber++; - numInputRowsLeftInPartition--; - inputRowsProcessed++; - // Exhausted current input exceptions, need to switch partitions - if (numInputRowsLeftInPartition == 0 || inputRowsProcessed == _numInputExceptions) { - _inputExceptions[curInputPartitionInd]->unlock(); - curInputPartitionInd++; - // Still have more exceptions - if (curInputPartitionInd < _inputExceptions.size() && inputRowsProcessed < _numInputExceptions) { - numInputRowsLeftInPartition = _inputExceptions[curInputPartitionInd]->getNumRows(); - inputPtr = _inputExceptions[curInputPartitionInd]->lock(); - } else { - // processed all exceptions - inputPtr = nullptr; + generalRemaining--; + _generalCounter++; + + if (generalRemaining == 0) { + _generalPartitions[curGeneralInd]->unlock(); + _generalPartitions[curGeneralInd]->invalidate(); + curGeneralInd++; + if (curGeneralInd < _generalPartitions.size()) { + generalRemaining = _generalPartitions[curGeneralInd]->getNumRows(); + generalPtr = _generalPartitions[curGeneralInd]->lock(); + } + } + } + + while (_fallbackCounter < fallbackNumRows) { + const uint8_t *buf = nullptr; + int64_t ecCode = -1, operatorID = -1; + size_t eSize = 0; + auto delta = deserializeExceptionFromMemory(fallPtr, &ecCode, &operatorID, &_currentRowNumber, &buf, &eSize); + fallPtr += delta; + + processExceptionRow(ecCode, operatorID, buf, eSize); + _rowNumber++; + + fallbackRemaining--; + _fallbackCounter++; + + if (fallbackRemaining == 0) { + _fallbackPartitions[curFallbackInd]->unlock(); + _fallbackPartitions[curFallbackInd]->invalidate(); + curFallbackInd++; + if (curFallbackInd < _fallbackPartitions.size()) { + fallbackRemaining = _fallbackPartitions[curFallbackInd]->getNumRows(); + fallPtr = _fallbackPartitions[curFallbackInd]->lock(); } } } @@ -1037,8 
+1216,10 @@ namespace tuplex { _normalRowNumber++; } - if (!_partitions.empty()) + if (!_partitions.empty()) { _partitions[_currentNormalPartitionIdx]->unlock(); + _partitions[_currentNormalPartitionIdx]->invalidate(); + } // merging is done, unlock the last partition & copy the others over. unlockAll(); @@ -1054,7 +1235,9 @@ // clear exceptions, because they have been resolved (or put to new exceptions!) // if task produced exceptions, they are stored in the IExceptionableTask class! // => no need to overwrite them, getter for iexceptionabletask has all info! - _runtimeExceptions.clear(); + _exceptionPartitions.clear(); + _generalPartitions.clear(); + _fallbackPartitions.clear(); } void ResolveTask::sendStatusToHistoryServer() { @@ -1071,6 +1254,7 @@ void ResolveTask::unlockAll() { _mergedRowsSink.unlock(); _generalCaseSink.unlock(); + _fallbackSink.unlock(); // unlock exceptionable task IExceptionableTask::unlockAll(); diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0f7bf7319..153553f25 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -13,97 +13,175 @@ namespace tuplex { ResultSet::ResultSet(const Schema& schema, - const std::vector<Partition*>& partitions, - const std::vector<Partition*>& exceptions, - const std::unordered_map& partitionToExceptionsMap, - const std::vector<std::tuple<size_t, PyObject*>> pyobjects, + const std::vector<Partition*>& normalPartitions, + const std::vector<Partition*>& generalPartitions, + const std::vector<Partition*>& fallbackPartitions, + const std::vector<PartitionGroup>& partitionGroups, int64_t maxRows) : ResultSet::ResultSet() { - for(Partition *p : partitions) - _partitions.push_back(p); - - _pyobjects = std::deque<std::tuple<size_t, PyObject*>>(pyobjects.begin(), pyobjects.end()); - _exceptions = exceptions; - _partitionToExceptionsMap = partitionToExceptionsMap; - _curRowCounter = 0; + for (const auto &group : partitionGroups) + _partitionGroups.push_back(group); + + for (const auto &p : normalPartitions) + _remainingNormalPartitions.push_back(p); + for (const auto &p : generalPartitions) + _remainingGeneralPartitions.push_back(p); + for (const auto &p : fallbackPartitions) + _remainingFallbackPartitions.push_back(p); + + _curNormalRowCounter = 0; + _curNormalByteCounter = 0; + _curGeneralRowCounter = 0; + _curGeneralByteCounter = 0; + _curFallbackRowCounter = 0; + _curFallbackByteCounter = 0; + _normalRowCounter = 0; + _generalRowCounter = 0; + _fallbackRowCounter = 0; _totalRowCounter = 0; - _byteCounter = 0; + _schema = schema; _maxRows = maxRows < 0 ?
std::numeric_limits::max() : maxRows; - _rowsRetrieved = 0; } - void ResultSet::clear() { - for(auto partition : _partitions) - partition->invalidate(); - _partitions.clear(); - for(auto partition : _exceptions) + void clearPartitions(std::list<Partition*>& partitions) { + for (auto &partition : partitions) { partition->invalidate(); + } + partitions.clear(); + } - _curRowCounter = 0; - _byteCounter = 0; + void ResultSet::clear() { + clearPartitions(_remainingNormalPartitions); + clearPartitions(_currentNormalPartitions); + clearPartitions(_remainingGeneralPartitions); + clearPartitions(_currentGeneralPartitions); + clearPartitions(_remainingFallbackPartitions); + clearPartitions(_currentFallbackPartitions); + _partitionGroups.clear(); + + _curNormalRowCounter = 0; + _curNormalByteCounter = 0; + _curGeneralRowCounter = 0; + _curGeneralByteCounter = 0; + _curFallbackRowCounter = 0; + _curFallbackByteCounter = 0; + _normalRowCounter = 0; + _generalRowCounter = 0; + _fallbackRowCounter = 0; + _totalRowCounter = 0; _maxRows = 0; - _rowsRetrieved = 0; } - bool ResultSet::hasNextRow() { - + bool ResultSet::hasNextNormalPartition() const { // all rows already retrieved? - if(_rowsRetrieved >= _maxRows) + if (_totalRowCounter >= _maxRows) return false; // empty? - if(_partitions.empty() && _pyobjects.empty()) + if (_currentNormalPartitions.empty() && _remainingNormalPartitions.empty()) { return false; - else { - // partitions empty? - if(_partitions.empty()) - return true; - else if(_pyobjects.empty()) { - assert(_partitions.size() > 0); - assert(_partitions.front()); - - // still one row left? - return _curRowCounter < _partitions.front()->getNumRows(); - } else { - return true; // there's for sure at least one object left! - } + } else if (!_currentNormalPartitions.empty()) { + return _curNormalRowCounter < _currentNormalPartitions.front()->getNumRows(); + } else { + return _remainingNormalPartitions.front()->getNumRows() > 0; } - } + bool ResultSet::hasNextGeneralPartition() const { + // all rows already retrieved? + if (_totalRowCounter >= _maxRows) + return false; + + // empty? + if (_currentGeneralPartitions.empty() && _remainingGeneralPartitions.empty()) { + return false; + } else if (!_currentGeneralPartitions.empty()) { + return _curGeneralRowCounter < _currentGeneralPartitions.front()->getNumRows(); + } else { + return _remainingGeneralPartitions.front()->getNumRows() > 0; + } + } - bool ResultSet::hasNextPartition() const { + bool ResultSet::hasNextFallbackPartition() const { // all rows already retrieved? - if(_rowsRetrieved >= _maxRows) + if (_totalRowCounter >= _maxRows) return false; // empty? - if(_partitions.empty()) + if (_currentFallbackPartitions.empty() && _remainingFallbackPartitions.empty()) { return false; - else { - assert(_partitions.size() > 0); - assert(_partitions.front()); + } else if (!_currentFallbackPartitions.empty()) { + return _curFallbackRowCounter < _currentFallbackPartitions.front()->getNumRows(); + } else { + return _remainingFallbackPartitions.front()->getNumRows() > 0; + } + } + + Partition* ResultSet::getNextGeneralPartition() { + if (_currentGeneralPartitions.empty() && _remainingGeneralPartitions.empty()) + return nullptr; - // still one row left?
- return _curRowCounter < _partitions.front()->getNumRows(); + Partition *first = nullptr; + if (!_currentGeneralPartitions.empty()) { + first = _currentGeneralPartitions.front(); + _currentGeneralPartitions.pop_front(); + } else { + first = _remainingGeneralPartitions.front(); + _remainingGeneralPartitions.pop_front(); } + + auto numRows = first->getNumRows(); + _totalRowCounter += numRows; + _generalRowCounter += numRows; + + _curGeneralRowCounter = 0; + _curGeneralByteCounter = 0; + + return first; } - Partition* ResultSet::getNextPartition() { - if(_partitions.empty()) + Partition* ResultSet::getNextFallbackPartition() { + if (_currentFallbackPartitions.empty() && _remainingFallbackPartitions.empty()) return nullptr; - assert(_partitions.size() > 0); + Partition *first = nullptr; + if (!_currentFallbackPartitions.empty()) { + first = _currentFallbackPartitions.front(); + _currentFallbackPartitions.pop_front(); + } else { + first = _remainingFallbackPartitions.front(); + _remainingFallbackPartitions.pop_front(); + } + + auto numRows = first->getNumRows(); + _totalRowCounter += numRows; + _fallbackRowCounter += numRows; - Partition *first = _partitions.front(); - assert(_schema == first->schema()); + _curFallbackRowCounter = 0; + _curFallbackByteCounter = 0; + + return first; + } + + Partition* ResultSet::getNextNormalPartition() { + if (_currentNormalPartitions.empty() && _remainingNormalPartitions.empty()) + return nullptr; + + Partition *first = nullptr; + if (!_currentNormalPartitions.empty()) { + first = _currentNormalPartitions.front(); + _currentNormalPartitions.pop_front(); + } else { + first = _remainingNormalPartitions.front(); + _remainingNormalPartitions.pop_front(); + } auto numRows = first->getNumRows(); - _rowsRetrieved += numRows; + _totalRowCounter += numRows; + _normalRowCounter += numRows; - _partitions.pop_front(); - _curRowCounter = 0; - _byteCounter = 0; + _curNormalRowCounter = 0; + _curNormalByteCounter = 0; return first; } @@ -121,23 +199,25 @@ namespace tuplex { v.reserve(limit); // do a quick check whether there are ANY pyobjects, if not deserialize quickly! - if(_pyobjects.empty()) { - - if(_partitions.empty()) + if(_currentGeneralPartitions.empty() && _remainingGeneralPartitions.empty() && _currentFallbackPartitions.empty() && _remainingFallbackPartitions.empty()) { + if (_currentNormalPartitions.empty() && _remainingNormalPartitions.empty()) return vector<Row>{}; + for (const auto &p : _remainingNormalPartitions) + _currentNormalPartitions.push_back(p); + Deserializer ds(_schema); for(int i = 0; i < limit;) { // all exhausted - if(_partitions.empty()) + if(_currentNormalPartitions.empty()) break; // get number of rows in first partition - Partition *first = _partitions.front(); + Partition *first = _currentNormalPartitions.front(); auto num_rows = first->getNumRows(); // how many left to retrieve?
- auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); + auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curNormalRowCounter); if(num_to_retrieve_from_partition <= 0) break; @@ -148,11 +228,11 @@ // get next element of partition const uint8_t* ptr = first->lock(); for(int j = 0; j < num_to_retrieve_from_partition; ++j) { - auto row = Row::fromMemory(ds, ptr + _byteCounter, first->capacity() - _byteCounter); - _byteCounter += row.serializedLength(); - _curRowCounter++; - _rowsRetrieved++; + auto row = Row::fromMemory(ds, ptr + _curNormalByteCounter, first->capacity() - _curNormalByteCounter); + _curNormalByteCounter += row.serializedLength(); + _curNormalRowCounter++; _totalRowCounter++; + _normalRowCounter++; v.push_back(row); } @@ -163,17 +243,13 @@ i += num_to_retrieve_from_partition; // get next Partition ready when current one is exhausted - if(_curRowCounter == first->getNumRows()) - removeFirstPartition(); + if(_curNormalRowCounter == first->getNumRows()) + removeFirstNormalPartition(); } v.shrink_to_fit(); return v; } else { - // fallback solution: - // @TODO: write faster version with proper merging! - - std::vector<Row> v; while (hasNextRow() && v.size() < limit) { v.push_back(getNextRow()); } @@ -182,81 +258,252 @@ } } - Row ResultSet::getNextRow() { - // merge rows from objects - if(!_pyobjects.empty()) { - auto row_number = std::get<0>(_pyobjects.front()); - auto obj = std::get<1>(_pyobjects.front()); - - // partitions empty? - // => simply return next row. no fancy merging possible - // else merge based on row number. - if(_partitions.empty() || row_number <= _totalRowCounter) { - // merge - python::lockGIL(); - auto row = python::pythonToRow(obj); - python::unlockGIL(); - _pyobjects.pop_front(); - _rowsRetrieved++; - - // update row counter (not for double indices which could occur from flatMap!) - if(_pyobjects.empty()) - _totalRowCounter++; - else { - auto next_row_number = std::get<0>(_pyobjects.front()); - if(next_row_number != row_number) - _totalRowCounter++; - } + bool ResultSet::hasNextNormalRow() { + if (!_currentNormalPartitions.empty() && _curNormalRowCounter < _currentNormalPartitions.front()->getNumRows()) + return true; + for (const auto &p : _remainingNormalPartitions) + if (p->getNumRows() > 0) + return true; + return false; + } + + bool ResultSet::hasNextGeneralRow() { + if (!_currentGeneralPartitions.empty() && _curGeneralRowCounter < _currentGeneralPartitions.front()->getNumRows()) + return true; + for (const auto &p : _remainingGeneralPartitions) + if (p->getNumRows() > 0) + return true; + return false; + } + + bool ResultSet::hasNextFallbackRow() { + if (!_currentFallbackPartitions.empty() && _curFallbackRowCounter < _currentFallbackPartitions.front()->getNumRows()) + return true; + for (const auto &p : _remainingFallbackPartitions) + if (p->getNumRows() > 0) + return true; + return false; + } + + bool ResultSet::hasNextRow() { + // all rows already retrieved?
+ if(_totalRowCounter >= _maxRows) + return false; + + return hasNextNormalRow() || hasNextGeneralRow() || hasNextFallbackRow(); + } - return row; + Row ResultSet::getNextRow() { + if (_currentNormalPartitions.empty() && _currentFallbackPartitions.empty() && _currentGeneralPartitions.empty()) { + // all partitions are exhausted return empty row as default value + if (_partitionGroups.empty()) + return Row(); + _normalRowCounter = 0; + _generalRowCounter = 0; + _fallbackRowCounter = 0; + auto group = _partitionGroups.front(); + _partitionGroups.pop_front(); + for (int i = group.normalPartitionStartInd; i < group.normalPartitionStartInd + group.numNormalPartitions; ++i) { + _currentNormalPartitions.push_back(_remainingNormalPartitions.front()); + _remainingNormalPartitions.pop_front(); + } + for (int i = group.generalPartitionStartInd; i < group.generalPartitionStartInd + group.numGeneralPartitions; ++i) { + _currentGeneralPartitions.push_back(_remainingGeneralPartitions.front()); + _remainingGeneralPartitions.pop_front(); + } + for (int i = group.fallbackPartitionStartInd; i < group.fallbackPartitionStartInd + group.numFallbackPartitions; ++i) { + _currentFallbackPartitions.push_back(_remainingFallbackPartitions.front()); + _remainingFallbackPartitions.pop_front(); + } + return getNextRow(); + } else if (_currentNormalPartitions.empty() && _currentFallbackPartitions.empty()) { + // only general rows remain, return next general row + return getNextGeneralRow(); + } else if (_currentNormalPartitions.empty() && _currentGeneralPartitions.empty()) { + // only fallback rows remain, return next fallback row + return getNextFallbackRow(); + } else if (_currentFallbackPartitions.empty() && _currentGeneralPartitions.empty()) { + // only normal rows remain, return next normal row + return getNextNormalRow(); + } else if (_currentFallbackPartitions.empty()) { + // only normal and general rows remain, compare row index + // emit normal rows until reached current general ind + if (_normalRowCounter + _generalRowCounter < currentGeneralRowInd()) { + return getNextNormalRow(); + } else { + return getNextGeneralRow(); + } + } else if (_currentGeneralPartitions.empty()) { + // only normal and fallback rows remain, compare row index + // emit normal rows until reached current fallback ind + if (_normalRowCounter + _generalRowCounter + _fallbackRowCounter < currentFallbackRowInd()) { + return getNextNormalRow(); + } else { + return getNextFallbackRow(); + } + } else { + // all three cases remain, three way row comparison + auto generalRowInd = currentGeneralRowInd(); + auto fallbackRowInd = currentFallbackRowInd(); + if (_normalRowCounter + _generalRowCounter < generalRowInd && _normalRowCounter + _generalRowCounter + _fallbackRowCounter < fallbackRowInd) { + return getNextNormalRow(); + } else if (generalRowInd <= fallbackRowInd) { + return getNextGeneralRow(); + } else { + return getNextFallbackRow(); } } + } - // check whether entry is available, else return empty row - if(_partitions.empty()) - return Row(); + int64_t ResultSet::currentFallbackRowInd() { + assert(!_currentFallbackPartitions.empty()); + auto p = _currentFallbackPartitions.front(); + auto ptr = p->lock() + _curFallbackByteCounter; + auto rowInd = *((int64_t*) ptr); + p->unlock(); + return rowInd; + } - assert(_partitions.size() > 0); - Partition *first = _partitions.front(); + int64_t ResultSet::currentGeneralRowInd() { + assert(!_currentGeneralPartitions.empty()); + auto p = _currentGeneralPartitions.front(); + auto ptr = p->lock() + 
_curGeneralByteCounter; + auto rowInd = *((int64_t*) ptr); + p->unlock(); + return rowInd; + } - // make sure partition schema matches stored schema - assert(_schema == first->schema()); + Row ResultSet::getNextNormalRow() { + assert (!_currentNormalPartitions.empty()); + auto p = _currentNormalPartitions.front(); + assert(_schema == p->schema()); - Row row; + auto ptr = p->lock() + _curNormalByteCounter; + auto capacity = p->capacity() - _curNormalByteCounter; + auto row = Row::fromMemory(_schema, ptr, capacity); + p->unlock(); - // thread safe version (slow) - // get next element of partition - const uint8_t* ptr = first->lock(); + _curNormalByteCounter += row.serializedLength(); + _curNormalRowCounter++; + _totalRowCounter++; + _normalRowCounter++; - row = Row::fromMemory(_schema, ptr + _byteCounter, first->capacity() - _byteCounter); + if (_curNormalRowCounter == p->getNumRows()) { + removeFirstNormalPartition(); + } - // thread safe version (slow) - // deserialize - first->unlock(); + return row; + } + + Row ResultSet::getNextGeneralRow() { + assert (!_currentGeneralPartitions.empty()); + auto p = _currentGeneralPartitions.front(); + assert(_schema == p->schema()); + + auto prevRowInd = currentGeneralRowInd(); + _curGeneralByteCounter += 4 * sizeof(int64_t); + auto ptr = p->lock() + _curGeneralByteCounter; + auto capacity = p->capacity() - _curGeneralByteCounter; + auto row = Row::fromMemory(_schema, ptr, capacity); + p->unlock(); + + _curGeneralByteCounter += row.serializedLength(); + _curGeneralRowCounter++; + + if (_curGeneralRowCounter == p->getNumRows()) { + removeFirstGeneralPartition(); + } - _byteCounter += row.serializedLength(); - _curRowCounter++; - _rowsRetrieved++; _totalRowCounter++; + if (_currentGeneralPartitions.empty() || currentGeneralRowInd() > prevRowInd) { + _generalRowCounter++; + } + + return row; + } + + Row ResultSet::getNextFallbackRow() { + assert (!_currentFallbackPartitions.empty()); + + auto prevRowInd = currentFallbackRowInd(); + auto p = _currentFallbackPartitions.front(); + auto ptr = p->lock() + _curFallbackByteCounter; + auto pyObjectSize = ((int64_t *) ptr)[3]; ptr += 4 * sizeof(int64_t); + + python::lockGIL(); + auto row = python::pythonToRow(python::deserializePickledObject(python::getMainModule(), (char *) ptr, pyObjectSize)); + python::unlockGIL(); + + p->unlock(); + + _curFallbackByteCounter += pyObjectSize + 4*sizeof(int64_t); + _curFallbackRowCounter++; + + if (_curFallbackRowCounter == p->getNumRows()) { + removeFirstFallbackPartition(); + } - // get next Partition ready when current one is exhausted - if(_curRowCounter == first->getNumRows()) - removeFirstPartition(); + _totalRowCounter++; + if (_currentFallbackPartitions.empty() || currentFallbackRowInd() > prevRowInd) { + _fallbackRowCounter++; + } return row; } size_t ResultSet::rowCount() const { size_t count = 0; - for(const auto& partition : _partitions) { + for (const auto& partition : _currentNormalPartitions) count += partition->getNumRows(); - } - return count + _pyobjects.size(); + for (const auto& partition : _remainingNormalPartitions) + count += partition->getNumRows(); + for (const auto& partition : _currentGeneralPartitions) + count += partition->getNumRows(); + for (const auto& partition : _remainingGeneralPartitions) + count += partition->getNumRows(); + for (const auto& partition : _currentFallbackPartitions) + count += partition->getNumRows(); + for (const auto& partition : _remainingFallbackPartitions) + count += partition->getNumRows(); + return count; } - void 
ResultSet::removeFirstPartition() { - assert(_partitions.size() > 0); - Partition *first = _partitions.front(); + void ResultSet::removeFirstGeneralPartition() { + assert(!_currentGeneralPartitions.empty()); + Partition *first = _currentGeneralPartitions.front(); + assert(first); + + // invalidate partition +#ifndef NDEBUG + Logger::instance().defaultLogger().info("ResultSet invalidates partition " + hexAddr(first) + " uuid " + uuidToString(first->uuid())); +#endif + first->invalidate(); + + _currentGeneralPartitions.pop_front(); + _curGeneralRowCounter = 0; + _curGeneralByteCounter = 0; + } + + void ResultSet::removeFirstFallbackPartition() { + assert(!_currentFallbackPartitions.empty()); + Partition *first = _currentFallbackPartitions.front(); + assert(first); + + // invalidate partition +#ifndef NDEBUG + Logger::instance().defaultLogger().info("ResultSet invalidates partition " + hexAddr(first) + " uuid " + uuidToString(first->uuid())); +#endif + first->invalidate(); + + // remove partition (is now processed) + _currentFallbackPartitions.pop_front(); + _curFallbackRowCounter = 0; + _curFallbackByteCounter = 0; + } + + void ResultSet::removeFirstNormalPartition() { + assert(!_currentNormalPartitions.empty()); + Partition *first = _currentNormalPartitions.front(); assert(first); // invalidate partition @@ -266,8 +513,18 @@ namespace tuplex { first->invalidate(); // remove partition (is now processed) - _partitions.pop_front(); - _curRowCounter = 0; - _byteCounter = 0; + + _currentNormalPartitions.pop_front(); + _curNormalRowCounter = 0; + _curNormalByteCounter = 0; + } + + size_t ResultSet::fallbackRowCount() const { + size_t count = 0; + for (const auto &p : _currentFallbackPartitions) + count += p->getNumRows(); + for (const auto& p : _remainingFallbackPartitions) + count += p->getNumRows(); + return count; } } \ No newline at end of file diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index 72f01e2b8..12e3b24a6 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -46,10 +46,11 @@ namespace tuplex { double normalCaseThreshold, bool sharedObjectPropagation, bool nullValueOptimization, - bool updateInputExceptions) + bool updateInputExceptions, + bool incrementalResolution) : _stageNumber(stage_number), _isRootStage(rootStage), _allowUndefinedBehavior(allowUndefinedBehavior), _generateParser(generateParser), _normalCaseThreshold(normalCaseThreshold), _sharedObjectPropagation(sharedObjectPropagation), - _nullValueOptimization(nullValueOptimization), _updateInputExceptions(updateInputExceptions), + _nullValueOptimization(nullValueOptimization), _updateInputExceptions(updateInputExceptions), _incrementalResolution(incrementalResolution), _inputNode(nullptr), _outputLimit(std::numeric_limits<size_t>::max()) { } @@ -1051,7 +1052,7 @@ namespace tuplex { bool requireSlowPath = _nullValueOptimization; // by default, the slow path is always required when null-value optimization is enabled. // special case: input source is cached and no exceptions happened => no resolve path necessary if there are no resolvers! 
- if(_inputNode->type() == LogicalOperatorType::CACHE && dynamic_cast<CacheOperator*>(_inputNode)->cachedExceptions().empty()) + if(_inputNode->type() == LogicalOperatorType::CACHE && dynamic_cast<CacheOperator*>(_inputNode)->cachedGeneralPartitions().empty() && dynamic_cast<CacheOperator*>(_inputNode)->cachedFallbackPartitions().empty()) requireSlowPath = false; if (numResolveOperators > 0 || requireSlowPath) { @@ -1443,7 +1444,9 @@ namespace tuplex { stage->_irBitCode = _irBitCode; stage->_pyCode = _pyCode; stage->_pyPipelineName = _pyPipelineName; + stage->_updateInputExceptions = _updateInputExceptions; + stage->_incrementalResolution = _incrementalResolution; // if last op is CacheOperator, check whether normal/exceptional case should get cached separately // or an upcasting step should be performed. diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index b61f9cbe2..0be4d1a45 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -110,7 +110,22 @@ namespace tuplex { int64_t dataSetID = 0; // no ID here _inputPartitions = rowsToPartitions(backend()->driver(), dataSetID, context().id(), rows); } + void TransformStage::setIncrementalResult(const std::vector<Partition*>& normalPartitions, + const std::vector<Partition*>& exceptionPartitions, + const std::vector<PartitionGroup>& partitionGroups) { + auto pipeline = PhysicalStage::plan()->originalLogicalPlan()->getAction(); + auto cacheEntry = new IncrementalCacheEntry(pipeline, normalPartitions, exceptionPartitions, partitionGroups); + PhysicalStage::plan()->getContext().getIncrementalCache()->addEntry(IncrementalCache::newKey(pipeline), cacheEntry); + } + void TransformStage::setIncrementalResult(const std::vector<Partition*>& exceptionPartitions, + const std::vector<Partition*>& generalPartitions, + const std::vector<Partition*>& fallbackPartitions, + size_t startFileNumber) { + auto pipeline = PhysicalStage::plan()->originalLogicalPlan()->getAction(); + auto cacheEntry = new IncrementalCacheEntry(pipeline, exceptionPartitions, generalPartitions, fallbackPartitions, startFileNumber); + PhysicalStage::plan()->getContext().getIncrementalCache()->addEntry(IncrementalCache::newKey(pipeline), cacheEntry); + } void TransformStage::setFileResult(const std::unordered_map<std::tuple<int64_t, ExceptionCode>, size_t> &ecounts) { setExceptionCounts(ecounts); @@ -118,29 +133,28 @@ namespace tuplex { _rs = emptyResultSet(); } - void TransformStage::setMemoryResult(const std::vector<Partition*> &partitions, - const std::vector<Partition*>& generalCase, - const std::unordered_map<std::string, ExceptionInfo>& partitionToExceptionsMap, - const std::vector<std::tuple<size_t, PyObject*>>& interpreterRows, - const std::vector<Partition*>& remainingExceptions, - const std::unordered_map<std::tuple<int64_t, ExceptionCode>, size_t> &ecounts) { - setExceptionCounts(ecounts); + void TransformStage::setMemoryResult(const std::vector<Partition*>& normalPartitions, + const std::vector<Partition*>& generalPartitions, + const std::vector<Partition*>& fallbackPartitions, + const std::vector<PartitionGroup>& partitionGroups, + const std::unordered_map<std::tuple<int64_t, ExceptionCode>, size_t>& exceptionCounts) { + setExceptionCounts(exceptionCounts); - if (partitions.empty() && interpreterRows.empty() && generalCase.empty()) + if (normalPartitions.empty() && generalPartitions.empty() && fallbackPartitions.empty()) _rs = emptyResultSet(); else { std::vector<Partition*> limitedPartitions; auto schema = Schema::UNKNOWN; - if(!partitions.empty()) { - schema = partitions.front()->schema(); - for (auto partition : partitions) { + if(!normalPartitions.empty()) { + schema = normalPartitions.front()->schema(); + for (auto partition : normalPartitions) { assert(schema == partition->schema()); } // check output limit, adjust partitions if necessary size_t 
numOutputRows = 0; - for (auto partition : partitions) { + for (auto partition : normalPartitions) { numOutputRows += partition->getNumRows(); if (numOutputRows >= outputLimit()) { // clip last partition & leave loop @@ -157,10 +171,7 @@ namespace tuplex { } } - // put ALL partitions to result set - _rs = std::make_shared<ResultSet>(schema, limitedPartitions, - generalCase, partitionToExceptionsMap, interpreterRows, - outputLimit()); + _rs = std::make_shared<ResultSet>(schema, limitedPartitions, generalPartitions, fallbackPartitions, partitionGroups, outputLimit()); } } @@ -654,7 +665,7 @@ namespace tuplex { } case EndPointMode::MEMORY: case EndPointMode::FILE: { - auto p = stage->resultSet()->partitions(); + auto p = stage->resultSet()->normalPartitions(); std::copy(std::begin(p), std::end(p), std::back_inserter(partitions)); break; } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index c560c4af4..1d24d6fdc 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -543,43 +543,65 @@ namespace tuplex { auto functor = reinterpret_cast(_functor); - auto numInputExceptions = _inputExceptionInfo.numExceptions; - auto inputExceptionIndex = _inputExceptionInfo.exceptionIndex; - auto inputExceptionRowOffset = _inputExceptionInfo.exceptionRowOffset; - auto inputExceptionByteOffset = _inputExceptionInfo.exceptionByteOffset; - - // First, prepare the input exception partitions to pass into the code-gen - // This is done to simplify the LLVM code. We will end up passing it an - // array of expPtrs which point to the first exception in their partition - // and expPtrSizes which tell how many exceptions are in that partition. - auto arrSize = _inputExceptions.size() - inputExceptionIndex; - auto expPtrs = new uint8_t*[arrSize]; - auto expPtrSizes = new int64_t[arrSize]; - int expInd = 0; - // Iterate through all exception partitions beginning at the one specified by the starting index - for (int i = inputExceptionIndex; i < _inputExceptions.size(); ++i) { - auto numRows = _inputExceptions[i]->getNumRows(); - auto ptr = _inputExceptions[i]->lock(); - // If it's the first partition, we need to account for the offset - if (i == inputExceptionIndex) { - numRows -= inputExceptionRowOffset; - ptr += inputExceptionByteOffset; - } - expPtrSizes[expInd] = numRows; - expPtrs[expInd] = (uint8_t *) ptr; - expInd++; - } + int64_t totalNormalRowCounter = 0; + int64_t totalGeneralRowCounter = 0; + int64_t totalFallbackRowCounter = 0; + int64_t totalFilterCounter = 0; + + uint8_t **generalPartitions = new uint8_t*[_generalPartitions.size()]; + for (int i = 0; i < _generalPartitions.size(); ++i) + generalPartitions[i] = _generalPartitions[i]->lockWriteRaw(); + int64_t numGeneralPartitions = _generalPartitions.size(); + int64_t generalIndexOffset = 0; + int64_t generalRowOffset = 0; + int64_t generalByteOffset = 0; + + uint8_t **fallbackPartitions = new uint8_t*[_fallbackPartitions.size()]; + for (int i = 0; i < _fallbackPartitions.size(); ++i) + fallbackPartitions[i] = _fallbackPartitions[i]->lockWriteRaw(); + int64_t numFallbackPartitions = _fallbackPartitions.size(); + int64_t fallbackIndexOffset = 0; + int64_t fallbackRowOffset = 0; + int64_t fallbackByteOffset = 0; // go over all input partitions. - for(auto inputPartition : _inputPartitions) { + for(auto &inputPartition : _inputPartitions) { // lock ptr, extract number of rows ==> store them // lock raw & call functor! 
int64_t inSize = inputPartition->size(); const uint8_t *inPtr = inputPartition->lockRaw(); _numInputRowsRead += static_cast<size_t>(*((int64_t*)inPtr)); +// +// int64_t totalNormalRowCounter = 0; +// int64_t totalGeneralRowCounter = 0; +// int64_t totalFallbackRowCounter = 0; +// +// int64_t g1[] = {2, +// 1, -1, -1, 8, -1, +// 2, -1, -1, 8, -1}; +// int64_t g2[] = {1, +// 3, -1, -1, 8, -1}; +// int64_t g3[] = {2, +// 5, -1, -1, 8, -1, +// 6, -1, -1, 8, -1}; +// uint8_t *generalPartitions[] = {(uint8_t*)g1, (uint8_t*)g2, (uint8_t*)g3}; +// int64_t numGeneralPartitions = 3; +// int64_t generalIndexOffset = 0; +// int64_t generalRowOffset = 0; +// int64_t generalByteOffset = 0; +// +// int64_t f1[] = {1, 2, 3}; +// uint8_t *fallbackPartitions[] = {}; +// int64_t numFallbackPartitions = 0; +// int64_t fallbackIndexOffset = 0; +// int64_t fallbackRowOffset = 0; +// int64_t fallbackByteOffset = 0; // call functor - auto bytesParsed = functor(this, inPtr, inSize, expPtrs, expPtrSizes, numInputExceptions, &num_normal_rows, &num_bad_rows, false); + auto bytesParsed = functor(this, inPtr, inSize, &num_normal_rows, &num_bad_rows, false, + &totalFilterCounter, &totalNormalRowCounter, &totalGeneralRowCounter, &totalFallbackRowCounter, + generalPartitions, numGeneralPartitions, &generalIndexOffset, &generalRowOffset, &generalByteOffset, + fallbackPartitions, numFallbackPartitions, &fallbackIndexOffset, &fallbackRowOffset, &fallbackByteOffset); // save number of normal rows to output rows written if not writeToFile if(hasMemorySink()) @@ -595,13 +617,55 @@ namespace tuplex { inputPartition->invalidate(); } - delete[] expPtrs; - delete[] expPtrSizes; + if (generalIndexOffset < numGeneralPartitions) { + auto curGeneralPtr = generalPartitions[generalIndexOffset]; + auto numRowsInPartition = *((int64_t*)curGeneralPtr); + curGeneralPtr += sizeof(int64_t) + generalByteOffset; + while (generalRowOffset < numRowsInPartition) { + *((int64_t*)curGeneralPtr) -= totalFilterCounter; + curGeneralPtr += 4 * sizeof(int64_t) + ((int64_t*)curGeneralPtr)[3]; + generalRowOffset += 1; + + if (generalRowOffset == numRowsInPartition && generalIndexOffset < numGeneralPartitions - 1) { + generalIndexOffset += 1; + curGeneralPtr = generalPartitions[generalIndexOffset]; + numRowsInPartition = *((int64_t*)curGeneralPtr); + curGeneralPtr += sizeof(int64_t); + generalByteOffset = 0; + generalRowOffset = 0; + } + } + } - for (int i = inputExceptionIndex; i < _inputExceptions.size(); ++i) { - _inputExceptions[i]->unlock(); + if (fallbackIndexOffset < numFallbackPartitions) { + auto curFallbackPtr = fallbackPartitions[fallbackIndexOffset]; + auto numRowsInPartition = *((int64_t*)curFallbackPtr); + curFallbackPtr += sizeof(int64_t) + fallbackByteOffset; + while (fallbackRowOffset < numRowsInPartition) { + *((int64_t*)curFallbackPtr) -= totalFilterCounter; + curFallbackPtr += 4 * sizeof(int64_t) + ((int64_t*)curFallbackPtr)[3]; + fallbackRowOffset += 1; + + if (fallbackRowOffset == numRowsInPartition && fallbackIndexOffset < numFallbackPartitions - 1) { + fallbackIndexOffset += 1; + curFallbackPtr = fallbackPartitions[fallbackIndexOffset]; + numRowsInPartition = *((int64_t*)curFallbackPtr); + curFallbackPtr += sizeof(int64_t); + fallbackByteOffset = 0; + fallbackRowOffset = 0; + } + } } + for (auto & _generalPartition : _generalPartitions) + _generalPartition->unlockWrite(); + + for (auto & _fallbackPartition : _fallbackPartitions) + _fallbackPartition->unlockWrite(); + + delete[] fallbackPartitions; + delete[] generalPartitions; + #ifndef 
NDEBUG owner()->info("Trafo task memory source exhausted (" + pluralize(_inputPartitions.size(), "partition") + ", " + pluralize(num_normal_rows, "normal row") + ", " + pluralize(num_bad_rows, "exceptional row") + ")"); diff --git a/tuplex/python/include/PythonContext.h b/tuplex/python/include/PythonContext.h index 66e87523b..b3888f342 100644 --- a/tuplex/python/include/PythonContext.h +++ b/tuplex/python/include/PythonContext.h @@ -117,6 +117,8 @@ namespace tuplex { pds.wrap(&_context->makeError(message)); return pds; } + + std::vector<Partition*> serializeFallbackRows(const std::vector<std::tuple<size_t, PyObject*>>& fallbackRows); public: /*! diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 665d68856..14eff690c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -140,7 +140,8 @@ namespace tuplex { size_t shardSize=0, size_t limit=std::numeric_limits<size_t>::max(), const std::string& null_value="", - py::object header=py::none()); + py::object header=py::none(), + const bool commit=true); /*! * save dataset to one or more orc files. Triggers execution of pipeline. diff --git a/tuplex/python/src/PythonContext.cc b/tuplex/python/src/PythonContext.cc index b70be013a..6425cdad2 100644 --- a/tuplex/python/src/PythonContext.cc +++ b/tuplex/python/src/PythonContext.cc @@ -39,35 +39,35 @@ namespace tuplex { Schema schema(Schema::MemoryLayout::ROW, python::Type::makeTupleType({python::Type::F64})); - std::vector<std::tuple<size_t, PyObject*>> badParallelizeObjects; - std::vector<size_t> numExceptionsInPartition; - // check if empty? if(0 == numElements) - return _context->fromPartitions(schema, std::vector<Partition*>(), columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, std::vector<Partition*>(), std::vector<Partition*>(), std::vector<PartitionGroup>(), columns); // create new partition on driver auto driver = _context->getDriver(); + std::vector<std::tuple<size_t, PyObject*>> fallbackRows; + std::vector<Partition*> fallbackPartitions; + std::vector<PartitionGroup> partitionMergeInfo; + std::vector<Partition*> partitions; Partition* partition = driver->allocWritablePartition(allocMinSize, schema, -1, _context->id()); int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); *rawPtr = 0; double* ptr = (double*)(rawPtr + 1); size_t numBytesSerialized = 0; - size_t prevNumExceptions = 0; - size_t prevNumRows = 0; + auto rowDelta = 0; for(unsigned i = 0; i < numElements; ++i) { auto obj = PyList_GET_ITEM(listObj, i); Py_XINCREF(obj); // check capacity and realloc if necessary get a new partition if(partition->capacity() < numBytesSerialized + sizeof(double)) { - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); - prevNumExceptions = badParallelizeObjects.size(); - prevNumRows += numNewExceptions + *rawPtr; + rowDelta += *rawPtr + fallbackRows.size(); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); @@ -89,15 +89,15 @@ namespace tuplex { val = (double)PyLong_AsLongLong(obj); if(PyErr_Occurred()) { // too large integer? 
PyErr_Clear(); - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, obj)); + assert(i >= rowDelta); + fallbackRows.emplace_back(std::make_tuple(i - rowDelta, obj)); continue; } } } else { - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, obj)); + assert(i >= rowDelta); + fallbackRows.emplace_back(std::make_tuple(i - rowDelta, obj)); continue; } } @@ -108,15 +108,16 @@ namespace tuplex { numBytesSerialized += sizeof(double); } - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); // create dataset from partitions. - return _context->fromPartitions(schema, partitions, columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, partitions, fallbackPartitions, partitionMergeInfo, columns); } DataSet& PythonContext::fastI64Parallelize(PyObject* listObj, const std::vector<std::string>& columns, bool upcast) { @@ -127,16 +128,17 @@ namespace tuplex { Schema schema(Schema::MemoryLayout::ROW, python::Type::makeTupleType({python::Type::I64})); - std::vector<std::tuple<size_t, PyObject*>> badParallelizeObjects; - std::vector<size_t> numExceptionsInPartition; - // check if empty? if(0 == numElements) - return _context->fromPartitions(schema, std::vector<Partition*>(), columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, std::vector<Partition*>(), std::vector<Partition*>(), std::vector<PartitionGroup>(), columns); // create new partition on driver auto driver = _context->getDriver(); + std::vector<std::tuple<size_t, PyObject*>> fallbackRows; + std::vector<Partition*> fallbackPartitions; + std::vector<PartitionGroup> partitionMergeInfo; + std::vector<Partition*> partitions; Partition* partition = driver->allocWritablePartition(std::max(sizeof(int64_t), allocMinSize), schema, -1, _context->id()); int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); @@ -144,18 +146,18 @@ namespace tuplex { int64_t* ptr = rawPtr + 1; size_t numBytesSerialized = 0; size_t prevNumExceptions = 0; - size_t prevNumRows = 0; + auto rowDelta = 0; for(unsigned i = 0; i < numElements; ++i) { auto obj = PyList_GET_ITEM(listObj, i); Py_XINCREF(obj); // check capacity and realloc if necessary get a new partition if(partition->capacity() < numBytesSerialized + sizeof(int64_t)) { - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); - prevNumExceptions = badParallelizeObjects.size(); - prevNumRows += numNewExceptions + *rawPtr; + rowDelta += *rawPtr + fallbackRows.size(); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); @@ -171,8 +173,8 @@ namespace tuplex { val = PyLong_AsLongLong(obj); if(PyErr_Occurred()) { // too large integer? 
PyErr_Clear(); - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, obj)); + assert(i >= rowDelta); + fallbackRows.emplace_back(std::make_tuple(i - rowDelta, obj)); continue; } } else { @@ -180,8 +182,8 @@ namespace tuplex { if(upcast && (obj == Py_True || obj == Py_False)) val = obj == Py_True; else { - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, obj)); + assert(i >= rowDelta); + fallbackRows.emplace_back(std::make_tuple(i - rowDelta, obj)); continue; } } @@ -191,15 +193,16 @@ namespace tuplex { *rawPtr = *rawPtr + 1; numBytesSerialized += sizeof(int64_t); } - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); // create dataset from partitions. - return _context->fromPartitions(schema, partitions, columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, partitions, fallbackPartitions, partitionMergeInfo, columns); } DataSet& PythonContext::fastMixedSimpleTypeTupleTransfer(PyObject *listObj, const python::Type &majType, @@ -215,12 +218,9 @@ namespace tuplex { // now create partitions super fast Schema schema(Schema::MemoryLayout::ROW, majType); - std::vector<std::tuple<size_t, PyObject*>> badParallelizeObjects; - std::vector<size_t> numExceptionsInPartition; - // check if empty? 
if(0 == numElements) - return _context->fromPartitions(schema, std::vector<Partition*>(), columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, std::vector<Partition*>(), std::vector<Partition*>(), std::vector<PartitionGroup>(), columns); // encode type of tuple quickly into string @@ -232,6 +232,10 @@ namespace tuplex { // create new partition on driver auto driver = _context->getDriver(); + std::vector<std::tuple<size_t, PyObject*>> fallbackRows; + std::vector<Partition*> fallbackPartitions; + std::vector<PartitionGroup> partitionMergeInfo; + std::vector<Partition*> partitions; Partition* partition = driver->allocWritablePartition(allocMinSize, schema, -1, _context->id()); int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); @@ -239,7 +243,7 @@ namespace tuplex { uint8_t* ptr = (uint8_t*)(rawPtr + 1); size_t numBytesSerialized = 0; size_t prevNumExceptions = 0; - size_t prevNumRows = 0; + auto rowDelta = 0; for(unsigned i = 0; i < numElements; ++i) { auto obj = PyList_GET_ITEM(listObj, i); Py_XINCREF(obj); @@ -266,19 +270,19 @@ namespace tuplex { } } if (nonConforming) { - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(i - prevNumRows, obj); + assert(i >= rowDelta); + fallbackRows.emplace_back(i - rowDelta, obj); continue; } } // get new partition if capacity exhausted if(partition->capacity() < numBytesSerialized + requiredBytes) { - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); - prevNumExceptions = badParallelizeObjects.size(); - prevNumRows += numNewExceptions + *rawPtr; + rowDelta += *rawPtr + fallbackRows.size(); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); @@ -358,11 +362,11 @@ namespace tuplex { // special part when bad row encountered bad_element: ptr = rowStartPtr; - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, obj)); + assert(i >= rowDelta); + fallbackRows.emplace_back(i - rowDelta, obj); } else { - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, obj)); + assert(i >= rowDelta); + fallbackRows.emplace_back(i - rowDelta, obj); } // serialization code here is a little bit more complicated @@ -371,9 +375,10 @@ namespace tuplex { // (2) is the field containing total varlength // (3) is the actual string content (incl. '\0' delimiter) } - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); @@ -381,7 +386,7 @@ namespace tuplex { delete [] typeStr; // create dataset from partitions. 
- return _context->fromPartitions(schema, partitions, columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, partitions, fallbackPartitions, partitionMergeInfo, columns); } DataSet& PythonContext::fastBoolParallelize(PyObject *listObj, const std::vector<std::string>& columns) { @@ -392,17 +397,18 @@ namespace tuplex { Schema schema(Schema::MemoryLayout::ROW, python::Type::makeTupleType({python::Type::BOOLEAN})); - std::vector<std::tuple<size_t, PyObject*>> badParallelizeObjects; - std::vector<size_t> numExceptionsInPartition; - // check if empty? if(0 == numElements) - return _context->fromPartitions(schema, std::vector<Partition*>(), columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, std::vector<Partition*>(), std::vector<Partition*>(), std::vector<PartitionGroup>(), columns); // create new partition on driver auto driver = _context->getDriver(); + std::vector<std::tuple<size_t, PyObject*>> fallbackRows; + std::vector<Partition*> fallbackPartitions; + std::vector<PartitionGroup> partitionMergeInfo; + std::vector<Partition*> partitions; Partition* partition = driver->allocWritablePartition(std::max(sizeof(int64_t), allocMinSize), schema, -1, _context->id()); int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); @@ -410,18 +416,18 @@ namespace tuplex { int64_t* ptr = rawPtr + 1; size_t numBytesSerialized = 0; size_t prevNumExceptions = 0; - size_t prevNumRows = 0; + auto rowDelta = 0; for(unsigned i = 0; i < numElements; ++i) { auto obj = PyList_GET_ITEM(listObj, i); Py_XINCREF(obj); // check capacity and realloc if necessary get a new partition if(partition->capacity() < numBytesSerialized + sizeof(int64_t)) { - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); - prevNumExceptions = badParallelizeObjects.size(); - prevNumRows += numNewExceptions + *rawPtr; + rowDelta += *rawPtr + fallbackRows.size(); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); @@ -438,20 +444,20 @@ namespace tuplex { *rawPtr = *rawPtr + 1; numBytesSerialized += sizeof(int64_t); } else { - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, obj)); + assert(i >= rowDelta); + fallbackRows.emplace_back(i - rowDelta, obj); } } - - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); // create dataset from partitions. 
- return _context->fromPartitions(schema, partitions, columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, partitions, fallbackPartitions, partitionMergeInfo, columns); } DataSet& PythonContext::fastStrParallelize(PyObject* listObj, const std::vector<std::string>& columns) { @@ -462,17 +468,18 @@ namespace tuplex { Schema schema(Schema::MemoryLayout::ROW, python::Type::makeTupleType({python::Type::STRING})); - std::vector<std::tuple<size_t, PyObject*>> badParallelizeObjects; - std::vector<size_t> numExceptionsInPartition; - // check if empty? if(0 == numElements) - return _context->fromPartitions(schema, std::vector<Partition*>(), columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, std::vector<Partition*>(), std::vector<Partition*>(), std::vector<PartitionGroup>(), columns); // create new partition on driver auto driver = _context->getDriver(); + std::vector<std::tuple<size_t, PyObject*>> fallbackRows; + std::vector<Partition*> fallbackPartitions; + std::vector<PartitionGroup> partitionMergeInfo; + std::vector<Partition*> partitions; Partition* partition = driver->allocWritablePartition(allocMinSize, schema, -1, _context->id()); int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); @@ -480,7 +487,7 @@ namespace tuplex { uint8_t* ptr = (uint8_t*)(rawPtr + 1); size_t numBytesSerialized = 0; size_t prevNumExceptions = 0; - size_t prevNumRows = 0; + auto rowDelta = 0; for(unsigned i = 0; i < numElements; ++i) { auto obj = PyList_GET_ITEM(listObj, i); Py_XINCREF(obj); @@ -500,11 +507,11 @@ namespace tuplex { // check capacity and realloc if necessary get a new partition if(partition->capacity() < numBytesSerialized + requiredBytes) { - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); - prevNumExceptions = badParallelizeObjects.size(); - prevNumRows += numNewExceptions + *rawPtr; + rowDelta += *rawPtr + fallbackRows.size(); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); @@ -530,19 +537,20 @@ namespace tuplex { *rawPtr = *rawPtr + 1; numBytesSerialized += requiredBytes; } else { - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, obj)); + assert(i >= rowDelta); + fallbackRows.emplace_back(i - rowDelta, obj); } } - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); // create dataset from partitions. 
- return _context->fromPartitions(schema, partitions, columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, partitions, fallbackPartitions, partitionMergeInfo, columns); } // Returns true if t1 can be considered a subtype of t2, specifically in the context of Option types @@ -578,12 +586,9 @@ namespace tuplex { auto numElements = PyList_Size(listObj); logger.debug("transferring " + std::to_string(numElements) + " elements. "); - std::vector<std::tuple<size_t, PyObject*>> badParallelizeObjects; - std::vector<size_t> numExceptionsInPartition; - // check if empty? if(0 == numElements) - return _context->fromPartitions(schema, std::vector<Partition*>(), columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, std::vector<Partition*>(), std::vector<Partition*>(), std::vector<PartitionGroup>(), columns); auto firstRow = PyList_GET_ITEM(listObj, 0); Py_XINCREF(firstRow); @@ -592,6 +597,10 @@ namespace tuplex { // create new partition on driver auto driver = _context->getDriver(); + std::vector<std::tuple<size_t, PyObject*>> fallbackRows; + std::vector<Partition*> fallbackPartitions; + std::vector<PartitionGroup> partitionMergeInfo; + std::vector<Partition*> partitions; Partition* partition = driver->allocWritablePartition(allocMinSize, schema, -1, _context->id()); int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); @@ -599,7 +608,7 @@ namespace tuplex { uint8_t* ptr = (uint8_t*)(rawPtr + 1); size_t numBytesSerialized = 0; size_t prevNumExceptions = 0; - size_t prevNumRows = 0; + auto rowDelta = 0; for (unsigned i = 0; i < numElements; ++i) { // because this is a slow transfer loop, check explicitly for signals and free anything if there's something... @@ -611,10 +620,10 @@ namespace tuplex { logger.warn("slow transfer to backend interrupted."); // free items (decref) - for(auto t : badParallelizeObjects) { + for(auto t : fallbackRows) { Py_XDECREF(std::get<1>(t)); } - badParallelizeObjects.clear(); + fallbackRows.clear(); return _context->makeError("interrupted transfer"); } @@ -632,11 +641,11 @@ namespace tuplex { auto requiredBytes = row.serializedLength(); if(partition->capacity() < numBytesSerialized + requiredBytes) { - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); - prevNumExceptions = badParallelizeObjects.size(); - prevNumRows += numNewExceptions + *rawPtr; + rowDelta += *rawPtr + fallbackRows.size(); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); @@ -653,17 +662,18 @@ namespace tuplex { *rawPtr = *rawPtr + 1; numBytesSerialized += requiredBytes; } else - badParallelizeObjects.emplace_back(std::make_tuple(i - prevNumRows, item)); + fallbackRows.emplace_back(std::make_tuple(i - rowDelta, item)); } - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); 
partition->unlockWrite(); partitions.push_back(partition); // serialize in main memory - return _context->fromPartitions(schema, partitions, columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, partitions, fallbackPartitions, partitionMergeInfo, columns); } DataSet& PythonContext::strDictParallelize(PyObject *listObj, const python::Type &rowType, @@ -679,16 +689,17 @@ namespace tuplex { assert(rowType.parameters().size() == columns.size()); // also very important!!! Schema schema(Schema::MemoryLayout::ROW, rowType); - std::vector<std::tuple<size_t, PyObject*>> badParallelizeObjects; - std::vector<size_t> numExceptionsInPartition; - // check if empty? if(0 == numElements) - return _context->fromPartitions(schema, std::vector<Partition*>(), columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, std::vector<Partition*>(), std::vector<Partition*>(), std::vector<PartitionGroup>(), columns); // create new partition on driver auto driver = _context->getDriver(); + std::vector<std::tuple<size_t, PyObject*>> fallbackRows; + std::vector<Partition*> fallbackPartitions; + std::vector<PartitionGroup> partitionMergeInfo; + std::vector<Partition*> partitions; Partition* partition = driver->allocWritablePartition(allocMinSize, schema, -1, _context->id()); int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); @@ -696,7 +707,7 @@ namespace tuplex { uint8_t* ptr = (uint8_t*)(rawPtr + 1); size_t numBytesSerialized = 0; size_t prevNumExceptions = 0; - size_t prevNumRows = 0; + auto rowDelta = 0; for(unsigned i = 0; i < numElements; ++i) { auto obj = PyList_GET_ITEM(listObj, i); Py_XINCREF(obj); @@ -724,11 +735,11 @@ namespace tuplex { size_t requiredBytes = row.serializedLength(); // check capacity and realloc if necessary get a new partition if (partition->capacity() < numBytesSerialized + allocMinSize) { - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); - prevNumExceptions = badParallelizeObjects.size(); - prevNumRows += numNewExceptions + *rawPtr; + rowDelta += *rawPtr + fallbackRows.size(); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); @@ -744,24 +755,25 @@ namespace tuplex { *rawPtr = *rawPtr + 1; numBytesSerialized += requiredBytes; } catch (const std::exception& e) { - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(i - prevNumRows, obj); + assert(i >= rowDelta); + fallbackRows.emplace_back(std::make_tuple(i - rowDelta, obj)); } } else { - assert(i >= prevNumRows); - badParallelizeObjects.emplace_back(i - prevNumRows, obj); + assert(i >= rowDelta); + fallbackRows.emplace_back(i - rowDelta, obj); } } - assert(badParallelizeObjects.size() >= prevNumExceptions); - auto numNewExceptions = badParallelizeObjects.size() - prevNumExceptions; - numExceptionsInPartition.push_back(numNewExceptions); + auto serializedRows = serializeFallbackRows(fallbackRows); + fallbackRows.clear(); + partitionMergeInfo.push_back(PartitionGroup(1, partitions.size(), 0, 0, serializedRows.size(), fallbackPartitions.size())); + std::copy(serializedRows.begin(), serializedRows.end(), std::back_inserter(fallbackPartitions)); partition->unlockWrite(); partitions.push_back(partition); // create dataset from partitions. 
- return _context->fromPartitions(schema, partitions, columns, badParallelizeObjects, numExceptionsInPartition); + return _context->fromPartitions(schema, partitions, fallbackPartitions, partitionMergeInfo, columns); } PythonDataSet PythonContext::parallelize(py::list L, @@ -1290,7 +1302,55 @@ namespace tuplex { return co; } - // // running with another python version might lead to severe issues + std::vector<Partition*> PythonContext::serializeFallbackRows(const std::vector<std::tuple<size_t, PyObject*>>& fallbackRows) { + std::vector<Partition*> fallbackPartitions; + if (fallbackRows.empty()) { + return fallbackPartitions; + } + + auto driver = _context->getDriver(); + Schema schema(Schema::MemoryLayout::ROW, python::Type::makeTupleType({python::Type::STRING})); + auto partition = driver->allocWritablePartition(allocMinSize, schema, -1, _context->id()); + int64_t* rawPtr = (int64_t*)partition->lockWriteRaw(); + *rawPtr = 0; + uint8_t* ptr = (uint8_t*)(rawPtr + 1); + size_t numBytesSerialized = 0; + + for (const auto& row: fallbackRows) { + auto rowNum = std::get<0>(row); + auto pythonObject = std::get<1>(row); + auto ecCode = ecToI64(ExceptionCode::PYTHON_PARALLELIZE); + auto pickledObject = python::pickleObject(python::getMainModule(), pythonObject); + auto pickledObjectSize = pickledObject.size(); + size_t requiredBytes = sizeof(int64_t) * 4 + pickledObjectSize; + + if (partition->capacity() < numBytesSerialized + requiredBytes) { + partition->unlockWrite(); + fallbackPartitions.push_back(partition); + partition = driver->allocWritablePartition(allocMinSize, schema, -1, _context->id()); + rawPtr = (int64_t *) partition->lockWriteRaw(); + *rawPtr = 0; + ptr = (uint8_t*)(rawPtr + 1); + numBytesSerialized = 0; + } + + *((int64_t*)(ptr)) = rowNum; ptr += sizeof(int64_t); + *((int64_t*)(ptr)) = ecCode; ptr += sizeof(int64_t); + *((int64_t*)(ptr)) = -1; ptr += sizeof(int64_t); + *((int64_t*)(ptr)) = pickledObjectSize; ptr += sizeof(int64_t); + memcpy(ptr, pickledObject.c_str(), pickledObjectSize); ptr += pickledObjectSize; + + *rawPtr = *rawPtr + 1; + numBytesSerialized += requiredBytes; + } + + partition->unlockWrite(); + fallbackPartitions.push_back(partition); + + return fallbackPartitions; + } + + // // running with another python version might lead to severe issues // // hence, perform check at context startup! // bool checkPythonVersion() { // using namespace std; diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 36f9a392b..3f9ca82a2 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -721,7 +721,7 @@ namespace tuplex { void PythonDataSet::tocsv(const std::string &file_path, const std::string &lambda_code, const std::string &pickled_code, size_t fileCount, size_t shardSize, size_t limit, const std::string &null_value, - py::object header) { + py::object header, const bool commit) { // make sure a dataset is wrapped assert(this->_dataset); // ==> error handled below. @@ -757,6 +757,8 @@ namespace tuplex { outputOptions["header"] = "true"; } + outputOptions["commit"] = boolToString(commit); + // release GIL & hand over everything to Tuplex assert(PyGILState_Check()); // make sure this thread holds the GIL! 
python::unlockGIL(); @@ -909,8 +911,8 @@ namespace tuplex { // retrieve full partitions for speed Partition *partition = nullptr; size_t pos = 0; - while (rs->hasNextPartition() && pos < maxRowCount) { - partition = rs->getNextPartition(); + while (rs->hasNextNormalPartition() && pos < maxRowCount) { + partition = rs->getNextNormalPartition(); auto schema = partition->schema(); // single value? --> reset rowtype by one level auto type = schema.getRowType(); @@ -964,8 +966,8 @@ namespace tuplex { Partition *partition = nullptr; size_t pos = 0; - while (rs->hasNextPartition() && pos < maxRowCount) { - partition = rs->getNextPartition(); + while (rs->hasNextNormalPartition() && pos < maxRowCount) { + partition = rs->getNextNormalPartition(); // add memory towards list object auto ptr = partition->lockRaw(); @@ -1002,8 +1004,8 @@ namespace tuplex { Partition *partition = nullptr; size_t pos = 0; - while (rs->hasNextPartition() && pos < maxRowCount) { - partition = rs->getNextPartition(); + while (rs->hasNextNormalPartition() && pos < maxRowCount) { + partition = rs->getNextNormalPartition(); // add memory towards list object auto ptr = partition->lockRaw(); @@ -1042,8 +1044,8 @@ namespace tuplex { Partition *partition = nullptr; size_t pos = 0; - while (rs->hasNextPartition() && pos < maxRowCount) { - partition = rs->getNextPartition(); + while (rs->hasNextNormalPartition() && pos < maxRowCount) { + partition = rs->getNextNormalPartition(); // add memory towards list object auto ptr = partition->lockRaw(); @@ -1091,8 +1093,8 @@ namespace tuplex { Partition *partition = nullptr; size_t pos = 0; - while (rs->hasNextPartition() && pos < maxRowCount) { - partition = rs->getNextPartition(); + while (rs->hasNextNormalPartition() && pos < maxRowCount) { + partition = rs->getNextNormalPartition(); // add memory towards list object auto ptr = partition->lockRaw(); @@ -1147,8 +1149,8 @@ namespace tuplex { Partition *partition = nullptr; size_t pos = 0; - while (rs->hasNextPartition() && pos < maxRowCount) { - partition = rs->getNextPartition(); + while (rs->hasNextNormalPartition() && pos < maxRowCount) { + partition = rs->getNextNormalPartition(); // add memory towards list object auto ptr = partition->lockRaw(); @@ -1191,8 +1193,8 @@ namespace tuplex { Partition *partition = nullptr; size_t pos = 0; - while (rs->hasNextPartition() && pos < maxRowCount) { - partition = rs->getNextPartition(); + while (rs->hasNextNormalPartition() && pos < maxRowCount) { + partition = rs->getNextNormalPartition(); // add memory towards list object auto ptr = partition->lockRaw(); @@ -1251,8 +1253,8 @@ namespace tuplex { Partition* partition = nullptr; size_t pos = 0; - while(rs->hasNextPartition() && pos < maxRowCount) { - partition = rs->getNextPartition(); + while(rs->hasNextNormalPartition() && pos < maxRowCount) { + partition = rs->getNextNormalPartition(); // add memory towards list object auto ptr = partition->lockRaw(); @@ -1348,7 +1350,7 @@ namespace tuplex { // b.c. merging of arbitrary python objects is not implemented yet, whenever they're present, use general // version // @TODO: this could be optimized! 
- if(rs->pyobject_count() != 0) + if(rs->fallbackRowCount() != 0) return anyToCPythonWithPyObjects(rs, maxRowCount); auto type = rs->schema().getRowType(); diff --git a/tuplex/python/src/PythonWrappers.cc b/tuplex/python/src/PythonWrappers.cc index 8e35d5d4c..9968a14fa 100644 --- a/tuplex/python/src/PythonWrappers.cc +++ b/tuplex/python/src/PythonWrappers.cc @@ -172,8 +172,8 @@ namespace tuplex { Partition* partition = nullptr; size_t pos = 0; - while(rs->hasNextPartition()) { - partition = rs->getNextPartition(); + while(rs->hasNextNormalPartition()) { + partition = rs->getNextNormalPartition(); // add memory towards list object auto ptr = partition->lockRaw(); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index a2b8c0b33..76f555309 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -425,7 +425,7 @@ def leftJoin(self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffix return ds - def tocsv(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_generator=None, null_value=None, header=True): + def tocsv(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_generator=None, null_value=None, header=True, commit=True): """ save dataset to one or more csv files. Triggers execution of pipeline. Args: path: path where to save files to @@ -458,7 +458,7 @@ def tocsv(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_gen if null_value is None: null_value = '' - self._dataSet.tocsv(path, code, code_pickled, num_parts, part_size, num_rows, null_value, header) + self._dataSet.tocsv(path, code, code_pickled, num_parts, part_size, num_rows, null_value, header, commit) def toorc(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_generator=None): """ save dataset to one or more orc files. Triggers execution of pipeline. 
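Note: the hunks above repeatedly stride over serialized general and fallback rows with `4 * sizeof(int64_t) + ((int64_t*)ptr)[3]`. For orientation, here is a minimal sketch of the per-row record layout that `serializeFallbackRows` writes and `getNextFallbackRow` consumes; the struct and field names are illustrative assumptions, not identifiers from this patch:

```
// Sketch of the record layout implied by serializeFallbackRows() and
// getNextFallbackRow(). Names are assumptions for illustration only.
#include <cstdint>

struct FallbackRecordView {
    int64_t rowNumber;      // running row index, used for in-order merging
    int64_t exceptionCode;  // e.g. ecToI64(ExceptionCode::PYTHON_PARALLELIZE)
    int64_t operatorId;     // -1 when the row did not fail inside an operator
    int64_t payloadSize;    // byte size of the pickled Python object that follows
    const uint8_t* payload; // pickled object bytes
};

// Read the record at ptr; returns a pointer just past it (the stride the
// patch computes as 4 * sizeof(int64_t) + ((int64_t*)ptr)[3]).
inline const uint8_t* readFallbackRecord(const uint8_t* ptr, FallbackRecordView& out) {
    auto header = reinterpret_cast<const int64_t*>(ptr);
    out.rowNumber     = header[0];
    out.exceptionCode = header[1];
    out.operatorId    = header[2];
    out.payloadSize   = header[3];
    out.payload       = ptr + 4 * sizeof(int64_t);
    return out.payload + out.payloadSize;
}
```

Each partition buffer additionally begins with a single int64 row count, which is why the offset-update loops above start with `curPtr += sizeof(int64_t) + byteOffset`.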
diff --git a/tuplex/test/core/DataSetShow.cc b/tuplex/test/core/DataSetShow.cc index cf50705b8..4ec70c4e6 100644 --- a/tuplex/test/core/DataSetShow.cc +++ b/tuplex/test/core/DataSetShow.cc @@ -14,7 +14,7 @@ #include #include "TestUtils.h" -class DataSetTest : public TuplexTest {}; +class DataSetTest : public PyTest {}; TEST_F(DataSetTest, DataSetShow) { using namespace tuplex; diff --git a/tuplex/test/core/ExceptionsTest.cc b/tuplex/test/core/ExceptionsTest.cc new file mode 100644 index 000000000..ef9fae67e --- /dev/null +++ b/tuplex/test/core/ExceptionsTest.cc @@ -0,0 +1,232 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include "gtest/gtest.h" +#include +#include "TestUtils.h" + +class ExceptionsTest : public PyTest {}; + +TEST_F(ExceptionsTest, Basic) { + using namespace tuplex; + + auto opts = microTestOptions(); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "true"); + Context c(opts); + + std::vector inputRows({Row(1), Row(2), Row(0), Row(4), Row(5)}); + auto res = c.parallelize(inputRows).map(UDF("lambda x: 1 // x if x == 0 else x")).resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: -1")).collectAsVector(); + std::vector expectedOutput({Row(1), Row(2), Row(-1), Row(4), Row(5)}); + ASSERT_EQ(res.size(), expectedOutput.size()); + for (int i = 0; i < expectedOutput.size(); ++i) + EXPECT_EQ(res[i].toPythonString(), expectedOutput[i].toPythonString()); +} + +TEST_F(ExceptionsTest, Debug) { + using namespace tuplex; + + auto opts = microTestOptions(); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "true"); + opts.set("tuplex.partitionSize", "40B"); + Context c(opts); + + std::vector inputData({ + Row(1), Row(2), Row(0), Row(4), + Row(5), Row(6), Row(0), Row(8), + }); + + auto res = c.parallelize(inputData).map(UDF("lambda x: 1 // x if x == 0 else x")).resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: -1")).collectAsVector(); + std::vector expectedRes({Row(1), Row(2), Row(-1), Row(4), Row(5), Row(6), Row(-1), Row(8)}); + ASSERT_EQ(res.size(), expectedRes.size()); + for (int i = 0; i < expectedRes.size(); ++i) { + EXPECT_EQ(res[i].toPythonString(), expectedRes[i].toPythonString()); + } +} + +typedef bool (*filter_t)(int64_t); + +void processPartition(filter_t filter, int64_t* normalPartition, int64_t* totalFilterCounter, int64_t* totalNormalRowCounter, int64_t* totalGeneralRowCounter, int64_t* totalFallbackRowCounter, + uint8_t** generalPartitions, int64_t numGeneralPartitons, int64_t* generalIndexOffset, int64_t* generalRowOffset, int64_t* generalByteOffset, + uint8_t** fallbackPartitions, int64_t numFallbackPartitons, int64_t* fallbackIndexOffset, int64_t *fallbackRowOffset, int64_t* fallbackByteOffset) { + uint8_t *curGeneralPtr; + int64_t curGeneralNumRows = 0; + if (*generalIndexOffset < numGeneralPartitons) { + curGeneralPtr = generalPartitions[*generalIndexOffset]; + curGeneralNumRows = *((int64_t*)curGeneralPtr); + curGeneralPtr += sizeof(int64_t) + *generalByteOffset; + } + + uint8_t *curFallbackPtr; + int64_t curFallbackNumRows = 0; + if (*fallbackIndexOffset < numFallbackPartitons) { + curFallbackPtr = fallbackPartitions[*fallbackIndexOffset]; + curFallbackNumRows = 
*((int64_t*)curFallbackPtr); + curFallbackPtr += sizeof(int64_t) + *fallbackByteOffset; + } + + int64_t numNormalRows = normalPartition[0]; + for (int normalRowCountVar = 1; normalRowCountVar < numNormalRows + 1; ++normalRowCountVar) { + int64_t curNormalRow = normalPartition[normalRowCountVar]; + if (filter(curNormalRow)) { + int64_t curNormalRowInd = normalRowCountVar + *totalNormalRowCounter; + + while (*generalRowOffset < curGeneralNumRows && *((int64_t*)curGeneralPtr) < curNormalRowInd + *totalGeneralRowCounter) { + // record size must be read before advancing the pointer + int64_t generalRecordSize = 4 * sizeof(int64_t) + ((int64_t*)curGeneralPtr)[3]; + *((int64_t*)curGeneralPtr) -= *totalFilterCounter; + curGeneralPtr += generalRecordSize; + *generalByteOffset += generalRecordSize; + *generalRowOffset += 1; + *totalGeneralRowCounter += 1; + + if (*generalRowOffset == curGeneralNumRows && *generalIndexOffset < numGeneralPartitons - 1) { + *generalIndexOffset += 1; + *generalRowOffset = 0; + *generalByteOffset = 0; + curGeneralPtr = generalPartitions[*generalIndexOffset]; + curGeneralNumRows = *((int64_t*)curGeneralPtr); + curGeneralPtr += sizeof(int64_t); + } + } + + while (*fallbackRowOffset < curFallbackNumRows && *((int64_t*)curFallbackPtr) < curNormalRowInd + *totalGeneralRowCounter + *totalFallbackRowCounter) { + // record size must be read before advancing the pointer + int64_t fallbackRecordSize = 4 * sizeof(int64_t) + ((int64_t*)curFallbackPtr)[3]; + *((int64_t*)curFallbackPtr) -= *totalFilterCounter; + curFallbackPtr += fallbackRecordSize; + *fallbackByteOffset += fallbackRecordSize; + *fallbackRowOffset += 1; + *totalFallbackRowCounter += 1; + + if (*fallbackRowOffset == curFallbackNumRows && *fallbackIndexOffset < numFallbackPartitons - 1) { + *fallbackIndexOffset += 1; + *fallbackRowOffset = 0; + *fallbackByteOffset = 0; + curFallbackPtr = fallbackPartitions[*fallbackIndexOffset]; + curFallbackNumRows = *((int64_t*)curFallbackPtr); + curFallbackPtr += sizeof(int64_t); + } + } + + *totalFilterCounter += 1; + } + } + *totalNormalRowCounter += numNormalRows; +} + +void processPartitions(filter_t filter, int64_t** normalPartitions, int64_t numNormalPartitions, uint8_t** generalPartitions, int64_t numGeneralPartitions, uint8_t** fallbackPartitions, int64_t numFallbackPartitions) { + int64_t totalNormalRowCounter = 0; + int64_t totalGeneralRowCounter = 0; + int64_t totalFallbackRowCounter = 0; + int64_t totalFilterCounter = 0; + + int64_t generalIndexOffset = 0; + int64_t generalByteOffset = 0; + int64_t generalRowOffset = 0; + int64_t fallbackIndexOffset = 0; + int64_t fallbackByteOffset = 0; + int64_t fallbackRowOffset = 0; + for (int i = 0; i < numNormalPartitions; ++i) { + processPartition(filter, normalPartitions[i], + &totalFilterCounter, &totalNormalRowCounter, &totalGeneralRowCounter, &totalFallbackRowCounter, + generalPartitions, numGeneralPartitions, &generalIndexOffset, &generalRowOffset, &generalByteOffset, + fallbackPartitions, numFallbackPartitions, &fallbackIndexOffset, &fallbackRowOffset, &fallbackByteOffset); + } + + if (generalIndexOffset < numGeneralPartitions) { + auto curGeneralPtr = generalPartitions[generalIndexOffset]; + auto numRowsInPartition = *((int64_t*)curGeneralPtr); + curGeneralPtr += sizeof(int64_t) + generalByteOffset; + while (generalRowOffset < numRowsInPartition) { + *((int64_t*)curGeneralPtr) -= totalFilterCounter; + curGeneralPtr += 4 * sizeof(int64_t) + ((int64_t*)curGeneralPtr)[3]; + generalRowOffset += 1; + + if (generalRowOffset == numRowsInPartition && generalIndexOffset < numGeneralPartitions - 1) { + generalIndexOffset += 1; + curGeneralPtr = generalPartitions[generalIndexOffset]; + numRowsInPartition = 
*((int64_t*)curGeneralPtr); + curGeneralPtr += sizeof(int64_t); + generalByteOffset = 0; + generalRowOffset = 0; + } + } + } + + if (fallbackIndexOffset < numFallbackPartitions) { + auto curFallbackPtr = fallbackPartitions[fallbackIndexOffset]; + auto numRowsInPartition = *((int64_t*)curFallbackPtr); + curFallbackPtr += sizeof(int64_t) + fallbackByteOffset; + while (fallbackRowOffset < numRowsInPartition) { + *((int64_t*)curFallbackPtr) -= totalFilterCounter; + curFallbackPtr += 4 * sizeof(int64_t) + ((int64_t*)curFallbackPtr)[3]; + fallbackRowOffset += 1; + + if (fallbackRowOffset == numRowsInPartition && fallbackIndexOffset < numFallbackPartitions - 1) { + fallbackIndexOffset += 1; + curFallbackPtr = fallbackPartitions[fallbackIndexOffset]; + numRowsInPartition = *((int64_t*)curFallbackPtr); + curFallbackPtr += sizeof(int64_t); + fallbackByteOffset = 0; + fallbackRowOffset = 0; + } + } + } +} + +bool filter2(int64_t row) { + return row % 3 == 0; +} + +TEST_F(ExceptionsTest, Algo) { + int64_t n1[] = {15, 1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19}; + int64_t n2[] = {3, 21, 22, 23}; + int64_t *normalPartitions[] = {n1, n2}; + int64_t numNormalPartitions = 2; + + int64_t g1[] = {4, + 0, -1, -1, 8, -1, + 4, -1, -1, 8, -1, + 8, -1, -1, 8, -1, + 12, -1, -1, 8, -1}; + int64_t g2[] = {3, + 16, -1, -1, 8, -1, + 20, -1, -1, 8, -1, + 24, -1, -1, 8, -1}; + uint8_t *generalPartitions[] = {(uint8_t*)g1, (uint8_t*)g2}; + int64_t numGeneralPartitions = 2; + + uint8_t *fallbackPartitions[] = {}; + int64_t numFallbackPartitions = 0; + + processPartitions(filter2, normalPartitions, numNormalPartitions, generalPartitions, numGeneralPartitions, fallbackPartitions, numFallbackPartitions); + + + std::cout << "Done"; +} + + +bool filter1(int64_t row) { + return true; +} + +TEST_F(ExceptionsTest, ProcessDebug) { + int64_t n1[] = {2, 1, 2}; + int64_t *normalPartitions[] = {n1}; + int64_t numNormalPartitions = 1; + + int64_t g1[] = {3, + 1, -1, -1, 8, -1, + 2, -1, -1, 8, -1, + 3, -1, -1, 8, -1}; + uint8_t *generalPartitions[] = {(uint8_t*)g1}; + int64_t numGeneralPartitions = 1; + + uint8_t *fallbackPartitions[] = {}; + int64_t numFallbackPartitions = 0; + + processPartitions(filter1, normalPartitions, numNormalPartitions, generalPartitions, numGeneralPartitions, fallbackPartitions, numFallbackPartitions); +} \ No newline at end of file diff --git a/tuplex/test/core/IncrementalTest.cc b/tuplex/test/core/IncrementalTest.cc new file mode 100644 index 000000000..dd9a92656 --- /dev/null +++ b/tuplex/test/core/IncrementalTest.cc @@ -0,0 +1,566 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Benjamin Givertz first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include +#include +#include "TestUtils.h" + +class IncrementalTest : public PyTest { +protected: + + void SetUp() override { + PyTest::SetUp(); + + using namespace tuplex; + auto vfs = VirtualFileSystem::fromURI("."); + vfs.remove(testName); + auto err = vfs.create_dir(testName); + ASSERT_TRUE(err == VirtualFileSystemStatus::VFS_OK); + } + + void TearDown() override { + PyTest::TearDown(); + + using namespace tuplex; + auto vfs = VirtualFileSystem::fromURI("."); + vfs.remove(testName); + } +}; + +TEST_F(IncrementalTest, CommitMode) { + using 
namespace tuplex; + using namespace std; + + auto opts = microTestOptions(); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "true"); + opts.set("tuplex.optimizer.incrementalResolution", "true"); + Context c(opts); + + auto outputURI = URI(testName + "/" + testName + ".csv"); + + auto csvops = defaultCSVOutputOptions(); + csvops["commit"] = boolToString(false); + + c.parallelize({Row(1), Row(-1), Row(2), Row(-2), Row(3), Row(-3)}) + .map(UDF("lambda x: 1 // (x - x) if x == -1 else x")) + .map(UDF("lambda x: 1 // (x - x) if x == -2 else x")) + .map(UDF("lambda x: 1 // (x - x) if x == -3 else x")) + .tocsv(outputURI, csvops); + + c.parallelize({Row(1), Row(-1), Row(2), Row(-2), Row(3), Row(-3)}) + .map(UDF("lambda x: 1 // (x - x) if x == -1 else x")) + .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: x")) + .map(UDF("lambda x: 1 // (x - x) if x == -2 else x")) + .map(UDF("lambda x: 1 // (x - x) if x == -3 else x")) + .tocsv(outputURI, csvops); + + c.parallelize({Row(1), Row(-1), Row(2), Row(-2), Row(3), Row(-3)}) + .map(UDF("lambda x: 1 // (x - x) if x == -1 else x")) + .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: x")) + .map(UDF("lambda x: 1 // (x - x) if x == -2 else x")) + .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: x")) + .map(UDF("lambda x: 1 // (x - x) if x == -3 else x")) + .tocsv(outputURI, csvops); + + csvops["commit"] = boolToString(true); + + c.parallelize({Row(1), Row(-1), Row(2), Row(-2), Row(3), Row(-3)}) + .map(UDF("lambda x: 1 // (x - x) if x == -1 else x")) + .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: x")) + .map(UDF("lambda x: 1 // (x - x) if x == -2 else x")) + .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: x")) + .map(UDF("lambda x: 1 // (x - x) if x == -3 else x")) + .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: x")) + .tocsv(outputURI, csvops); +} + +void testIncrementalNoMerge(tuplex::ContextOptions opts, tuplex::URI fileURI, size_t numRows, float general, float fallback, float exception) { + using namespace tuplex; + using namespace std; + + opts.set("tuplex.executorCount", "4"); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "false"); + opts.set("tuplex.optimizer.incrementalResolution", "true"); + opts.set("tuplex.optimizer.nullValueOptimization", "true"); + opts.set("tuplex.normalcaseThreshold", "0.6"); + opts.set("tuplex.resolveWithInterpreterOnly", "true"); + opts.set("tuplex.useLLVMOptimizer", "true"); + Context c(opts); + + vector inputRows; + vector inputRowInds; + inputRows.reserve(numRows); + inputRowInds.reserve(numRows); + for (int i = 0; i < numRows; ++i) { + inputRows.push_back(i + 1); + inputRowInds.push_back(i); + } + + std::random_shuffle(inputRowInds.begin(), inputRowInds.end()); + int counter = 0; + for (int i = 0; i < (int) (general * numRows); ++i) { + inputRows[inputRowInds[counter]] = -1; + counter++; + } + for (int i = 0; i < (int) (fallback * numRows); ++i) { + inputRows[inputRowInds[counter]] = -2; + counter++; + } + for (int i = 0; i < (int) (exception * numRows); ++i) { + inputRows[inputRowInds[counter]] = -3; + counter++; + } + + stringstream ss; + for (int i = 0; i < numRows; ++i) { + ss << "1,"; + if (inputRows[i] != -1) { + ss << to_string(inputRows[i]); + } + ss << "\n"; + } + stringToFile(fileURI.toPath(), ss.str()); + + auto udf = "def udf(x, y):\n" + " if y == -2:\n" + " return y ** 0.5\n" + " elif y == -1:\n" + " raise ValueError\n" + " else:\n" + " return float(y)"; + + auto &ds_cached = c.csv(fileURI.toPath()).cache().map(UDF(udf)).cache(); +} + 
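+// NoMergeFallback drives testIncrementalNoMerge over 100 rows with 25%
+// general-, 25% fallback- and 25% exception-row fractions. The helper
+// rewrites those fractions of the shuffled input to the sentinels -1, -2
+// and -3; -1 rows are emitted as an empty CSV field (so they presumably
+// land on the general case once null-value optimization kicks in), and -2
+// rows make the UDF return y ** 0.5, i.e. a non-float result for negative y.
+// Note the helper currently only builds and caches the dataset (ds_cached
+// is unused); it performs no assertions yet.
+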
+TEST_F(IncrementalTest, NoMergeFallback) { + using namespace tuplex; + testIncrementalNoMerge(microTestOptions(), URI(testName + ".csv"), 100, 0.25, 0.25, 0.25); +} + +void executeZillow(tuplex::Context &context, const tuplex::URI& outputURI, int step, bool commit) { + using namespace tuplex; + + auto extractBd = "def extractBd(x):\n" + " val = x['facts and features']\n" + " max_idx = val.find(' bd')\n" + " if max_idx < 0:\n" + " max_idx = len(val)\n" + " s = val[:max_idx]\n" + "\n" + " # find comma before\n" + " split_idx = s.rfind(',')\n" + " if split_idx < 0:\n" + " split_idx = 0\n" + " else:\n" + " split_idx += 2\n" + " r = s[split_idx:]\n" + " return int(r)"; + + auto extractType = "def extractType(x):\n" + " t = x['title'].lower()\n" + " type = 'unknown'\n" + " if 'condo' in t or 'apartment' in t:\n" + " type = 'condo'\n" + " if 'house' in t:\n" + " type = 'house'\n" + " return type"; + + auto extractBa = "def extractBa(x):\n" + " val = x['facts and features']\n" + " max_idx = val.find(' ba')\n" + " if max_idx < 0:\n" + " max_idx = len(val)\n" + " s = val[:max_idx]\n" + "\n" + " # find comma before\n" + " split_idx = s.rfind(',')\n" + " if split_idx < 0:\n" + " split_idx = 0\n" + " else:\n" + " split_idx += 2\n" + " r = s[split_idx:]\n" + " return int(r)"; + + auto extractSqft = "def extractSqft(x):\n" + " val = x['facts and features']\n" + " max_idx = val.find(' sqft')\n" + " if max_idx < 0:\n" + " max_idx = len(val)\n" + " s = val[:max_idx]\n" + "\n" + " split_idx = s.rfind('ba ,')\n" + " if split_idx < 0:\n" + " split_idx = 0\n" + " else:\n" + " split_idx += 5\n" + " r = s[split_idx:]\n" + " r = r.replace(',', '')\n" + " return int(r)"; + + auto extractPrice = "def extractPrice(x):\n" + " price = x['price']\n" + " p = 0\n" + " if x['offer'] == 'sold':\n" + " # price is to be calculated using price/sqft * sqft\n" + " val = x['facts and features']\n" + " s = val[val.find('Price/sqft:') + len('Price/sqft:') + 1:]\n" + " r = s[s.find('$')+1:s.find(', ') - 1]\n" + " price_per_sqft = int(r)\n" + " p = price_per_sqft * x['sqft']\n" + " elif x['offer'] == 'rent':\n" + " max_idx = price.rfind('/')\n" + " p = int(price[1:max_idx].replace(',', ''))\n" + " else:\n" + " # take price from price column\n" + " p = int(price[1:].replace(',', ''))\n" + "\n" + " return p"; + auto extractOffer = "def extractOffer(x):\n" + " offer = x['title'].lower()\n" + "\n" + " if 'sale' in offer:\n" + " offer = 'sale'\n" + " elif 'rent' in offer:\n" + " offer = 'rent'\n" + " elif 'sold' in offer:\n" + " offer = 'sold'\n" + " elif 'foreclos' in offer.lower():\n" + " offer = 'foreclosed'\n" + " else:\n" + " offer = 'unknown'\n" + "\n" + " return offer"; + + auto resolveBd = "def resolveBd(x):\n" + " if 'Studio' in x['facts and features']:\n" + " return 1\n" + " raise ValueError\n"; + + auto csvops = defaultCSVOutputOptions(); + csvops["commit"] = boolToString(commit); + std::vector columnNames({"url", "zipcode", "address", "city", "state", "bedrooms", "bathrooms", "sqft", "offer", "type", "price"}); + + auto &ds = context.csv("../../../../benchmarks/incremental/data/zillow_dirty.csv"); + ds = ds.withColumn("bedrooms", UDF(extractBd)); + if (step > 0) + ds = ds.resolve(ExceptionCode::VALUEERROR, UDF(resolveBd)); + if (step > 1) + ds = ds.ignore(ExceptionCode::VALUEERROR); + ds = ds.filter(UDF("lambda x: x ['bedrooms'] < 10")); + ds = ds.withColumn("type", UDF(extractType)); + ds = ds.filter(UDF("lambda x: x['type'] == 'condo'")); + ds = ds.withColumn("zipcode", UDF("lambda x: '%05d' % int(x['postal_code'])")); + if 
(step > 2)
+        ds = ds.ignore(ExceptionCode::TYPEERROR);
+    ds = ds.mapColumn("city", UDF("lambda x: x[0].upper() + x[1:].lower()"));
+    ds = ds.withColumn("bathrooms", UDF(extractBa));
+    if (step > 3)
+        ds = ds.ignore(ExceptionCode::VALUEERROR);
+    ds = ds.withColumn("sqft", UDF(extractSqft));
+    if (step > 4)
+        ds = ds.ignore(ExceptionCode::VALUEERROR);
+    ds = ds.withColumn("offer", UDF(extractOffer));
+    ds = ds.withColumn("price", UDF(extractPrice));
+    if (step > 5)
+        ds = ds.resolve(ExceptionCode::VALUEERROR, UDF("lambda x: int(100020)"));
+    ds = ds.filter(UDF("lambda x: 100000 < x['price'] < 2e7 and x['offer'] == 'sale'"));
+    ds = ds.selectColumns(columnNames);
+    ds.tocsv(outputURI, csvops);
+}
+
+TEST_F(IncrementalTest, DirtyZillow) {
+    using namespace tuplex;
+    using namespace std;
+
+    auto opts = testOptions();
+    opts.set("tuplex.executorCount", "0");
+    opts.set("tuplex.executorMemory", "2G");
+    opts.set("tuplex.driverMemory", "2G");
+    opts.set("tuplex.partitionSize", "32MB");
+    opts.set("tuplex.resolveWithInterpreterOnly", "false");
+    opts.set("tuplex.optimizer.incrementalResolution", "true");
+    opts.set("tuplex.optimizer.mergeExceptionsInOrder", "false");
+    Context incrementalContext(opts);
+    opts.set("tuplex.optimizer.incrementalResolution", "false");
+    Context plainContext(opts);
+
+    for (int step = 0; step < 7; ++step) {
+        executeZillow(incrementalContext, testName + "/incremental.csv", step, true);
+//        executeZillow(plainContext, testName + "/plain.csv", step);
+    }
+
+//    std::vector incrementalRows;
+//    auto incrementalResult = plainContext.csv(testName + "/incremental.*.csv").collect();
+//    while (incrementalResult->hasNextRow())
+//        incrementalRows.push_back(incrementalResult->getNextRow().toPythonString());
+//
+//    std::vector plainRows;
+//    auto plainResult = plainContext.csv(testName + "/plain.*.csv").collect();
+//    while (plainResult->hasNextRow())
+//        plainRows.push_back(plainResult->getNextRow().toPythonString());
+//
+//    ASSERT_EQ(incrementalRows.size(), plainRows.size());
+//    for (int i = 0; i < plainRows.size(); ++i)
+//        ASSERT_EQ(incrementalRows[i], plainRows[i]);
+}
+
+TEST_F(IncrementalTest, FileOutput) {
+    using namespace tuplex;
+    using namespace std;
+
+    auto opts = microTestOptions();
+    opts.set("tuplex.executorCount", "0");
+    opts.set("tuplex.optimizer.incrementalResolution", "true");
+    opts.set("tuplex.optimizer.mergeExceptionsInOrder", "false");
+    Context c(opts);
+
+    auto numRows = 50;
+    auto amountExps = 0.25;
+    std::vector inputRows;
+    inputRows.reserve(numRows);
+    std::unordered_multiset expectedOutput1;
+    expectedOutput1.reserve((int) (numRows * amountExps));
+    std::unordered_multiset expectedOutput2;
+    expectedOutput2.reserve(numRows);
+
+    auto inputFileURI = URI(testName + "/in.csv");
+    auto fileURI = URI(testName + "/out.csv");
+    auto outputFileURI = URI(testName + "/out.*.csv");
+
+    std::stringstream ss;
+    for (int i = 0; i < numRows; ++i) {
+        if (i % (int) (1 / amountExps) == 0) {
+            ss << "0\n";
+            expectedOutput2.insert(Row(-1).toPythonString());
+        } else {
+            ss << to_string(i) << "\n";
+            expectedOutput1.insert(Row(i).toPythonString());
+            expectedOutput2.insert(Row(i).toPythonString());
+        }
+    }
+    stringToFile(inputFileURI, ss.str());
+
+    c.csv(inputFileURI.toPath()).map(UDF("lambda x: 1 // x if x == 0 else x")).tocsv(fileURI.toPath());
+    auto output1 = c.csv(outputFileURI.toPath()).collectAsVector();
+    ASSERT_EQ(output1.size(), expectedOutput1.size());
+    for (const auto &row : output1) {
+        ASSERT_TRUE(expectedOutput1.find(row.toPythonString()) !=
expectedOutput1.end()); + } + + c.csv(inputFileURI.toPath()).map(UDF("lambda x: 1 // x if x == 0 else x")).resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: -1")).tocsv(fileURI.toPath()); + auto output2 = c.csv(outputFileURI.toPath()).collectAsVector(); + ASSERT_EQ(output2.size(), expectedOutput2.size()); + for (const auto &row : output2) { + ASSERT_TRUE(expectedOutput2.find(row.toPythonString()) != expectedOutput2.end()); + } +} + +TEST_F(IncrementalTest, FileOutputInOrder) { + using namespace tuplex; + using namespace std; + + auto opts = microTestOptions(); + opts.set("tuplex.executorCount", "0"); + opts.set("tuplex.optimizer.incrementalResolution", "true"); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "true"); + Context c(opts); + + auto numRows = 50; + auto amountExps = 0.25; + std::vector inputRows; + inputRows.reserve(numRows); + std::vector expectedOutput1; + expectedOutput1.reserve((int) (numRows * amountExps)); + std::vector expectedOutput2; + expectedOutput2.reserve(numRows); + + auto inputFileURI = URI(testName + "/in.csv"); + auto fileURI = URI(testName + "/out.csv"); + auto outputFileURI = URI(testName + "/out.*.csv"); + + std::stringstream ss; + for (int i = 0; i < numRows; ++i) { + if (i % (int) (1 / amountExps) == 0) { + ss << "0\n"; + expectedOutput2.push_back(Row(-1).toPythonString()); + } else { + ss << to_string(i) << "\n"; + expectedOutput1.push_back(Row(i).toPythonString()); + expectedOutput2.push_back(Row(i).toPythonString()); + } + } + stringToFile(inputFileURI, ss.str()); + + c.csv(inputFileURI.toPath()).map(UDF("lambda x: 1 // x if x == 0 else x")).tocsv(fileURI.toPath()); + auto output1 = c.csv(outputFileURI.toPath()).collectAsVector(); + ASSERT_EQ(output1.size(), expectedOutput1.size()); + for (int i = 0; i < expectedOutput1.size(); ++i) { + ASSERT_EQ(expectedOutput1[i], output1[i].toPythonString()); + } + + c.csv(inputFileURI.toPath()).map(UDF("lambda x: 1 // x if x == 0 else x")).resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: -1")).tocsv(fileURI.toPath()); + auto output2 = c.csv(outputFileURI.toPath()).collectAsVector(); + ASSERT_EQ(output2.size(), expectedOutput2.size()); + for (int i = 0; i < expectedOutput2.size(); ++i) { + ASSERT_EQ(expectedOutput2[i], output2[i].toPythonString()); + } +} + +TEST_F(IncrementalTest, DebugResolver) { + using namespace tuplex; + using namespace std; + + auto opts = microTestOptions(); + opts.set("tuplex.optimizer.incrementalResolution", "false"); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "false"); + Context c(opts); + +// c.parallelize({Row(1), Row(0), Row(3)}) +// .map(UDF("lambda x: 1 // x if x == 0 else x")) +// .tocsv(testName + "/out.csv"); +// + c.parallelize({Row(1), Row(0), Row(3)}) + .map(UDF("lambda x: 1 // x if x == 0 else x")) + .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: 1 // x")) + .tocsv(testName + "/out.csv"); + + c.parallelize({Row(1), Row(0), Row(3)}) + .map(UDF("lambda x: 1 // x if x == 0 else x")) + .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: 1 // x")) + .ignore(ExceptionCode::ZERODIVISIONERROR) + .tocsv(testName + "/out.csv"); +} + +TEST_F(IncrementalTest, Filter) { + using namespace tuplex; + using namespace std; + + auto opts = microTestOptions(); + opts.set("tuplex.executorCount", "2"); + opts.set("tuplex.optimizer.incrementalResolution", "true"); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "true"); + opts.set("tuplex.resolveWithInterpreterOnly", "false"); + Context c(opts); + + auto inputFileURI = URI(testName + "/in.csv"); + 
auto fileURI = URI(testName + "/out.csv");
+    auto outputFileURI = URI(testName + "/out.*.csv");
+
+    std::vector expectedOutput1;
+    std::vector expectedOutput2;
+    std::stringstream ss;
+    // start at 1: a 0-valued row produced at i == 0 would be dropped by the filter below
+    for (int i = 1; i < 100000; ++i) {
+        auto num = rand()%4;
+        switch (num) {
+            case 0: {
+                ss << to_string(i) << "\n";
+                expectedOutput1.push_back(Row(i));
+                expectedOutput2.push_back(Row(i));
+                break;
+            }
+            case 1: {
+                ss << "-1\n";
+                break;
+            }
+            case 2: {
+                ss << "-2\n";
+                expectedOutput2.push_back(Row(-2));
+                break;
+            }
+            case 3: {
+                ss << "0\n";
+                break;
+            }
+        }
+    }
+    stringToFile(inputFileURI, ss.str());
+
+    c.csv(inputFileURI.toPath()).map(UDF("lambda x: 1 // (x - x) if x < 0 else x")).filter(UDF("lambda x: x != 0")).tocsv(fileURI.toPath());
+    auto output1 = c.csv(outputFileURI.toPath()).collectAsVector();
+    ASSERT_EQ(output1.size(), expectedOutput1.size());
+    for (int i = 0; i < expectedOutput1.size(); ++i) {
+        ASSERT_EQ(expectedOutput1[i].toPythonString(), output1[i].toPythonString());
+    }
+
+    c.csv(inputFileURI.toPath()).map(UDF("lambda x: 1 // (x - x) if x < 0 else x")).resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: 1 // (x - x) if x == -1 else x")).filter(UDF("lambda x: x != 0")).tocsv(fileURI.toPath());
+    auto output2 = c.csv(outputFileURI.toPath()).collectAsVector();
+    ASSERT_EQ(output2.size(), expectedOutput2.size());
+    for (int i = 0; i < expectedOutput2.size(); ++i) {
+        ASSERT_EQ(expectedOutput2[i].toPythonString(), output2[i].toPythonString());
+    }
+}
+
+TEST_F(IncrementalTest, FileOutput2) {
+    using namespace tuplex;
+    using namespace std;
+
+    auto opts = microTestOptions();
+    opts.set("tuplex.resolveWithInterpreterOnly", "false");
+    opts.set("tuplex.optimizer.incrementalResolution", "true");
+    opts.set("tuplex.optimizer.mergeExceptionsInOrder", "true");
+    Context c(opts);
+
+    auto numRows = 10000;
+    auto amountExps = 0.25;
+    std::vector inputRows;
+    inputRows.reserve(numRows);
+    std::unordered_multiset expectedOutput1;
+    expectedOutput1.reserve((int) (numRows * amountExps));
+    std::unordered_multiset expectedOutput2;
+    expectedOutput2.reserve(numRows);
+
+    auto inputFileURI = URI(testName + "/in.csv");
+    auto fileURI = URI(testName + "/out.csv");
+    auto outputFileURI = URI(testName + "/out.*.csv");
+
+    std::stringstream ss;
+    for (int i = 0; i < numRows; ++i) {
+        if (i % (int) (1 / amountExps) == 0) {
+            ss << "0\n";
+            expectedOutput2.insert(Row(-1).toPythonString());
+        } else {
+            ss << to_string(i) << "\n";
+            expectedOutput1.insert(Row(i).toPythonString());
+            expectedOutput2.insert(Row(i).toPythonString());
+        }
+    }
+    stringToFile(inputFileURI, ss.str());
+
+    c.csv(inputFileURI.toPath()).map(UDF("lambda x: 1 // x if x == 0 else x"))
+            .map(UDF("lambda x: 1 // x if x == 0 else x"))
+            .tocsv(fileURI.toPath());
+    auto output1 = c.csv(outputFileURI.toPath()).collectAsVector();
+    ASSERT_EQ(output1.size(), expectedOutput1.size());
+    for (const auto &row : output1) {
+        ASSERT_TRUE(expectedOutput1.find(row.toPythonString()) != expectedOutput1.end());
+    }
+
+    c.csv(inputFileURI.toPath()).map(UDF("lambda x: 1 // x if x == 0 else x"))
+            .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: 0"))
+            .map(UDF("lambda x: 1 // x if x == 0 else x"))
+            .tocsv(fileURI.toPath());
+    auto output2 = c.csv(outputFileURI.toPath()).collectAsVector();
+    ASSERT_EQ(output2.size(), expectedOutput1.size());
+    for (const auto &row : output2) {
+        ASSERT_TRUE(expectedOutput1.find(row.toPythonString()) != expectedOutput1.end());
+    }
+
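+    // Third pipeline: same resolver as above plus an ignore for rows that
+    // still raise. With tuplex.optimizer.incrementalResolution enabled this
+    // run should presumably only re-process the exceptions left over from
+    // the previous run rather than the whole input.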
+    c.csv(inputFileURI.toPath())
+            .map(UDF("lambda x: 1 // x if x == 0 else x"))
+            .resolve(ExceptionCode::ZERODIVISIONERROR, UDF("lambda x: 0"))
+            .map(UDF("lambda x: 1 // x if x == 0 else x"))
+            .ignore(ExceptionCode::ZERODIVISIONERROR)
+            .tocsv(fileURI.toPath());
+    auto output3 = c.csv(outputFileURI.toPath()).collectAsVector();
+    ASSERT_EQ(output3.size(), expectedOutput1.size());
+    for (const auto &row : output3) {
+        ASSERT_TRUE(expectedOutput1.find(row.toPythonString()) != expectedOutput1.end());
+    }
+}
\ No newline at end of file
diff --git a/tuplex/test/core/ResultSetTest.cc b/tuplex/test/core/ResultSetTest.cc
index 4acd38921..cffbc29a9 100644
--- a/tuplex/test/core/ResultSetTest.cc
+++ b/tuplex/test/core/ResultSetTest.cc
@@ -51,6 +51,57 @@ class ResultSetTest : public PyTest {
         return pw.getOutputPartitions();
     }
 
+    std::vector<Partition*> pyObjectsToPartitions(const std::vector<std::tuple<size_t, PyObject*>>& pyObjects) {
+        using namespace tuplex;
+
+        std::vector<Partition*> partitions;
+        if (pyObjects.empty()) {
+            return partitions;
+        }
+
+        Schema schema(Schema::MemoryLayout::ROW, python::Type::makeTupleType({python::Type::STRING}));
+        Partition* partition = allocPartition(schema.getRowType(), -1);
+        auto rawPtr = (int64_t*)partition->lockWriteRaw();
+        *rawPtr = 0;
+        auto ptr = (uint8_t*)(rawPtr + 1);
+        size_t numBytesSerialized = 0;
+
+        python::lockGIL();
+        for (auto &row: pyObjects) {
+            auto rowNum = std::get<0>(row);
+            auto pyObj = std::get<1>(row);
+            auto ecCode = -1;
+            auto opID = -1;
+            auto pickledObject = python::pickleObject(python::getMainModule(), pyObj);
+            auto pickledObjectSize = pickledObject.size();
+            size_t requiredBytes = sizeof(int64_t) * 4 + pickledObjectSize;
+
+            if (partition->capacity() < numBytesSerialized + requiredBytes) {
+                partition->unlockWrite();
+                partitions.push_back(partition);
+                partition = allocPartition(schema.getRowType(), -1);
+                rawPtr = (int64_t *) partition->lockWriteRaw();
+                *rawPtr = 0;
+                ptr = (uint8_t*)(rawPtr + 1);
+                numBytesSerialized = 0;
+            }
+
+            *((int64_t*)ptr) = rowNum; ptr += sizeof(int64_t);
+            *((int64_t*)ptr) = ecCode; ptr += sizeof(int64_t);
+            *((int64_t*)ptr) = opID; ptr += sizeof(int64_t);
+            *((int64_t*)ptr) = pickledObjectSize; ptr += sizeof(int64_t);
+            memcpy(ptr, pickledObject.c_str(), pickledObjectSize); ptr += pickledObjectSize;
+
+            *rawPtr += 1;
+            numBytesSerialized += requiredBytes;
+        }
+        python::unlockGIL();
+
+        partition->unlockWrite();
+        partitions.push_back(partition);
+
+        return partitions;
+    }
 };
 
 TEST_F(ResultSetTest, NoPyObjects) {
@@ -68,10 +119,13 @@ TEST_F(ResultSetTest, NoPyObjects) {
         sample_rows.push_back(Row(rand() % 256, rand() % 256 * 0.1 - 1.0, strs[rand() % strs.size()]));
     }
     auto partitions = rowsToPartitions(sample_rows);
-    for(auto p : partitions)
-        p->makeImmortal();
+    std::vector<PartitionGroup> partitionGroups;
+    for(int i = 0; i < partitions.size(); ++i) {
+        partitions[i]->makeImmortal();
+        partitionGroups.push_back(PartitionGroup(1, i));
+    }
 
-    auto rsA = make_shared(Schema(Schema::MemoryLayout::ROW, sample_rows.front().getRowType()), partitions);
+    auto rsA = make_shared(Schema(Schema::MemoryLayout::ROW, sample_rows.front().getRowType()), partitions, std::vector{}, std::vector{}, partitionGroups);
     EXPECT_EQ(rsA->rowCount(), sample_rows.size());
 
     // check correct order returned
@@ -79,13 +133,14 @@
     while(rsA->hasNextRow()) {
         EXPECT_EQ(rsA->getNextRow().toPythonString(), sample_rows[pos++].toPythonString());
     }
+    EXPECT_EQ(pos, sample_rows.size());
 
     // now limit result set to 17 rows, check this works as well!
int Nlimit = 17; auto rsB = make_shared(Schema(Schema::MemoryLayout::ROW, sample_rows.front().getRowType()), partitions, std::vector{}, - std::unordered_map(), - vector>{}, + std::vector{}, + partitionGroups, Nlimit); pos = 0; while(rsB->hasNextRow()) { @@ -137,13 +192,15 @@ TEST_F(ResultSetTest, WithPyObjects) { vector refC = {Row(10), Row(20), Row(30), Row(35), Row(37)}; vector refD = {Row(-1), Row(0), Row(1)}; + auto partitionGroups = std::vector{PartitionGroup(1,0,0,0,1,0)}; + // TEST A: // ----------------- auto rsA = make_shared(Schema(Schema::MemoryLayout::ROW, rows.front().getRowType()), partitions, std::vector{}, - std::unordered_map(), - objsA); + pyObjectsToPartitions(objsA), + partitionGroups); EXPECT_EQ(rsA->rowCount(), objsA.size() + rows.size()); pos = 0; while(rsA->hasNextRow()) { @@ -156,8 +213,8 @@ TEST_F(ResultSetTest, WithPyObjects) { auto rsB = make_shared(Schema(Schema::MemoryLayout::ROW, rows.front().getRowType()), partitions, std::vector{}, - std::unordered_map(), - objsB); + pyObjectsToPartitions(objsB), + partitionGroups); EXPECT_EQ(rsB->rowCount(), objsB.size() + rows.size()); pos = 0; while(rsB->hasNextRow()) { @@ -171,8 +228,8 @@ TEST_F(ResultSetTest, WithPyObjects) { auto rsC = make_shared(Schema(Schema::MemoryLayout::ROW, rows.front().getRowType()), partitions, std::vector{}, - std::unordered_map(), - objsC); + pyObjectsToPartitions(objsC), + partitionGroups); EXPECT_EQ(rsC->rowCount(), objsC.size() + rows.size()); pos = 0; while(rsC->hasNextRow()) { @@ -180,6 +237,8 @@ TEST_F(ResultSetTest, WithPyObjects) { EXPECT_EQ(rsC->getNextRow().toPythonString(), refC[pos++].toPythonString()); } + partitionGroups = std::vector{PartitionGroup(0, 0, 0, 0, 1, 0)}; + // TEST D: // ------- // only pyobjects. @@ -188,8 +247,8 @@ TEST_F(ResultSetTest, WithPyObjects) { auto rsD = make_shared(Schema(Schema::MemoryLayout::ROW, rows.front().getRowType()), std::vector{}, std::vector{}, - std::unordered_map(), - objsD); + pyObjectsToPartitions(objsD), + partitionGroups); EXPECT_EQ(rsD->rowCount(), objsD.size()); pos = 0; while(rsD->hasNextRow()) { diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index ede9dd82d..40804ad41 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -17,59 +17,32 @@ #include #include #include +#include "../core/TestUtils.h" #include // need for these tests a running python interpreter, so spin it up -class WrapperTest : public ::testing::Test { -protected: - std::string testName; - std::string scratchDir; - +class WrapperTest : public TuplexTest { void SetUp() override { - testName = std::string(::testing::UnitTest::GetInstance()->current_test_info()->test_case_name()) + std::string(::testing::UnitTest::GetInstance()->current_test_info()->name()); - scratchDir = "/tmp/" + testName; + TuplexTest::SetUp(); python::initInterpreter(); - - // hold GIL assert(python::holdsGIL()); + + using namespace tuplex; + auto vfs = VirtualFileSystem::fromURI("."); + vfs.remove(testName); + auto err = vfs.create_dir(testName); + ASSERT_TRUE(err == VirtualFileSystemStatus::VFS_OK); } void TearDown() override { - - // important to get GIL for this + TuplexTest::TearDown(); python::closeInterpreter(); - } - - inline void remove_temp_files() { - tuplex::Timer timer; - boost::filesystem::remove_all(scratchDir.c_str()); - std::cout<<"removed temp files in "<(listObj); + auto res = c.parallelize(list).map("lambda x: 1 // x if x == 0 else x", "").resolve(ecToI64(ExceptionCode::ZERODIVISIONERROR), "lambda 
x: -1", "").collect(); + auto resObj = res.ptr(); + + ASSERT_EQ(PyList_Size(resObj), PyList_Size(expectedResult)); + for (int i = 0; i < PyList_Size(expectedResult); ++i) { + EXPECT_EQ(python::pythonToRow(PyList_GetItem(resObj, i)).toPythonString(), python::pythonToRow( + PyList_GetItem(expectedResult, i)).toPythonString()); + } + } +} + TEST_F(WrapperTest, StringTuple) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject *listObj = PyList_New(4); PyObject *tupleObj1 = PyTuple_New(2); @@ -131,7 +140,7 @@ TEST_F(WrapperTest, StringTuple) { TEST_F(WrapperTest, MixedSimpleTupleTuple) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject *listObj = PyList_New(4); PyObject *tupleObj1 = PyTuple_New(2); @@ -170,7 +179,7 @@ TEST_F(WrapperTest, MixedSimpleTupleTuple) { TEST_F(WrapperTest, StringParallelize) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(3); PyList_SET_ITEM(listObj, 0, python::PyString_FromString("Hello")); @@ -194,7 +203,7 @@ TEST_F(WrapperTest, StringParallelize) { TEST_F(WrapperTest, DictionaryParallelize) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * dictObj1 = PyDict_New(); PyDict_SetItem(dictObj1, python::PyString_FromString("a"), PyFloat_FromDouble(0.0)); @@ -243,7 +252,7 @@ TEST_F(WrapperTest, SimpleCSVParse) { PyDict_SetItemString(pyopt, "tuplex.webui.enable", Py_False); // RAII, destruct python context! - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); // weird block syntax due to RAII problems. { @@ -274,7 +283,7 @@ TEST_F(WrapperTest, SimpleCSVParse) { TEST_F(WrapperTest, GetOptions) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); // weird RAII problems of boost python { @@ -290,8 +299,8 @@ TEST_F(WrapperTest, GetOptions) { TEST_F(WrapperTest, TwoContexts) { using namespace tuplex; - PythonContext c("", "", testOptions()); - PythonContext c2("", "", testOptions()); + PythonContext c("", "", microTestOptions().asJSON()); + PythonContext c2("", "", microTestOptions().asJSON()); { auto opt1 = c.options(); @@ -315,7 +324,7 @@ TEST_F(WrapperTest, Show) { PyDict_SetItemString(pyopt, "tuplex.webui.enable", Py_False); // RAII, destruct python context! - PythonContext c("python", "", testOptions()); + PythonContext c("python", "", microTestOptions().asJSON()); // weird block syntax due to RAII problems. { @@ -340,7 +349,7 @@ TEST_F(WrapperTest, GoogleTrace) { PyDict_SetItemString(pyopt, "tuplex.webui.enable", Py_False); // RAII, destruct python context! - PythonContext c("python", "", testOptions()); + PythonContext c("python", "", testOptions().asJSON()); /// Based on Google trace data, this mini pipeline serves as CSV parsing test ground. /// c.csv(file_path) \ /// .filter(lambda x: x[3] == 0) \ @@ -487,7 +496,7 @@ TEST_F(WrapperTest, extractPriceExample) { auto cols = py::reinterpret_borrow(colObj); // RAII, destruct python context! - PythonContext c("python", "", testOptions()); + PythonContext c("python", "", testOptions().asJSON()); { // all calls go here... @@ -587,7 +596,7 @@ TEST_F(WrapperTest, DictListParallelize) { using namespace tuplex; // RAII, destruct python context! 
- PythonContext c("python", "", testOptions()); + PythonContext c("python", "", microTestOptions().asJSON()); // weird block syntax due to RAII problems. { @@ -624,9 +633,9 @@ TEST_F(WrapperTest, UpcastParallelizeI) { using namespace tuplex; // RAII, destruct python context! - auto opts = testOptions(); - opts = opts.substr(0, opts.length() - 1) + ", \"tuplex.autoUpcast\":\"True\"}"; - PythonContext c("python", "", opts); + auto opts = microTestOptions(); + opts.set("tuplex.autoUpcast", "true"); + PythonContext c("python", "", opts.asJSON()); // weird block syntax due to RAII problems. { @@ -656,9 +665,9 @@ TEST_F(WrapperTest, UpcastParallelizeII) { using namespace tuplex; // RAII, destruct python context! - auto opts = testOptions(); - opts = opts.substr(0, opts.length() - 1) + ", \"tuplex.autoUpcast\":\"True\"}"; - PythonContext c("python", "", opts); + auto opts = microTestOptions(); + opts.set("tuplex.autoUpcast", "true"); + PythonContext c("python", "", opts.asJSON()); // weird block syntax due to RAII problems. { @@ -692,9 +701,9 @@ TEST_F(WrapperTest, FilterAll) { using namespace tuplex; // RAII, destruct python context! - auto opts = testOptions(); - opts = opts.substr(0, opts.length() - 1) + ",\"tuplex.autoUpcast\":\"True\"}"; - PythonContext c("python", "", opts); + auto opts = microTestOptions(); + opts.set("tuplex.autoUpcast", "true"); + PythonContext c("python", "", opts.asJSON()); // weird block syntax due to RAII problems. { @@ -719,7 +728,7 @@ TEST_F(WrapperTest, ColumnNames) { using namespace tuplex; // RAII, destruct python context! - PythonContext c("python", "", testOptions()); + PythonContext c("python", "", microTestOptions().asJSON()); // weird block syntax due to RAII problems. { @@ -781,9 +790,9 @@ TEST_F(WrapperTest, IntegerTuple) { PyDict_SetItemString(pyopt, "tuplex.autoUpcast", Py_True); // RAII, destruct python context! - auto opts = testOptions(); - opts = opts.substr(0, opts.length() - 1) + ",\"tuplex.autoUpcast\":\"True\"}"; - PythonContext c("python", "", opts); + auto opts = microTestOptions(); + opts.set("tuplex.autoUpcast", "true"); + PythonContext c("python", "", opts.asJSON()); // weird block syntax due to RAII problems. { @@ -838,8 +847,9 @@ TEST_F(WrapperTest, IfWithNull) { // RAII, destruct python context! auto opts = testOptions(); - opts = opts.substr(0, opts.length() - 1) + ",\"tuplex.useLLVMOptimizer\" : \"False\", \"tuplex.executorCount\":0}"; - PythonContext c("python", "", opts); + opts.set("tuplex.useLLVMOptimizer", "false"); + opts.set("tuplex.executorCount", "0"); + PythonContext c("python", "", opts.asJSON()); // execute mini part of pipeline and output csv to file // pipeline is // df = ctx.csv(perf_path) @@ -913,8 +923,9 @@ TEST_F(WrapperTest, FlightData) { // RAII, destruct python context! auto opts = testOptions(); - opts = opts.substr(0, opts.length() - 1) + ",\"tuplex.useLLVMOptimizer\" : \"False\", \"tuplex.executorCount\":0}"; - PythonContext c("python", "", opts); + opts.set("tuplex.useLLVMOptimizer", "false"); + opts.set("tuplex.executorCount", "0"); + PythonContext c("python", "", opts.asJSON()); // execute mini part of pipeline and output csv to file // pipeline is // df = ctx.csv(perf_path) @@ -1122,7 +1133,7 @@ TEST_F(WrapperTest, Airport) { // RAII, destruct python context! 
PythonContext c("python", "", - testOptions()); + testOptions().asJSON()); // execute mini part of pipeline and output csv to file // pipeline is @@ -1166,7 +1177,7 @@ TEST_F(WrapperTest, Airport) { TEST_F(WrapperTest, OptionParallelizeI) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(5); PyList_SET_ITEM(listObj, 0, PyLong_FromLong(112)); @@ -1196,7 +1207,7 @@ TEST_F(WrapperTest, OptionParallelizeI) { TEST_F(WrapperTest, OptionParallelizeII) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(5); @@ -1239,7 +1250,7 @@ TEST_F(WrapperTest, OptionParallelizeII) { TEST_F(WrapperTest, NoneParallelize) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(2); PyList_SET_ITEM(listObj, 0, Py_None); @@ -1263,7 +1274,7 @@ TEST_F(WrapperTest, NoneParallelize) { TEST_F(WrapperTest, EmptyMapI) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(4); PyList_SET_ITEM(listObj, 0, PyLong_FromLong(1)); @@ -1291,7 +1302,7 @@ TEST_F(WrapperTest, EmptyMapI) { TEST_F(WrapperTest, EmptyMapII) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(4); PyList_SET_ITEM(listObj, 0, PyLong_FromLong(1)); @@ -1323,7 +1334,7 @@ TEST_F(WrapperTest, EmptyMapII) { TEST_F(WrapperTest, EmptyMapIII) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(4); PyList_SET_ITEM(listObj, 0, PyLong_FromLong(1)); @@ -1355,7 +1366,7 @@ TEST_F(WrapperTest, EmptyMapIII) { TEST_F(WrapperTest, EmptyOptionMapI) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(4); PyList_SET_ITEM(listObj, 0, PyLong_FromLong(1)); @@ -1385,7 +1396,7 @@ TEST_F(WrapperTest, EmptyOptionMapI) { TEST_F(WrapperTest, EmptyOptionMapII) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(4); PyList_SET_ITEM(listObj, 0, PyLong_FromLong(1)); @@ -1415,7 +1426,7 @@ TEST_F(WrapperTest, EmptyOptionMapII) { TEST_F(WrapperTest, OptionTupleParallelizeI) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(3); @@ -1464,7 +1475,7 @@ TEST_F(WrapperTest, OptionTupleParallelizeI) { TEST_F(WrapperTest, OptionTupleParallelizeII) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(3); @@ -1513,7 +1524,7 @@ TEST_F(WrapperTest, OptionTupleParallelizeII) { TEST_F(WrapperTest, OptionTupleParallelizeIII) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = PyList_New(3); @@ -1562,7 +1573,7 @@ TEST_F(WrapperTest, OptionTupleParallelizeIII) { TEST_F(WrapperTest, parallelizeOptionTypeI) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + 
PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = python::runAndGet( "test_input = [(1.0, '2', 3, '4', 5, 6, True, 8, 9, None), (None, '2', 3, None, 5, 6, True, 8, 9, None)" @@ -1589,7 +1600,7 @@ TEST_F(WrapperTest, parallelizeOptionTypeI) { TEST_F(WrapperTest, parallelizeNestedSlice) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); PyObject * listObj = python::runAndGet( "test_input = [((), (\"hello\",), 123, \"oh no\", (1, 2)), ((), (\"goodbye\",), 123, \"yes\", (-10, 2)),\n" @@ -1621,7 +1632,7 @@ TEST_F(WrapperTest, TPCHQ6) { " 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus',\n" " 'l_shipdate', 'l_commitdate', 'l_receiptdate',\n" " 'l_shipinstruct', 'l_shipmode', 'l_comment']", "listitem_columns"); - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", testOptions().asJSON()); { @@ -1643,7 +1654,7 @@ TEST_F(WrapperTest, TupleParallelizeI) { PyObject* listObj = python::runAndGet("L = [('hello', 'world', 'hi', 1, 2, 3), ('foo', 'bar', 'baz', 4, 5, 6), ('blank', '', 'not', 7, 8, 9)]", "L"); - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); { auto list = py::reinterpret_borrow(listObj); c.parallelize(list).map("lambda x: ({x[0]: x[3], x[1]: x[4], x[2]: x[5]},)", "").show(); @@ -1655,7 +1666,7 @@ TEST_F(WrapperTest, TupleParallelizeII) { PyObject* listObj = python::runAndGet("L = [({}, {}, {}), ({}, {}, {}), ({}, {}, {})]", "L"); - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); { auto list = py::reinterpret_borrow(listObj); c.parallelize(list).map("lambda x, y, z: [x, y, z]", "").show(); @@ -1672,7 +1683,7 @@ TEST_F(WrapperTest, DictParallelizeRefTest) { PyObject* strings = python::runAndGet("strings = [('hello', 'world', 'hi'), ('foo', 'bar', 'baz'), ('blank', '', 'not')]\n", "strings"); PyObject* floats = python::runAndGet("floats = [(1.2, 3.4, -100.2), (5.6, 7.8, -1.234), (9.0, 0.1, 2.3)]\n", "floats"); ASSERT_TRUE(floats->ob_refcnt > 0); - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); { @@ -1715,7 +1726,7 @@ TEST_F(WrapperTest, DictParallelizeRefTest) { TEST_F(WrapperTest, BuiltinModule) { using namespace tuplex; using namespace std; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); { PyObject* L = PyList_New(3); @@ -1747,7 +1758,7 @@ TEST_F(WrapperTest, SwapIII) { " return a, b\n" "\n"; - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); { PyObject* L = PyList_New(2); auto tuple1 = PyTuple_New(2); @@ -2069,6 +2080,181 @@ namespace tuplex { } } +void executeZillow(tuplex::PythonContext &context, const tuplex::URI& outputURI, int step) { + using namespace tuplex; + using namespace std; + + auto extractBd = "def extractBd(x):\n" + " val = x['facts and features']\n" + " max_idx = val.find(' bd')\n" + " if max_idx < 0:\n" + " max_idx = len(val)\n" + " s = val[:max_idx]\n" + "\n" + " # find comma before\n" + " split_idx = s.rfind(',')\n" + " if split_idx < 0:\n" + " split_idx = 0\n" + " else:\n" + " split_idx += 2\n" + " r = s[split_idx:]\n" + " return int(r)"; + + auto extractType = "def extractType(x):\n" + " t = x['title'].lower()\n" + " type = 'unknown'\n" + " if 'condo' in t or 'apartment' in t:\n" + " type = 'condo'\n" + " if 'house' in t:\n" + " type = 'house'\n" + " return type"; + + auto extractBa = "def 
extractBa(x):\n" + " val = x['facts and features']\n" + " max_idx = val.find(' ba')\n" + " if max_idx < 0:\n" + " max_idx = len(val)\n" + " s = val[:max_idx]\n" + "\n" + " # find comma before\n" + " split_idx = s.rfind(',')\n" + " if split_idx < 0:\n" + " split_idx = 0\n" + " else:\n" + " split_idx += 2\n" + " r = s[split_idx:]\n" + " return int(r)"; + + auto extractSqft = "def extractSqft(x):\n" + " val = x['facts and features']\n" + " max_idx = val.find(' sqft')\n" + " if max_idx < 0:\n" + " max_idx = len(val)\n" + " s = val[:max_idx]\n" + "\n" + " split_idx = s.rfind('ba ,')\n" + " if split_idx < 0:\n" + " split_idx = 0\n" + " else:\n" + " split_idx += 5\n" + " r = s[split_idx:]\n" + " r = r.replace(',', '')\n" + " return int(r)"; + + auto extractPrice = "def extractPrice(x):\n" + " price = x['price']\n" + " p = 0\n" + " if x['offer'] == 'sold':\n" + " # price is to be calculated using price/sqft * sqft\n" + " val = x['facts and features']\n" + " s = val[val.find('Price/sqft:') + len('Price/sqft:') + 1:]\n" + " r = s[s.find('$')+1:s.find(', ') - 1]\n" + " price_per_sqft = int(r)\n" + " p = price_per_sqft * x['sqft']\n" + " elif x['offer'] == 'rent':\n" + " max_idx = price.rfind('/')\n" + " p = int(price[1:max_idx].replace(',', ''))\n" + " else:\n" + " # take price from price column\n" + " p = int(price[1:].replace(',', ''))\n" + "\n" + " return p"; + auto extractOffer = "def extractOffer(x):\n" + " offer = x['title'].lower()\n" + "\n" + " if 'sale' in offer:\n" + " offer = 'sale'\n" + " elif 'rent' in offer:\n" + " offer = 'rent'\n" + " elif 'sold' in offer:\n" + " offer = 'sold'\n" + " elif 'foreclos' in offer.lower():\n" + " offer = 'foreclosed'\n" + " else:\n" + " offer = 'unknown'\n" + "\n" + " return offer"; + + auto resolveBd = "def resolveBd(x):\n" + " if 'Studio' in x['facts and features']:\n" + " return 1\n" + " raise ValueError\n"; + + // create closure object for resolve_Ba + auto ba_closure = PyDict_New(); + auto math_mod = PyImport_ImportModule("math"); + auto re_mod = PyImport_ImportModule("re"); + assert(math_mod); assert(re_mod); + PyDict_SetItemString(ba_closure, "math", math_mod); + PyDict_SetItemString(ba_closure, "re", re_mod); + + auto cols_to_select = python::runAndGet("L = ['url', 'zipcode', 'address', 'city', 'state'," + "'bedrooms', 'bathrooms', 'sqft', 'offer', 'type', 'price']", "L"); + + { + auto ds = context.csv("../resources/zillow_dirty_sample_mini.csv"); + ds = ds.withColumn("bedrooms", extractBd, "", py::reinterpret_steal(ba_closure)); + if (step > 0) + ds = ds.resolve(ecToI64(ExceptionCode::VALUEERROR), resolveBd, ""); + if (step > 1) + ds = ds.ignore(ecToI64(ExceptionCode::VALUEERROR)); + ds = ds.filter("lambda x: x['bedrooms'] < 10", ""); + ds = ds.withColumn("type", extractType, "", py::reinterpret_steal(ba_closure)); + ds = ds.filter("lambda x: x['type'] == 'condo'", ""); + ds = ds.withColumn("zipcode", "lambda x: '%05d' % int(x['postal_code'])", ""); + if (step > 2) + ds = ds.ignore(ecToI64(ExceptionCode::TYPEERROR)); + ds = ds.mapColumn("city", "lambda x: x[0].upper() + x[1:].lower()", ""); + ds = ds.withColumn("bathrooms", extractBa, "", py::reinterpret_steal(ba_closure)); + if (step > 3) + ds = ds.ignore(ecToI64(ExceptionCode::VALUEERROR)); + ds = ds.withColumn("sqft", extractSqft, "", py::reinterpret_steal(ba_closure)); + if (step > 4) + ds = ds.ignore(ecToI64(ExceptionCode::VALUEERROR)); + ds = ds.withColumn("offer", extractOffer, "", py::reinterpret_steal(ba_closure)); + ds = ds.withColumn("price", extractPrice, "", 
py::reinterpret_steal(ba_closure)); + if (step > 5) + ds = ds.resolve(ecToI64(ExceptionCode::VALUEERROR), "lambda x: int(re.sub('[^0-9.]*', '', x['price']))", "", py::reinterpret_steal(ba_closure)); + ds = ds.filter("lambda x: 100000 < x['price'] < 2e7 and x['offer'] == 'sale'", ""); + ds = ds.selectColumns(py::reinterpret_borrow(cols_to_select)); + ds.tocsv(outputURI.toPath()); + } +} + +TEST_F(WrapperTest, IncrementalZillow) { + using namespace tuplex; + using namespace std; + + auto opts = testOptions(); + opts.set("tuplex.driverMemory", "512MB"); + opts.set("tuplex.executorCount", "0"); + opts.set("tuplex.optimizer.incrementalResolution", "true"); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "false"); + opts.set("tuplex.optimizer.nullValueOptimization", "false"); +// opts.set("tuplex.inputSplitSize", "16MB"); +// opts.set("tuplex.optimizer.codeStats", "true"); +// opts.set("tuplex.readBufferSize", "4KB"); +// opts.set("tuplex.resolveWithInterpreterOnly", "true"); +// opts.set("tuplex.allowUndefinedBehavior", "false"); +// opts.set("tuplex.autoUpcast", "false"); +// opts.set("tuplex.executorMemory", "512MB"); +// opts.set("tuplex.inputSplitSize", "16MB"); +// opts.set("tuplex.runTimeMemory", "32MB"); +// PythonContext incrementalContext("incremental", "", opts.asJSON()); + opts.set("tuplex.optimizer.incrementalResolution", "false"); + PythonContext plainContext("plain", "", opts.asJSON()); + + for (int step = 0; step < 7; ++step) { +// executeZillow(incrementalContext, testName + "/incremental.csv", step); + executeZillow(plainContext, testName + "/plain.csv", step); + +// auto incrementalResult = plainContext.csv(testName + "/incremental.*.csv").collect().ptr(); +// auto plainResult = plainContext.csv(testName + "/plain.*.csv").collect().ptr(); +// ASSERT_EQ(PyList_Size(incrementalResult), PyList_Size(plainResult)); + } + +} + TEST_F(WrapperTest, ZillowDirty) { using namespace tuplex; using namespace std; @@ -2156,7 +2342,7 @@ TEST_F(WrapperTest, BitwiseAnd) { PyObject* listObj = python::runAndGet("L = [(False, False), (False, True), (True, False), (True, True)]", "L"); - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); { auto list = py::reinterpret_borrow(listObj); auto res_list = c.parallelize(list).map("lambda a, b: a & b", "").collect(); @@ -2172,7 +2358,7 @@ TEST_F(WrapperTest, MetricsTest) { PyObject* listObj = python::runAndGet("L = [(False, False), (False, True), (True, False), (True, True)]", "L"); - PythonContext c("c", "", testOptions()); + PythonContext c("c", "", microTestOptions().asJSON()); { auto list = py::reinterpret_borrow(listObj); auto res_list = c.parallelize(list).map("lambda a, b: a & b", "").collect(); @@ -2364,9 +2550,9 @@ TEST_F(WrapperTest, MixedTypesIsWithNone) { using namespace tuplex; using namespace std; - auto opts = testOptions(); - opts = opts.substr(0, opts.length() - 1) + ",\"tuplex.optimizer.mergeExceptionsInOrder\":\"True\"}"; - PythonContext c("python", "", opts); + auto opts = microTestOptions(); + opts.set("tuplex.optimizer.mergeExceptionsInOrder", "true"); + PythonContext c("python", "", opts.asJSON()); PyObject *listObj = PyList_New(8); PyList_SetItem(listObj, 0, Py_None);