这是indexloc提供的服务,不要输入任何密码
Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
174 commits
Select commit Hold shift + click to select a range
d7db067
Standardize runtime exceptions to use ExceptionInfo
bgivertz Feb 4, 2022
31f7d11
Support for fallback and general case partitions
bgivertz Feb 9, 2022
c0b5d10
Give test more memory
bgivertz Feb 9, 2022
d98d471
create benchmark files
bgivertz Feb 10, 2022
d686b0b
BBSN memory
bgivertz Feb 10, 2022
2fbaf15
BBSN memory
bgivertz Feb 10, 2022
eea2257
BBSN memory
bgivertz Feb 10, 2022
d042bc4
BBSN memory
bgivertz Feb 10, 2022
fe2639e
Update incremental benchmarks
bgivertz Feb 11, 2022
ecba919
Temp metrics for resolve task
bgivertz Feb 11, 2022
2886da7
Temp metrics for resolve task
bgivertz Feb 11, 2022
6309d7b
Temp metrics for resolve task
bgivertz Feb 11, 2022
2006da0
Revert metrics for resolve task
bgivertz Feb 11, 2022
33cf870
Detecting incremental resolution
bgivertz Feb 15, 2022
48af6c2
File to file no merge in order
bgivertz Feb 15, 2022
7dce49e
File to file no merge in order
bgivertz Feb 15, 2022
0cb2217
File to file no merge in order
bgivertz Feb 15, 2022
826650a
File to file no merge in order
bgivertz Feb 15, 2022
4e80958
File to file no merge in order
bgivertz Feb 16, 2022
469f26b
File to file no merge in order
bgivertz Feb 16, 2022
d6341ac
File to file no merge in order
bgivertz Feb 16, 2022
300fed6
File to file no merge in order
bgivertz Feb 16, 2022
6b71ba8
File to file no merge in order
bgivertz Feb 16, 2022
5dd8377
File to file no merge in order
bgivertz Feb 16, 2022
3f7c844
File to file no merge in order
bgivertz Feb 16, 2022
8021df3
File to file no merge in order
bgivertz Feb 16, 2022
dc3e291
File to file no merge in order
bgivertz Feb 17, 2022
6883c3a
File to file no merge in order
bgivertz Feb 23, 2022
028ee90
File to file no merge in order
bgivertz Feb 23, 2022
f9ace98
File to file no merge in order
bgivertz Feb 23, 2022
63cad64
File to file no merge in order
bgivertz Feb 23, 2022
9b44cc1
File to file no merge in order
bgivertz Feb 23, 2022
2215d56
File to file no merge in order
bgivertz Feb 23, 2022
3a4305b
File to file no merge in order
bgivertz Feb 23, 2022
cb1128a
File to file no merge in order
bgivertz Feb 23, 2022
2eaffd9
File to file no merge in order
bgivertz Feb 23, 2022
0fdeead
File to file no merge in order
bgivertz Feb 23, 2022
702dddb
Merge branch 'tuplex:master' into incremental-refactor
bgivertz Feb 24, 2022
6da427a
File to file no merge in order
bgivertz Feb 24, 2022
eec910d
File to file no merge in order
bgivertz Feb 24, 2022
2668533
File to file no merge in order
bgivertz Feb 24, 2022
853f516
File to file no merge in order
bgivertz Feb 24, 2022
303766f
File to file no merge in order
bgivertz Feb 24, 2022
f1807e4
File to file no merge in order
bgivertz Feb 24, 2022
0c8f6a8
File to file no merge in order
bgivertz Feb 24, 2022
f2eef29
File to file no merge in order
bgivertz Feb 25, 2022
230fdc6
File to file merge in order
bgivertz Feb 27, 2022
7a5b10b
File to file merge in order
bgivertz Feb 28, 2022
806d687
File to file merge in order
bgivertz Feb 28, 2022
7107a1d
File to file merge in order
bgivertz Feb 28, 2022
1d2f9bb
File to file merge in order
bgivertz Feb 28, 2022
bdf7ef5
File to file merge in order
bgivertz Feb 28, 2022
3237b8b
File to file merge in order
bgivertz Feb 28, 2022
46b677f
File to file merge in order
bgivertz Feb 28, 2022
e103263
File to file merge in order
bgivertz Feb 28, 2022
dc22b59
File to file merge in order
bgivertz Mar 6, 2022
70f4c75
File to file merge in order
bgivertz Mar 6, 2022
653fcc6
File to file merge in order
bgivertz Mar 6, 2022
437e022
File to file merge in order
bgivertz Mar 6, 2022
a46adbe
File to file merge in order
bgivertz Mar 6, 2022
1611278
File to file merge in order
bgivertz Mar 6, 2022
afc44c8
Merge branch 'tuplex:master' into incremental-refactor
bgivertz Mar 9, 2022
c227631
File to file merge in order
bgivertz Mar 11, 2022
30d144d
File to file merge in order
bgivertz Mar 11, 2022
8949800
File to file merge in order
bgivertz Mar 11, 2022
d6eee8e
File to file merge in order
bgivertz Mar 11, 2022
4a1c247
File to file merge in order
bgivertz Mar 11, 2022
fddd545
Commit to merge in order
bgivertz Mar 15, 2022
7800f83
Commit to merge in order
bgivertz Mar 15, 2022
ca4c175
Commit to merge in order
bgivertz Mar 15, 2022
905444a
Commit to merge in order
bgivertz Mar 16, 2022
cee8ba6
Commit to merge in order
bgivertz Mar 16, 2022
f2fdb5c
Commit to merge in order
bgivertz Mar 16, 2022
f60548d
Commit to merge in order
bgivertz Mar 16, 2022
d3b06d5
Commit to merge in order
bgivertz Mar 16, 2022
c201b7b
Commit to merge in order
bgivertz Mar 16, 2022
1de564d
Commit to merge in order
bgivertz Mar 16, 2022
7173f11
Commit to merge in order
bgivertz Mar 16, 2022
1e2679a
Commit to merge in order
bgivertz Mar 16, 2022
2e5d5a9
Commit to merge in order
bgivertz Mar 16, 2022
4e75ee7
Commit to merge in order
bgivertz Mar 16, 2022
0ff9c79
Commit to merge in order
bgivertz Mar 16, 2022
077ca91
Commit to merge in order
bgivertz Mar 16, 2022
b17a469
Commit to merge in order
bgivertz Mar 16, 2022
9410f1f
Commit to merge in order
bgivertz Mar 16, 2022
80633f7
Commit to merge in order
bgivertz Mar 17, 2022
0444087
Commit to merge in order
bgivertz Mar 17, 2022
85af466
Commit to merge in order
bgivertz Mar 18, 2022
d504369
Commit to merge in order
bgivertz Mar 18, 2022
a409dcf
Commit to merge in order
bgivertz Mar 18, 2022
88d5c8e
Commit to merge in order
bgivertz Mar 18, 2022
910e654
Commit to merge in order
bgivertz Mar 18, 2022
477a71f
Commit to merge in order
bgivertz Mar 18, 2022
b58d81a
Commit to merge in order
bgivertz Mar 18, 2022
8fde8b9
Commit to merge in order
bgivertz Mar 18, 2022
95b09fe
Commit to merge in order
bgivertz Mar 18, 2022
1af0d6c
Commit to merge in order
bgivertz Mar 18, 2022
d7f2aa1
Commit to merge in order
bgivertz Mar 21, 2022
b97bcef
Commit to merge in order
bgivertz Mar 21, 2022
defe61a
Commit to merge in order
bgivertz Mar 21, 2022
9fc6d59
Commit to merge in order
bgivertz Mar 21, 2022
646a4f8
Commit to merge in order
bgivertz Mar 21, 2022
c9048de
Commit to merge in order
bgivertz Mar 21, 2022
5afe2f7
Commit to merge in order
bgivertz Mar 21, 2022
e5668e5
Commit to merge in order
bgivertz Mar 21, 2022
3092167
Commit to merge in order
bgivertz Mar 21, 2022
c5bb99d
Commit to merge in order
bgivertz Mar 22, 2022
e2a75d1
Commit to merge in order
bgivertz Mar 22, 2022
2f66d71
Commit to merge in order
bgivertz Mar 22, 2022
29f1f3d
Commit to merge in order
bgivertz Mar 22, 2022
42e9315
Commit to merge in order
bgivertz Mar 22, 2022
4d37fd7
Commit to merge in order
bgivertz Mar 22, 2022
9f9fa39
Commit to merge in order
bgivertz Mar 22, 2022
2435e98
Commit to merge in order
bgivertz Mar 22, 2022
e7cd987
Commit to merge in order
bgivertz Mar 22, 2022
f974063
Commit to merge in order
bgivertz Mar 22, 2022
15827d0
Commit to merge in order
bgivertz Mar 23, 2022
d4c996c
Commit to merge in order
bgivertz Mar 23, 2022
1e108df
Commit to merge in order
bgivertz Mar 24, 2022
d45375a
Commit to merge in order
bgivertz Mar 24, 2022
c29f9a3
Commit to merge in order
bgivertz Mar 25, 2022
4846224
Merge branch 'tuplex:master' into incremental-refactor
bgivertz Apr 2, 2022
735ef8f
Commit to merge in order
bgivertz Apr 3, 2022
15e6fff
Commit to merge in order
bgivertz Apr 4, 2022
4d23c4f
Commit to merge in order
bgivertz Apr 4, 2022
2f42756
Commit to merge in order
bgivertz Apr 4, 2022
619ec53
Commit to merge in order
bgivertz Apr 4, 2022
57cf486
Commit to merge in order
bgivertz Apr 4, 2022
c700de3
Commit to merge in order
bgivertz Apr 4, 2022
4b4eead
Commit to merge in order
bgivertz Apr 4, 2022
e49fe9c
Commit to merge in order
bgivertz Apr 4, 2022
26b11d8
Commit to merge in order
bgivertz Apr 4, 2022
1cf82d6
Commit to merge in order
bgivertz Apr 4, 2022
5075774
Commit to merge in order
bgivertz Apr 4, 2022
01624c3
Commit to merge in order
bgivertz Apr 7, 2022
0cfa974
Commit to merge in order
bgivertz Apr 7, 2022
7d8250f
Commit to merge in order
bgivertz Apr 7, 2022
461f8a1
Commit to merge in order
bgivertz Apr 7, 2022
397fa45
Synthetic pipeline
bgivertz Apr 13, 2022
741a8f0
Synthetic pipeline
bgivertz Apr 13, 2022
9b2c30a
Synthetic pipeline
bgivertz Apr 14, 2022
229097c
Synthetic pipeline
bgivertz Apr 14, 2022
b33771e
Synthetic pipeline
bgivertz Apr 14, 2022
aeab7d7
Synthetic pipeline
bgivertz Apr 14, 2022
d8f407b
Synthetic pipeline
bgivertz Apr 14, 2022
ffb0277
Synthetic pipeline
bgivertz Apr 14, 2022
d6f4b31
Synthetic pipeline
bgivertz Apr 14, 2022
65bf5c4
Synthetic pipeline
bgivertz Apr 14, 2022
c348fcb
Synthetic pipeline
bgivertz Apr 14, 2022
ca9f190
Synthetic pipeline
bgivertz Apr 14, 2022
45628aa
Synthetic pipeline
bgivertz Apr 14, 2022
cf47225
Synthetic pipeline
bgivertz Apr 14, 2022
64904f9
Synthetic pipeline
bgivertz Apr 14, 2022
36b8ed5
Synthetic pipeline
bgivertz Apr 14, 2022
14db2f1
Synthetic pipeline
bgivertz Apr 14, 2022
705764e
Synthetic pipeline
bgivertz Apr 14, 2022
031cb59
Synthetic pipeline
bgivertz Apr 14, 2022
898951e
Synthetic pipeline
bgivertz Apr 14, 2022
0cef126
Synthetic pipeline
bgivertz Apr 14, 2022
7a882f4
Synthetic pipeline
bgivertz Apr 14, 2022
885dc70
Synthetic pipeline
bgivertz Apr 15, 2022
fa75ab6
Synthetic pipeline
bgivertz Apr 15, 2022
96258c0
Synthetic pipeline
bgivertz Apr 15, 2022
79fedf4
Synthetic pipeline
bgivertz Apr 15, 2022
115b4bd
Synthetic pipeline
bgivertz Apr 15, 2022
c97fa92
Synthetic pipeline
bgivertz Apr 15, 2022
a3ac129
Synthetic pipeline
bgivertz Apr 15, 2022
cf2d9a1
Merge branch 'tuplex:master' into incremental-refactor
bgivertz Apr 17, 2022
2a4a16f
Merge branch 'tuplex:master' into incremental-refactor
bgivertz Apr 27, 2022
8a6f053
Synthetic pipeline
bgivertz Apr 27, 2022
d5b2f10
Merge remote-tracking branch 'origin/incremental-refactor' into incre…
bgivertz Apr 27, 2022
d24ec2a
Job metrics doc strings
bgivertz Apr 27, 2022
a72b7ff
PR Changes
bgivertz Apr 27, 2022
5afee38
PR Changes
bgivertz Apr 27, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions benchmarks/incremental/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
data/*.csv
tuplex_output/
python_code_pipeline_stage_*.py
transform_stage_*.txt
results_dirty_zillow@10G/
tuplex_config.json
25 changes: 25 additions & 0 deletions benchmarks/incremental/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

## Incremental Exception Resolution Experiment

In this experiment, we run a sequence of six pipelines over the dirty zillow dataset.
- The first pipeline contains no ignore or resolver operations
- The final pipeline contains 5 unique ignore and resolver operations
- Each of the pipelines in between incrementally adds on an additional resolver until all are present for the final pipeline

We compare the following conditions, for a total of 8 experimental trials
- Plain vs Incremental Resolution
- Single vs Multi-threaded
- Merge in order vs Merge without order

In order to get 10GB of input data, replicate dirty zillow data 1460x (or use 1500x for simplicity).

### Setup
To replicate the original data, create the 10G files with the following settings:
```
python3 replicate-data.py -s 1460 -o data/zillow_dirty@10G.csv
```
Note that the replicated and synthetic files have the same number of rows, but the synthetic version is slightly larger on disk.

### Running the benchmark
Use
`nohup perflock ./benchmark.sh -hwloc &`
96 changes: 96 additions & 0 deletions benchmarks/incremental/benchmark-synthetic.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env bash
# Run the synthetic-pipeline benchmark: plain vs incremental (and commit)
# resolution, out-of-order and in-order, over inputs with increasing
# exception rates (synth0.csv .. synth<NUM_STEPS>.csv).
# usage: ./benchmark.sh [-hwloc]

# Parse HWLOC settings
HWLOC=""
if [ $# -ne 0 ] && [ $# -ne 1 ]; then # check number of inputs
    echo "usage: ./benchmark.sh [-hwloc]"
    exit 1
fi

if [ $# -eq 1 ]; then # check if hwloc
    if [ "$1" != "-hwloc" ]; then # check flag
        echo -e "invalid flag: $1\nusage: ./benchmark.sh [-hwloc]"
        exit 1
    fi
    HWLOC="hwloc-bind --cpubind node:1 --membind node:1 --cpubind node:2 --membind node:2"
fi

# NUM_RUNS trials per experiment, NUM_STEPS exception-rate steps per trial;
# each job is killed after TIMEOUT seconds (4h).
NUM_RUNS=1
NUM_STEPS=10
TIMEOUT=14400

RESDIR='results_synthetic'
DATA_PATH='/hot/scratch/bgivertz/data/synthetic/synth'
INCREMENTAL_OUT_PATH='/hot/scratch/bgivertz/output/incremental'
COMMIT_OUT_PATH='/hot/scratch/bgivertz/output/commit'
PLAIN_OUT_PATH='/hot/scratch/bgivertz/output/plain'

# start from a clean slate
rm -rf $RESDIR
rm -rf $INCREMENTAL_OUT_PATH
rm -rf $PLAIN_OUT_PATH
rm -rf $COMMIT_OUT_PATH

mkdir -p ${RESDIR}

# create tuplex_config.json
python3 create_conf.py --opt-pushdown --opt-filter --opt-llvm --executor-count 63 --executor-memory "6G" > tuplex_config.json

echo "running out of order experiments"
for ((r = 1; r <= NUM_RUNS; r++)); do
    echo "trial ($r/$NUM_RUNS)"

    echo "running plain (0/$NUM_STEPS)"
    LOG="${RESDIR}/plain-out-of-order-e0-t$r.txt"
    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --input-path "$DATA_PATH""0.csv" --output-path $PLAIN_OUT_PATH >$LOG 2>$LOG.stderr

    echo "running incremental (0/$NUM_STEPS)"
    LOG="${RESDIR}/incremental-out-of-order-e0-t$r.txt"
    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --incremental-resolution --input-path "$DATA_PATH""0.csv" --output-path $INCREMENTAL_OUT_PATH >$LOG 2>$LOG.stderr

    # fix: the step loops previously hard-coded 10 and ignored NUM_STEPS
    for ((s = 1; s <= NUM_STEPS; s++)); do
        echo "running plain ($s/$NUM_STEPS)"
        LOG="${RESDIR}/plain-out-of-order-e$s-t$r.txt"
        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --input-path "$DATA_PATH$s.csv" --output-path $PLAIN_OUT_PATH >$LOG 2>$LOG.stderr

        echo "running incremental ($s/$NUM_STEPS)"
        LOG="${RESDIR}/incremental-out-of-order-e$s-t$r.txt"
        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --incremental-resolution --input-path "$DATA_PATH$s.csv" --output-path $INCREMENTAL_OUT_PATH >$LOG 2>$LOG.stderr
    done
done

echo "running in order experiments"
for ((r = 1; r <= NUM_RUNS; r++)); do
    echo "trial ($r/$NUM_RUNS)"

    echo "running plain (0/$NUM_STEPS)"
    LOG="${RESDIR}/plain-in-order-e0-t$r.txt"
    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --resolve-in-order --input-path "$DATA_PATH""0.csv" --output-path $PLAIN_OUT_PATH >$LOG 2>$LOG.stderr

    echo "running incremental (0/$NUM_STEPS)"
    LOG="${RESDIR}/incremental-in-order-e0-t$r.txt"
    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --resolve-in-order --incremental-resolution --input-path "$DATA_PATH""0.csv" --output-path $INCREMENTAL_OUT_PATH >$LOG 2>$LOG.stderr

    echo "running commit (0/$NUM_STEPS)"
    LOG="${RESDIR}/commit-in-order-e0-t$r.txt"
    timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --clear-cache --resolve-in-order --incremental-resolution --commit --input-path "$DATA_PATH""0.csv" --output-path $COMMIT_OUT_PATH >$LOG 2>$LOG.stderr

    for ((s = 1; s <= NUM_STEPS; s++)); do
        echo "running plain ($s/$NUM_STEPS)"
        LOG="${RESDIR}/plain-in-order-e$s-t$r.txt"
        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --resolve-in-order --input-path "$DATA_PATH$s.csv" --output-path $PLAIN_OUT_PATH >$LOG 2>$LOG.stderr

        echo "running incremental ($s/$NUM_STEPS)"
        LOG="${RESDIR}/incremental-in-order-e$s-t$r.txt"
        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --resolve-in-order --incremental-resolution --input-path "$DATA_PATH$s.csv" --output-path $INCREMENTAL_OUT_PATH >$LOG 2>$LOG.stderr

        echo "running commit ($s/$NUM_STEPS)"
        LOG="${RESDIR}/commit-in-order-e$s-t$r.txt"
        timeout $TIMEOUT ${HWLOC} python3 runsynthetic.py --use-resolve-step --clear-cache --resolve-in-order --incremental-resolution --commit --input-path "$DATA_PATH$s.csv" --output-path $COMMIT_OUT_PATH >$LOG 2>$LOG.stderr
    done
done

# clean up experiment output (logs in $RESDIR are kept)
rm -rf $INCREMENTAL_OUT_PATH
rm -rf $PLAIN_OUT_PATH
rm -rf $COMMIT_OUT_PATH
90 changes: 90 additions & 0 deletions benchmarks/incremental/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Run the dirty-zillow 100G benchmark: plain vs incremental resolution (plus a
# commit variant), out-of-order and in-order, NUM_RUNS trials each; logs go to
# $RESDIR and are graphed at the end.
# usage: ./benchmark.sh [-hwloc]

# Parse HWLOC settings
HWLOC=""
if [ $# -ne 0 ] && [ $# -ne 1 ]; then # check number of inputs
echo "usage: ./benchmark.sh [-hwloc]"
exit 1
fi

if [ $# -eq 1 ]; then # check if hwloc
if [ "$1" != "-hwloc" ]; then # check flag
echo -e "invalid flag: $1\nusage: ./benchmark.sh [-hwloc]"
exit 1
fi
# NOTE(review): --cpubind/--membind are each given twice (node:1, then
# node:2) — confirm hwloc-bind combines these into a union rather than
# letting the last pair win.
HWLOC="hwloc-bind --cpubind node:1 --membind node:1 --cpubind node:2 --membind node:2"
fi

# NUM_RUNS trials per experiment; each job is killed after TIMEOUT seconds (4h)
NUM_RUNS=3
TIMEOUT=14400

RESDIR='results_dirty_zillow@100G'
DATA_PATH_SSD='/hot/scratch/bgivertz/data/zillow_dirty@100G.csv'
INCREMENTAL_OUT_PATH_SSD='/hot/scratch/bgivertz/output/incremental'
INCREMENTAL_COMMIT_OUT_PATH_SSD='/hot/scratch/bgivertz/output/commit'
PLAIN_OUT_PATH_SSD='/hot/scratch/bgivertz/output/plain'

# start from a clean slate (removes previous results!)
rm -rf $RESDIR
rm -rf $INCREMENTAL_OUT_PATH_SSD
rm -rf $PLAIN_OUT_PATH_SSD
rm -rf $INCREMENTAL_COMMIT_OUT_PATH_SSD

# does file exist?
if [[ ! -f "$DATA_PATH_SSD" ]]; then
echo "file $DATA_PATH_SSD not found, abort."
exit 1
fi

mkdir -p ${RESDIR}

# create tuplex_config.json
python3 create_conf.py --opt-pushdown --opt-filter --opt-llvm --executor-count 63 --executor-memory "6G" > tuplex_config.json

echo "running out-of-order ssd experiments"
for ((r = 1; r <= NUM_RUNS; r++)); do
echo "trial ($r/$NUM_RUNS)"

echo "running plain"
LOG="${RESDIR}/tuplex-plain-out-of-order-ssd-$r.txt"
timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --path $DATA_PATH_SSD --output-path $PLAIN_OUT_PATH_SSD >$LOG 2>$LOG.stderr

echo "running incremental"
LOG="${RESDIR}/tuplex-incremental-out-of-order-ssd-$r.txt"
timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --incremental-resolution --path $DATA_PATH_SSD --output-path $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr

# validation disabled to save time; re-enable to check plain vs incremental output
# echo "validating results"
# LOG="${RESDIR}/tuplex-compare-out-of-order-ssd-$r.txt"
# timeout $TIMEOUT ${HWLOC} python3 compare_folders.py $PLAIN_OUT_PATH_SSD $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr
done

echo "running in-order ssd experiments"
for ((r = 1; r <= NUM_RUNS; r++)); do
echo "trial ($r/$NUM_RUNS)"

echo "running plain"
LOG="${RESDIR}/tuplex-plain-in-order-ssd-$r.txt"
timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --resolve-in-order --path $DATA_PATH_SSD --output-path $PLAIN_OUT_PATH_SSD >$LOG 2>$LOG.stderr

echo "running incremental"
LOG="${RESDIR}/tuplex-incremental-in-order-ssd-$r.txt"
timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --resolve-in-order --incremental-resolution --path $DATA_PATH_SSD --output-path $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr

echo "running commit"
LOG="${RESDIR}/tuplex-incremental-in-order-commit-ssd-$r.txt"
timeout $TIMEOUT ${HWLOC} python3 runtuplex.py --clear-cache --resolve-in-order --incremental-resolution --commit --path $DATA_PATH_SSD --output-path $INCREMENTAL_COMMIT_OUT_PATH_SSD >$LOG 2>$LOG.stderr

# validation disabled to save time; re-enable to check ordered output
# echo "validating results"
# LOG="${RESDIR}/tuplex-compare-in-order-ssd-$r.txt"
# timeout $TIMEOUT ${HWLOC} python3 compare_folders.py --in-order $PLAIN_OUT_PATH_SSD $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr
#
# LOG="${RESDIR}/tuplex-compare-in-order-commit-ssd-$r.txt"
# timeout $TIMEOUT ${HWLOC} python3 compare_folders.py --in-order $INCREMENTAL_COMMIT_OUT_PATH_SSD $INCREMENTAL_OUT_PATH_SSD >$LOG 2>$LOG.stderr
done

echo "graphing results"
# NOTE(review): --num-steps 7 is hard-coded here — confirm it matches the
# number of pipeline steps graph.py expects for this experiment.
python3 graph.py --results-path $RESDIR --num-trials $NUM_RUNS --num-steps 7

rm -rf $INCREMENTAL_OUT_PATH_SSD
rm -rf $PLAIN_OUT_PATH_SSD
rm -rf $INCREMENTAL_COMMIT_OUT_PATH_SSD
79 changes: 79 additions & 0 deletions benchmarks/incremental/compare_folders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# (c) L.Spiegelberg 2021
# compare the csv output contents of two folders (ignoring order)

import os
import sys
import argparse
import glob


def wc_files(path):
    """Count CSV rows under *path* and collect their data lines.

    Returns a tuple ``(row_count, file_count, lines)``. When every file in
    the folder starts with the same header line, the headers are excluded
    from both the row count and the collected lines; otherwise all lines
    are kept verbatim.
    """
    csv_files = sorted(glob.glob(os.path.join(path, '*.csv')))

    collected = []
    total_rows = 0
    first_header = None
    headers_matched = 0
    for name in csv_files:
        with open(name, 'r') as handle:
            content = handle.readlines()
        # remember the header of the first non-empty file as the reference
        if first_header is None and content:
            first_header = content[0]
        total_rows += len(content)
        if not content:
            continue
        if content[0] == first_header:
            headers_matched += 1
            collected.extend(content[1:])
        else:
            collected.extend(content)

    # only discount headers when every single file carried the same one
    if headers_matched == len(csv_files):
        total_rows -= headers_matched

    print('-- counted {} rows in {} files in folder {}'.format(total_rows, len(csv_files), path))
    return total_rows, len(csv_files), collected


def main():
    """Compare the CSV contents of two folders; exit 1 on any mismatch."""
    parser = argparse.ArgumentParser()
    parser.add_argument("folderA")
    parser.add_argument("folderB")
    parser.add_argument("--in-order", help='whether to compare in order', action='store_true')
    args = parser.parse_args()

    print('== Dirty Zillow experiment validation ==')

    # count lines in each folder
    print('-- loading folder contents...')
    count_a, _files_a, rows_a = wc_files(args.folderA)
    count_b, _files_b, rows_b = wc_files(args.folderB)

    if count_a != count_b:
        print('>>> number of rows does not match')
        sys.exit(1)

    # unless an ordered comparison was requested, compare as multisets
    if not args.in_order:
        print('-- sorting rows from {}'.format(args.folderA))
        rows_a = sorted(rows_a)
        print('-- sorting rows from {}'.format(args.folderB))
        rows_b = sorted(rows_b)

    print('-- computing comparison of rows...')
    mismatches = [idx for idx, (left, right) in enumerate(zip(rows_a, rows_b)) if left != right]

    if mismatches:
        print('>>> rows do not match up, details:')
        for idx in mismatches:
            print('{:5d}: {} != {}'.format(idx, rows_a[idx], rows_b[idx]))
        sys.exit(1)

    print('>>> contents of folders match.')

    sys.exit(0)


# script entry point
if __name__ == '__main__':
    main()
11 changes: 11 additions & 0 deletions benchmarks/incremental/create-synthetic.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Generate the synthetic benchmark inputs synth0.csv .. synth10.csv with
# exception rates 0.0, 0.1, ..., 1.0.
# usage: ./create-synthetic.sh <dataset-size>

# fix: fail early with a usage message instead of passing an empty
# --dataset-size through to synthesize-data.py
if [ $# -ne 1 ]; then
    echo "usage: ./create-synthetic.sh <dataset-size>"
    exit 1
fi

set -x

python3 synthesize-data.py --dataset-size $1 --output-path /hot/scratch/bgivertz/data/synthetic/synth0.csv --exceptions 0

# exception rates 0.1 through 0.9
for ((i = 1; i <= 9; i++)); do
    python3 synthesize-data.py --dataset-size $1 --output-path /hot/scratch/bgivertz/data/synthetic/synth$i.csv --exceptions 0.$i
done

python3 synthesize-data.py --dataset-size $1 --output-path /hot/scratch/bgivertz/data/synthetic/synth10.csv --exceptions 1
36 changes: 36 additions & 0 deletions benchmarks/incremental/create_conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python3
# (c) 2020 L.Spiegelberg
# this script creates Tuplex json configuration files for benchmarks

import json
import argparse


def parse_args(argv=None):
    """Parse benchmark configuration options from *argv* (or sys.argv)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--executor-memory', default='10G', help='how much memory each thread gets')
    # fix: coerce to int so a CLI-supplied count is emitted as a JSON number
    # (previously '--executor-count 63' produced the string "63" while the
    # default produced the number 15)
    parser.add_argument('--executor-count', type=int, default=15, help='how many worker threads')
    parser.add_argument('--partition-size', default='32MB', help='task size')
    parser.add_argument('--runtime-memory', default='64MB', help='how much maximum runtime memory to use')
    parser.add_argument('--input-split-size', default='64MB', help='chunk size of input files')
    parser.add_argument('--opt-null', help='enable null value optimization', action='store_true')
    parser.add_argument('--opt-pushdown', help='enable projection pushdown', action='store_true')
    parser.add_argument('--opt-filter', help='enable filter pushdown', action='store_true')
    parser.add_argument('--opt-parser', help='generate CSV parser', action='store_true')
    parser.add_argument('--opt-llvm', help='run llvm optimizers', action='store_true')
    return parser.parse_args(argv)


def build_conf(args):
    """Map parsed CLI options onto Tuplex configuration keys."""
    return {'webui.enable': False,
            'executorMemory': args.executor_memory,
            'executorCount': args.executor_count,
            'driverMemory': args.executor_memory,  # driver gets the same budget as executors
            'partitionSize': args.partition_size,
            'runTimeMemory': args.runtime_memory,
            'inputSplitSize': args.input_split_size,
            'useLLVMOptimizer': args.opt_llvm,
            'optimizer.nullValueOptimization': args.opt_null,
            'csv.selectionPushdown': args.opt_pushdown,
            'optimizer.generateParser': args.opt_parser,
            'optimizer.filterPushdown': args.opt_filter}


if __name__ == '__main__':
    # emit the configuration as JSON on stdout (callers redirect to a file)
    print(json.dumps(build_conf(parse_args())))
Loading