tuplex · LeonhardFS · Apr 13, 2022 · Dec 7, 2021 · Dec 8, 2021 · Dec 8, 2021
diff --git a/benchmarks/311/benchmark.sh b/benchmarks/311/benchmark.sh
diff --git a/benchmarks/311/runbenchmark.sh b/benchmarks/311/runbenchmark.sh
@@ -16,47 +16,77 @@ python3 create_conf.py --opt-null --opt-pushdown --opt-filter --opt-llvm > tuple
 python3 create_conf.py --opt-pushdown --opt-filter --opt-llvm > tuplex_config.json
 cp tuplex_config.json ${RESDIR}
 
-echo "running tuplex"
+# Weld
+echo "running weld"
+for ((r = 1; r <= NUM_RUNS; r++)); do
+  LOG="${RESDIR}/weld-run-$r.txt"
+  rm -rf "${OUTPUT_DIR}/weld_output"
+  timeout $TIMEOUT ${HWLOC} python2 rungrizzly.py --path $DATA_PATH --output-path ${OUTPUT_DIR}/weld_output >$LOG 2>$LOG.stderr
+done
+
+echo "-- running tuplex (single-threaded & multi-threaded)"
+
+cp tuplex_config_mt.json tuplex_config.json
+echo "running mt tuplex e2e"
 for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${RESDIR}/tuplex-run-e2e-$r.txt"
-  rm -rf "${OUTPUT_DIR}/tuplex_output"
-  timeout $TIMEOUT ${PYTHON} runtuplex.py --path $DATA_PATH --output-path "${OUTPUT_DIR}/tuplex_output" >$LOG 2>$LOG.stderr
+  timeout $TIMEOUT ${PYTHON} runtuplex.py --path $DATA_PATH --output-path ${OUTPUT_DIR}/tuplex_e2e >$LOG 2>$LOG.stderr
 done
+
+echo "running mt tuplex cached"
 for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${RESDIR}/tuplex-run-weld-$r.txt"
-  rm -rf "${OUTPUT_DIR}/tuplex_output"
-  timeout $TIMEOUT ${PYTHON} runtuplex.py --path $DATA_PATH --weld-mode --output-path "${OUTPUT_DIR}/tuplex_output" >$LOG 2>$LOG.stderr
+  timeout $TIMEOUT ${PYTHON} runtuplex.py --path $DATA_PATH --output-path ${OUTPUT_DIR}/tuplex_cached --weld-mode >$LOG 2>$LOG.stderr
+done
+
+cp tuplex_config_st.json tuplex_config.json
+echo "running st tuplex e2e"
+for ((r = 1; r <= NUM_RUNS; r++)); do
+  LOG="${RESDIR}/sttuplex-run-e2e-$r.txt"
+  timeout $TIMEOUT ${PYTHON} runtuplex.py --path $DATA_PATH --output-path ${OUTPUT_DIR}/sttuplex_e2e >$LOG 2>$LOG.stderr
+done
+
+echo "running st tuplex cached"
+for ((r = 1; r <= NUM_RUNS; r++)); do
+  LOG="${RESDIR}/sttuplex-run-weld-$r.txt"
+  timeout $TIMEOUT ${PYTHON} runtuplex.py --path $DATA_PATH --output-path ${OUTPUT_DIR}/sttuplex_cached --weld-mode >$LOG 2>$LOG.stderr
 done
 
+
 # spark
 export PYSPARK_PYTHON=${PYTHON}
 export PYSPARK_DRIVER_PYTHON=${PYTHON}
-echo "benchmarking pyspark"
+echo "benchmarking pyspark (4 modes)"
+echo "benchmarking pyspark e2e"
 for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${RESDIR}/pyspark-run-e2e-$r.txt"
   timeout $TIMEOUT spark-submit --master "local[16]" --driver-memory 100g runpyspark.py --path $DATA_PATH --output-path "${OUTPUT_DIR}/spark_output" >$LOG 2>$LOG.stderr
 done
+echo "benchmarking pyspark cached"
 for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${RESDIR}/pyspark-run-weld-$r.txt"
   timeout $TIMEOUT spark-submit --master "local[16]" --driver-memory 100g runpyspark.py --path $DATA_PATH --weld-mode --output-path "${OUTPUT_DIR}/spark_output" >$LOG 2>$LOG.stderr
 done
+echo "benchmarking pyspark sql e2e"
 for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${RESDIR}/pysparksql-run-e2e-$r.txt"
   timeout $TIMEOUT spark-submit --master "local[16]" --driver-memory 100g runpyspark.py --path $DATA_PATH --sql-mode --output-path "${OUTPUT_DIR}/spark_output" >$LOG 2>$LOG.stderr
 done
+echo "benchmarking pyspark sql cached"
 for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${RESDIR}/pysparksql-run-weld-$r.txt"
   timeout $TIMEOUT spark-submit --master "local[16]" --driver-memory 100g runpyspark.py --path $DATA_PATH --sql-mode --weld-mode --output-path "${OUTPUT_DIR}/spark_output" >$LOG 2>$LOG.stderr
 done
 
 
 # Dask
-echo "benchmarking dask"
+echo "benchmarking dask e2e"
 for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${RESDIR}/dask-run-e2e-$r.txt"
-  timeout $TIMEOUT ${PYTHON} rundask.py --path $DATA_PATH --output-path "${OUTPUT_DIR}/dask_output" >$LOG 2>$LOG.stderr
+  timeout $TIMEOUT ${PYTHON} rundask.py --path $DATA_PATH --output-path "${OUTPUT_DIR}/dask_e2e" >$LOG 2>$LOG.stderr
 done
+echo "benchmarking dask cached"
 for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${RESDIR}/dask-run-weld-$r.txt"
-  timeout $TIMEOUT ${PYTHON} rundask.py --path $DATA_PATH --weld-mode --output-path "${OUTPUT_DIR}/dask_output" >$LOG 2>$LOG.stderr
+  timeout $TIMEOUT ${PYTHON} rundask.py --path $DATA_PATH --weld-mode --output-path "${OUTPUT_DIR}/dask_cached" >$LOG 2>$LOG.stderr
 done
diff --git a/benchmarks/311/rundask.py b/benchmarks/311/rundask.py
@@ -1,3 +1,4 @@
+import shutil
 import time
 import argparse
 import json
@@ -43,6 +44,16 @@ def fix_zip_codes(zips):
 # save the run configuration
 output_path = args.output_path
 
+# if the output directory already exists, remove it so each run starts clean
+if os.path.exists(output_path):
+    shutil.rmtree(output_path)
+os.makedirs(output_path, exist_ok=True)
+
+# remove all files within
+
+# Dask fails if the target is a bare directory, so point it at a file pattern inside it
+output_path = os.path.join(output_path, 'export-*.csv')
+
 # get the input files
 perf_paths = [args.data_path]
 if not os.path.isfile(args.data_path):

diff --git a/benchmarks/311/rungrizzly.py b/benchmarks/311/rungrizzly.py
@@ -47,4 +47,4 @@
 
 print("Total end-to-end time, including compilation: %.2f" % query_time)
 
-print('framework,pandas_load,load,query\n{},{},{},{}'.format('weld-grizzly', pandas_load_time, load_time, query_time))
+print('framework,pandas_load,load,query\n{},{},{},{}'.format('weld-grizzly', pandas_load_time, load_time, query_time))
diff --git a/benchmarks/flights/runbenchmark.sh b/benchmarks/flights/runbenchmark.sh
@@ -16,6 +16,10 @@ PYTHON=python3.6
 mkdir -p ${LG_RESDIR}
 mkdir -p ${SM_RESDIR}
 
+# create original tuplex_config.json (gets overwritten by breakdown...)
+cp tuplex_config_template.json tuplex_config.json
+cat tuplex_config.json
+
 echo "running on large flight data"
 echo "running tuplex"
 for ((r = 1; r <= NUM_RUNS; r++)); do
@@ -65,3 +69,8 @@ for ((r = 1; r <= NUM_RUNS; r++)); do
   LOG="${SM_RESDIR}/dask-run-$r.txt"
     timeout $TIMEOUT ${PYTHON} rundask.py --path $SM_INPUT_PATH --output-path $OUTPUT_DIR/dask >$LOG 2>$LOG.stderr
 done
+
+# copy config files after LG run
+cp tuplex_config.json ${LG_RESDIR}/
+# copy config files after SM run
+cp tuplex_config.json ${SM_RESDIR}/
diff --git a/benchmarks/flights/rundask.py b/benchmarks/flights/rundask.py
@@ -92,7 +92,8 @@
     #client = Client(n_workers=8, threads_per_worker=1, processes=True) # default init
     # client = Client(n_workers=16, threads_per_worker=1, processes=True)
 
-    client=Client(n_workers=16, threads_per_worker=1, processes=True, memory_limit='8GB')
+    # because Dask tends to fail for the large flights dataset, give it more memory than Tuplex/Spark
+    client=Client(n_workers=16, threads_per_worker=1, processes=True, memory_limit='12GB')
     #client=Client()
     print(client)
 

diff --git a/benchmarks/flights/tuplex_config_template.json b/benchmarks/flights/tuplex_config_template.json
@@ -0,0 +1,11 @@
+{"webui.enable": false,
+"executorCount": 15,
+"executorMemory": "6G",
+"driverMemory": "10G",
+"partitionSize": "32MB",
+"runTimeMemory": "8MB",
+"useLLVMOptimizer": true,
+"optimizer.nullValueOptimization": true,
+"csv.selectionPushdown": true,
+"resolveWithInterpreterOnly":false,
+"mergeRowsInOrder":false}
diff --git a/benchmarks/logs/process_data.py b/benchmarks/logs/process_data.py
@@ -1,19 +1,48 @@
+#!/usr/bin/env python3
 import glob
 import os
+import argparse
 
-input_dir = "/disk/data/weblogs/*.*.*.txt"
-output_dir =  "/disk/data/weblogs_clean"
+if __name__ == '__main__':
+    # parse the arguments
+    parser = argparse.ArgumentParser(description="Apache data cleaning + join")
+    parser.add_argument(
+        "--input-path",
+        type=str,
+        dest="input_path",
+        default="/data/logs",
+        help="raw logs path",
+    )
+    parser.add_argument(
+        "--output-path",
+        type=str,
+        dest="output_path",
+        default="/data/logs_clean",
+        help="raw logs path",
+    )
 
-if not os.path.exists(output_dir):
-    os.makedirs(output_dir)
+    args = parser.parse_args()
 
-for filename in glob.glob(input_dir):
-    new_filename = f'{output_dir}/{filename[filename.rfind("/")+1:]}'
-    print(f'{filename} -> {new_filename}')
-    with open(filename, "r", encoding='latin_1') as f:
-        with open(new_filename, 'w') as fo:
-            for l in f:
-                test = l.encode('ascii', 'replace').decode('ascii') # make it ascii
-                test = test.replace('\0', '?') # get rid of null characters
-                fo.write(test)
+    input_dir = os.path.join(args.input_path, "*.*.*.txt")
+    output_dir = args.output_path
 
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    num_skipped = 0
+    for filename in glob.glob(input_dir):
+        new_filename = f'{output_dir}/{filename[filename.rfind("/")+1:]}'
+
+        if os.path.isfile(new_filename):
+            num_skipped += 1
+        else:
+            print(f'{filename} -> {new_filename}')
+            with open(filename, "r", encoding='latin_1') as f:
+                with open(new_filename, 'w') as fo:
+                    for l in f:
+                        test = l.encode('ascii', 'replace').decode('ascii') # make it ascii
+                        test = test.replace('\0', '?') # get rid of null characters
+                        fo.write(test)
+    if num_skipped > 0:
+        print('skipped {} files'.format(num_skipped))
+    print('Done.')
diff --git a/benchmarks/logs/runbenchmark.sh b/benchmarks/logs/runbenchmark.sh
@@ -16,10 +16,14 @@ if [ $# -eq 1 ]; then # check if hwloc
   HWLOC="hwloc-bind --cpubind node:1 --membind node:1 --cpubind node:2 --membind node:2"
 fi
 
-DATA_PATH=/data/logs
+
+# invoke preprocess script
+python3 process_data.py --input-path /data/logs --output-path /data/logs_clean
+
+DATA_PATH=/data/logs_clean
 IP_PATH=/data/logs/ip_blacklist.csv
-RESDIR=/results/weblogs
-OUTPUT_DIR=/results/output/weblogs
+RESDIR=/results/logs
+OUTPUT_DIR=/results/output/logs
 NUM_RUNS="${NUM_RUNS:-11}"
 REDUCED_NUM_RUNS=4
 TIMEOUT=14400

diff --git a/benchmarks/logs/rundask.py b/benchmarks/logs/rundask.py
@@ -244,7 +244,8 @@ def try_int(x):
     import pandas as pd
     import numpy as np
 
-    client = Client(n_workers=16, threads_per_worker=1, processes=True, memory_limit='8GB')
+    # because Dask tends to fail for the large logs dataset, give it more memory than Tuplex/Spark
+    client = Client(n_workers=16, threads_per_worker=1, processes=True, memory_limit='14GB')
     print(client)
     startup_time = time.time() - tstart
     print("Dask startup time: {}".format(startup_time))

diff --git a/benchmarks/logs/tuplex_config.json b/benchmarks/logs/tuplex_config.json
@@ -2,7 +2,7 @@
   "executorMemory": "6G",
   "executorCount": 15,
   "driverMemory": "10G",
-  "partitionSize": "32MB",
+  "partitionSize": "16MB",
   "runTimeMemory": "64MB",
   "inputSplitSize": "64MB",
   "useLLVMOptimizer": true,

diff --git a/benchmarks/sigmod21-reproducibility/AWS_Configuration.md b/benchmarks/sigmod21-reproducibility/AWS_Configuration.md
@@ -22,11 +22,12 @@ sudo chown $(whoami) /disk
 # 2. install docker
 sudo apt update
 sudo apt install -y apt-transport-https ca-certificates curl software-properties-common p7zip-full
-curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
-sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable"
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
+echo \
+  "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \
+  $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
 sudo apt update
-apt-cache policy docker-ce # should print out apt details
-sudo apt install -y docker-ce
+sudo apt-get install -y docker-ce docker-ce-cli containerd.io
 sudo systemctl status docker # should print status out
 sudo usermod -aG docker ${USER} # allows to run docker commands as non-sudo
 #logout, login to make user group changes effective

diff --git a/benchmarks/sigmod21-reproducibility/README.md b/benchmarks/sigmod21-reproducibility/README.md
@@ -100,7 +100,7 @@ In `AWS_Configuration.md` we provide the commands we used to configure the machi
 
 ### D) Experimentation Info
 
-For convenience we provide a top-level command-line interface `/tuplex.py` to carry out various tasks in order to reproduce the results. In order to carry out experiments, a couple steps to be performed after setting up a benchmark machine (as described in C) or `AWS_Setup.md` / `AWS_Configuration.md`), for which the CLI may be used:
+For convenience, we provide a top-level command-line interface `/tuplex.py` to carry out various tasks in order to reproduce the results. In order to carry out experiments, a couple of steps need to be performed after setting up a benchmark machine (as described in C) or `AWS_Setup.md` / `AWS_Configuration.md`), for which the CLI may be used:
 
 1. download & extract data  
     `./tuplex.py download --password <PASSWORD HERE>`
Original file line number	Diff line number	Diff line change
Expand Up		@@ -47,4 +47,4 @@

		print("Total end-to-end time, including compilation: %.2f" % query_time)

		print('framework,pandas_load,load,query\n{},{},{},{}'.format('weld-grizzly', pandas_load_time, load_time, query_time))
		print('framework,pandas_load,load,query\n{},{},{},{}'.format('weld-grizzly', pandas_load_time, load_time, query_time))