43 changes: 43 additions & 0 deletions benchmarks/hyperspecialization/list_generate/TODO.txt
@@ -0,0 +1,43 @@
Immediate TODOs:
- implement tuplex hashmap
- implement specialized hashmap
- convert all results to PyDict

Leonhard
-- have way more elements per list.
-- the contained lists should be smaller, with random sizes.
-- sketch the graph from the whiteboard on Google Slides.
-- fix unordered_map

-- add standard deviation.

(Python baseline is x% slower than C++.)

-- assume a key distribution.
- with statistics of the key distribution we can do something smart about it.

-- multiple queries are our edge
-- real data is more compelling than artificial data

Unknowns
- for replace, what should the keys look like?
- what exactly does "reorganize data" mean?

Immediate TODOs:
- write string parser to identify type (see the sketch after this list).
- write specialized c++ code for int only (use boolean array, specialized hash function, etc.)
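
A hedged sketch of that string parser, classifying a cell as int, float, or fallback string (the enum and function names are illustrative assumptions, not existing code):

#include <cctype>
#include <string>

enum class CellType { Int, Float, String };

// All digits -> Int, digits with a single '.' -> Float,
// everything else falls back to String.
CellType classify(const std::string& s) {
    if (s.empty()) return CellType::String;
    size_t i = (s[0] == '-' || s[0] == '+') ? 1 : 0;  // optional sign
    bool digits = false, dot = false;
    for (; i < s.size(); i++) {
        if (std::isdigit(static_cast<unsigned char>(s[i]))) digits = true;
        else if (s[i] == '.' && !dot) dot = true;
        else return CellType::String;
    }
    if (!digits) return CellType::String;
    return dot ? CellType::Float : CellType::Int;
}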

Workloads
- count unique (update dictionary per iteration, inserts decrease over time, only numkeys lookup)
- replace (lookup dictionary per iteration; once everything is inserted, only lookups; see the sketch after this list)
- reorganize data (JSON -> list?) (need only traverse tree structure once)
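
A minimal C++ sketch of the replace workload, assuming int keys and a dictionary that is fully built before the lookups start (names are illustrative, not taken from the benchmark code):

#include <unordered_map>
#include <vector>

// Once the dictionary is built, the hot loop is lookup-only.
std::vector<int> replace_all(const std::vector<int>& keys,
                             const std::unordered_map<int, int>& mapping) {
    std::vector<int> out;
    out.reserve(keys.size());
    for (int k : keys) {
        auto it = mapping.find(k);                            // no inserts here
        out.push_back(it != mapping.end() ? it->second : k);  // unmapped keys pass through
    }
    return out;
}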

Code todo
- write python version of replace
- write cython version of unique, replace
- write cpp version of optimized float/int/string/bool only map/unordered_map

Plots todo
- create multiple maps vs. one map with a std::variant/wrapper key type
  (one map per key type, key type fixed per map --> plot runtime vs. list length,
  3x unordered_map, 3x ordered map; value always fixed to int; see the sketch below)
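
A sketch of the two designs under comparison, assuming the key types are int, double, and std::string (std::hash for a std::variant is available whenever all of its alternatives are hashable):

#include <string>
#include <unordered_map>
#include <variant>

// Design A: one map per key type, key type fixed per map, value fixed to int.
std::unordered_map<int, int>         int_map;
std::unordered_map<double, int>      float_map;
std::unordered_map<std::string, int> string_map;

// Design B: a single map keyed by a wrapper/variant over all three types.
using Key = std::variant<int, double, std::string>;
std::unordered_map<Key, int> variant_map;  // std::hash<Key> works: all alternatives are hashable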
23 changes: 23 additions & 0 deletions benchmarks/hyperspecialization/list_generate/build.sh
@@ -0,0 +1,23 @@
echo "building main binaries"
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o stdumap_int_int count_unique_bench.cc -DCOUNT_UM_INT -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o stdmap_int_int count_unique_bench.cc -DCOUNT_M_INT -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o stdmap_string_int count_unique_bench.cc -DCOUNT_M_STRING -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o stdumap_string_int count_unique_bench.cc -DCOUNT_UM_STRING -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o fixed_range_nopydict count_unique_bench.cc -DFIXED_RANGE -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -g -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o tuplex_int_nopydict count_unique_bench.cc -DTUPLEX_INT -I/usr/include/python3.8 -lpython3.8 -ldl

echo "building shared libs"

cd tuplex-hashmap
./build.sh
cd ..

cd specialized-hashmap
./build.sh
cd ..

cd cpp_shared
./build.sh
cd ..

echo "done"
95 changes: 95 additions & 0 deletions benchmarks/hyperspecialization/list_generate/cjson/count_cjson.cc
@@ -0,0 +1,95 @@
#include <bits/stdc++.h>
#include "../csvmonkey.hpp"
#include <cjson/cJSON.h>
#include <Python.h>

using KEY_TYPE = int;
using namespace csvmonkey;

const int MIN_KEY = 0;
const int MAX_KEY = 1000;

/*
- char dict[256]
- perfect hash function
*/

// PyObject* convert_to_dict(int* dict) {
// PyObject* mydict = PyDict_New();
// for(int i = 0; i <= )
// }
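
// A possible completion of the sketch above (per the TODO "convert all results
// to PyDict"). A hedged sketch only: not wired into run() yet, and it assumes
// the interpreter is initialized and the GIL is held.
PyObject* convert_to_dict(const int* dict) {
    PyObject* mydict = PyDict_New();
    for(int i = MIN_KEY; i <= MAX_KEY; i++) {
        if(dict[i] == 0) continue;            // emit only keys that occurred
        PyObject* key = PyLong_FromLong(i);
        PyObject* val = PyLong_FromLong(dict[i]);
        PyDict_SetItem(mydict, key, val);     // PyDict_SetItem does not steal references
        Py_DECREF(key);
        Py_DECREF(val);
    }
    return mydict;
}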

int countUnique(const std::vector<int>& li) {
    // keys are known to lie in [MIN_KEY, MAX_KEY], so a plain array
    // replaces a hash map entirely
    int dict[MAX_KEY + 1];
    memset(dict, 0, sizeof(dict));
    int ctr = 0;

    for(auto x : li) {
        if(dict[x] == 0) ctr++;
        dict[x] += 1;
    }
    return ctr;
}

std::vector<int> countUniqueList(const std::vector<std::vector<int> >& li) {
    std::vector<int> my_vec;
    my_vec.reserve(li.size());
    for(const auto& x : li) {
        my_vec.push_back(countUnique(x));
    }
    return my_vec;
}


template<typename T>
void run(const char* path) {
    // memory-map the input CSV and parse it into a list of lists
    MappedFileCursor stream;
    stream.open(path);

    CsvReader<MappedFileCursor> reader(stream);
    CsvCursor &row = reader.row();

    std::vector<std::vector<T> > list_of_lists;

    while(reader.read_row()) {
        std::vector<T> v;
        for(size_t i = 0; i < row.count; i++) {
            CsvCell &cell = row.cells[i];
            std::string s = cell.as_str();
            if constexpr (std::is_same<T, int>::value) {
                v.push_back(stoi(s));
            } else {
                v.push_back(s);
            }
        }

        list_of_lists.push_back(v);
    }

    using std::chrono::high_resolution_clock;
    using std::chrono::duration;

    // time only the counting, not the parsing
    auto t1 = high_resolution_clock::now();

    auto result = countUniqueList(list_of_lists);

    auto t2 = high_resolution_clock::now();

    /* Elapsed time in milliseconds as a double. */
    duration<double, std::milli> ms_double = t2 - t1;
    std::cout << ms_double.count() << "\n";
}

int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " <csv file>\n";
        return 1;
    }

    run<KEY_TYPE>(argv[1]);
    return 0;
}
@@ -0,0 +1,61 @@
import argparse
import time
import pickle
import csv
import pdb

def count_unique(simple_list):
    my_dict = {}
    for element in simple_list:
        if element in my_dict:
            my_dict[element] += 1
        else:
            my_dict[element] = 1  # first occurrence counts as 1
    return my_dict

def count_unique_for_loop(list_of_lists):
    tstart = time.time_ns()
    result = []
    for my_list in list_of_lists:
        result.append(count_unique(my_list))
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def count_unique_list_comprehension(list_of_lists):
    tstart = time.time_ns()
    result = [count_unique(my_list) for my_list in list_of_lists]
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def count_unique_map(list_of_lists):
    tstart = time.time_ns()
    result = list(map(count_unique, list_of_lists))
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def readlist_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def readlist_csv(filename):
    with open(filename, newline='') as f:
        list_of_lists = list(csv.reader(f))
    # print(len([len(li) for li in list_of_lists]), [len(li) for li in list_of_lists])
    # pdb.set_trace()
    return list_of_lists

def main():
    parser = argparse.ArgumentParser(description='Run baseline implementation of count unique.')
    parser.add_argument('--filename', type=str)
    args = parser.parse_args()

    randlist = readlist_csv(args.filename)

    count_unique_list_comprehension(randlist)

if __name__ == "__main__":
    main()
@@ -0,0 +1,61 @@
import argparse
import time
import pickle
import csv
import pdb
import gc

def count_unique(simple_list):
    my_dict = {}
    for element in simple_list:
        if element in my_dict:
            my_dict[element] += 1
        else:
            my_dict[element] = 0
    return len(my_dict)

def count_unique_for_loop(list_of_lists):
    tstart = time.time_ns()
    result = []
    for my_list in list_of_lists:
        result.append(count_unique(my_list))
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def count_unique_list_comprehension(list_of_lists):
    tstart = time.time_ns()
    result = [count_unique(my_list) for my_list in list_of_lists]
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def count_unique_map(list_of_lists):
    tstart = time.time_ns()
    result = list(map(count_unique, list_of_lists))
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def readlist_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def readlist_csv(filename):
    with open(filename, newline='') as f:
        list_of_lists = list(csv.reader(f))
    # print(len([len(li) for li in list_of_lists]), [len(li) for li in list_of_lists])
    return list_of_lists

def main():
    parser = argparse.ArgumentParser(description='Run baseline implementation of count unique.')
    parser.add_argument('--filename', type=str)
    args = parser.parse_args()
    randlist = readlist_csv(args.filename)
    count_unique_list_comprehension(randlist)

if __name__ == "__main__":
    main()
    gc.collect()  # explicit final collection, kept under the guard so importing has no side effects
