43 changes: 43 additions & 0 deletions benchmarks/hyperspecialization/list_generate/TODO.txt
@@ -0,0 +1,43 @@
Immediate TODOs:
- implement tuplex hashmap
- implement specialized hashmap
- convert all results to PyDict

Leonhard
-- have way more elements per list.
-- the contained lists should be smaller, with random sizes.
-- sketch the graph from the whiteboard on Google Slides.
-- fix unordered_map

-- add standard deviation.

(Python baseline is x% slower than C++.)

-- assume a key distribution.
- with statistics of the key distribution we can do something smart about it.

-- multiple queries are our edge
-- real data is more compelling than artificial data

Unknowns
- for replace, what should the keys look like?
- what exactly does "reorganize data" mean?

Immediate TODOs:
- write string parser to identify type (see the sketch after this list).
- write specialized c++ code for int only (use boolean array, specialized hash function, etc.)
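
A hedged sketch of that string parser, classifying a cell as int, float, or fallback string (the enum and function names are illustrative assumptions, not existing code):

#include <cctype>
#include <string>

enum class CellType { Int, Float, String };

// All digits -> Int, digits with a single '.' -> Float,
// everything else falls back to String.
CellType classify(const std::string& s) {
    if (s.empty()) return CellType::String;
    size_t i = (s[0] == '-' || s[0] == '+') ? 1 : 0;  // optional sign
    bool digits = false, dot = false;
    for (; i < s.size(); i++) {
        if (std::isdigit(static_cast<unsigned char>(s[i]))) digits = true;
        else if (s[i] == '.' && !dot) dot = true;
        else return CellType::String;
    }
    if (!digits) return CellType::String;
    return dot ? CellType::Float : CellType::Int;
}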

Workloads
- count unique (update dictionary per iteration, inserts decrease over time, only numkeys lookup)
- replace (lookup dictionary per iteration; once everything is inserted, only lookups; see the sketch after this list)
- reorganize data (JSON -> list?) (need only traverse tree structure once)
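
A minimal C++ sketch of the replace workload, assuming int keys and a dictionary that is fully built before the lookups start (names are illustrative, not taken from the benchmark code):

#include <unordered_map>
#include <vector>

// Once the dictionary is built, the hot loop is lookup-only.
std::vector<int> replace_all(const std::vector<int>& keys,
                             const std::unordered_map<int, int>& mapping) {
    std::vector<int> out;
    out.reserve(keys.size());
    for (int k : keys) {
        auto it = mapping.find(k);                            // no inserts here
        out.push_back(it != mapping.end() ? it->second : k);  // unmapped keys pass through
    }
    return out;
}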

Code todo
- write python version of replace
- write cython version of unique, replace
- write cpp version of optimized float/int/string/bool only map/unordered_map

Plots todo
- create multiple maps vs. one map with a std::variant/wrapper key type
  (one map per key type, key type fixed per map --> plot runtime vs. list length,
  3x unordered_map, 3x ordered map; value always fixed to int; see the sketch below)
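
A sketch of the two designs under comparison, assuming the key types are int, double, and std::string (std::hash for a std::variant is available whenever all of its alternatives are hashable):

#include <string>
#include <unordered_map>
#include <variant>

// Design A: one map per key type, key type fixed per map, value fixed to int.
std::unordered_map<int, int>         int_map;
std::unordered_map<double, int>      float_map;
std::unordered_map<std::string, int> string_map;

// Design B: a single map keyed by a wrapper/variant over all three types.
using Key = std::variant<int, double, std::string>;
std::unordered_map<Key, int> variant_map;  // std::hash<Key> works: all alternatives are hashable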
23 changes: 23 additions & 0 deletions benchmarks/hyperspecialization/list_generate/build.sh
@@ -0,0 +1,23 @@
echo "building main binaries"
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o stdumap_int_int count_unique_bench.cc -DCOUNT_UM_INT -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o stdmap_int_int count_unique_bench.cc -DCOUNT_M_INT -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o stdmap_string_int count_unique_bench.cc -DCOUNT_M_STRING -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o stdumap_string_int count_unique_bench.cc -DCOUNT_UM_STRING -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o fixed_range_nopydict count_unique_bench.cc -DFIXED_RANGE -I/usr/include/python3.8 -lpython3.8 -ldl
g++ -g -std=c++17 -msse4.2 -mcx16 -Wall -Wextra -O3 -march=native -DNDEBUG -o tuplex_int_nopydict count_unique_bench.cc -DTUPLEX_INT -I/usr/include/python3.8 -lpython3.8 -ldl

echo "building shared libs"

cd tuplex-hashmap
./build.sh
cd ..

cd specialized-hashmap
./build.sh
cd ..

cd cpp_shared
./build.sh
cd ..

echo "done"
95 changes: 95 additions & 0 deletions benchmarks/hyperspecialization/list_generate/cjson/count_cjson.cc
@@ -0,0 +1,95 @@
#include <bits/stdc++.h>
#include "../csvmonkey.hpp"
#include <cjson/cJSON.h>
#include <Python.h>

using KEY_TYPE = int;
using namespace csvmonkey;

const int MIN_KEY = 0;
const int MAX_KEY = 1000;

/*
- char dict[256]
- perfect hash function
*/

// PyObject* convert_to_dict(int* dict) {
// PyObject* mydict = PyDict_New();
// for(int i = 0; i <= )
// }
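
// A possible completion of the sketch above (per the TODO "convert all results
// to PyDict"). A hedged sketch only: not wired into run() yet, and it assumes
// the interpreter is initialized and the GIL is held.
PyObject* convert_to_dict(const int* dict) {
    PyObject* mydict = PyDict_New();
    for(int i = MIN_KEY; i <= MAX_KEY; i++) {
        if(dict[i] == 0) continue;            // emit only keys that occurred
        PyObject* key = PyLong_FromLong(i);
        PyObject* val = PyLong_FromLong(dict[i]);
        PyDict_SetItem(mydict, key, val);     // PyDict_SetItem does not steal references
        Py_DECREF(key);
        Py_DECREF(val);
    }
    return mydict;
}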

int countUnique(const std::vector<int>& li) {
    // keys are known to lie in [MIN_KEY, MAX_KEY], so a plain array
    // replaces a hash map entirely
    int dict[MAX_KEY + 1];
    memset(dict, 0, sizeof(dict));
    int ctr = 0;

    for(auto x : li) {
        if(dict[x] == 0) ctr++;
        dict[x] += 1;
    }
    return ctr;
}

std::vector<int> countUniqueList(const std::vector<std::vector<int> >& li) {
    std::vector<int> my_vec;
    my_vec.reserve(li.size());
    for(const auto& x : li) {
        my_vec.push_back(countUnique(x));
    }
    return my_vec;
}


template<typename T>
void run(const char* path) {
    // memory-map the input CSV and parse it into a list of lists
    MappedFileCursor stream;
    stream.open(path);

    CsvReader<MappedFileCursor> reader(stream);
    CsvCursor &row = reader.row();

    std::vector<std::vector<T> > list_of_lists;

    while(reader.read_row()) {
        std::vector<T> v;
        for(size_t i = 0; i < row.count; i++) {
            CsvCell &cell = row.cells[i];
            std::string s = cell.as_str();
            if constexpr (std::is_same<T, int>::value) {
                v.push_back(stoi(s));
            } else {
                v.push_back(s);
            }
        }

        list_of_lists.push_back(v);
    }

    using std::chrono::high_resolution_clock;
    using std::chrono::duration;

    // time only the counting, not the parsing
    auto t1 = high_resolution_clock::now();

    auto result = countUniqueList(list_of_lists);

    auto t2 = high_resolution_clock::now();

    /* Elapsed time in milliseconds as a double. */
    duration<double, std::milli> ms_double = t2 - t1;
    std::cout << ms_double.count() << "\n";
}

int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " <csv file>\n";
        return 1;
    }

    run<KEY_TYPE>(argv[1]);
    return 0;
}
@@ -0,0 +1,61 @@
import argparse
import time
import pickle
import csv
import pdb

def count_unique(simple_list):
    my_dict = {}
    for element in simple_list:
        if element in my_dict:
            my_dict[element] += 1
        else:
            my_dict[element] = 1  # first occurrence counts as 1
    return my_dict

def count_unique_for_loop(list_of_lists):
    tstart = time.time_ns()
    result = []
    for my_list in list_of_lists:
        result.append(count_unique(my_list))
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def count_unique_list_comprehension(list_of_lists):
    tstart = time.time_ns()
    result = [count_unique(my_list) for my_list in list_of_lists]
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def count_unique_map(list_of_lists):
    tstart = time.time_ns()
    result = list(map(count_unique, list_of_lists))
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def readlist_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def readlist_csv(filename):
    with open(filename, newline='') as f:
        list_of_lists = list(csv.reader(f))
    # print(len([len(li) for li in list_of_lists]), [len(li) for li in list_of_lists])
    # pdb.set_trace()
    return list_of_lists

def main():
    parser = argparse.ArgumentParser(description='Run baseline implementation of count unique.')
    parser.add_argument('--filename', type=str)
    args = parser.parse_args()

    randlist = readlist_csv(args.filename)

    count_unique_list_comprehension(randlist)

if __name__ == "__main__":
    main()
@@ -0,0 +1,61 @@
import argparse
import time
import pickle
import csv
import pdb
import gc

def count_unique(simple_list):
    my_dict = {}
    for element in simple_list:
        if element in my_dict:
            my_dict[element] += 1
        else:
            my_dict[element] = 0
    return len(my_dict)

def count_unique_for_loop(list_of_lists):
    tstart = time.time_ns()
    result = []
    for my_list in list_of_lists:
        result.append(count_unique(my_list))
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def count_unique_list_comprehension(list_of_lists):
    tstart = time.time_ns()
    result = [count_unique(my_list) for my_list in list_of_lists]
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def count_unique_map(list_of_lists):
    tstart = time.time_ns()
    result = list(map(count_unique, list_of_lists))
    duration = time.time_ns() - tstart
    print(duration / 1_000_000)
    return result

def readlist_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def readlist_csv(filename):
    with open(filename, newline='') as f:
        list_of_lists = list(csv.reader(f))
    # print(len([len(li) for li in list_of_lists]), [len(li) for li in list_of_lists])
    return list_of_lists

def main():
    parser = argparse.ArgumentParser(description='Run baseline implementation of count unique.')
    parser.add_argument('--filename', type=str)
    args = parser.parse_args()
    randlist = readlist_csv(args.filename)
    count_unique_list_comprehension(randlist)

if __name__ == "__main__":
    main()
    gc.collect()  # explicit final collection, kept under the guard so importing has no side effects
