From 215f48acf5f77d6823975b2548fdd92b5579a67e Mon Sep 17 00:00:00 2001 From: leonhards Date: Fri, 22 Oct 2021 12:43:32 -0400 Subject: [PATCH 1/5] improving column renaming in tuplex --- tuplex/core/include/DataSet.h | 14 + tuplex/core/include/EmptyDataset.h | 1 + tuplex/core/include/ErrorDataSet.h | 1 + tuplex/core/src/DataSet.cc | 63 +- tuplex/python/include/PythonDataSet.h | 2 + tuplex/python/src/PythonDataSet.cc | 37 + tuplex/test/core/DataFrameOperations.cc | 11 + tuplex/utils/include/Utils.h | 32 + .../include/third_party/levenshtein-sse.h | 957 ++++++++++++++++++ 9 files changed, 1115 insertions(+), 3 deletions(-) create mode 100644 tuplex/utils/include/third_party/levenshtein-sse.h diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 15b52901a..c598ac1a3 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -165,6 +165,14 @@ namespace tuplex { */ virtual DataSet &renameColumn(const std::string &oldColumnName, const std::string &newColumnName); + /*! + * rename column based on position in dataframe. throws error if invalid index is supplied. + * @param index position, 0 <= index < #columns + * @param newColumnName new column name + * @return Dataset or Errordataset + */ + virtual DataSet &renameColumn(int index, const std::string& newColumnName); + /*! * add a new column to dataset, whose result is defined through the given udf * @param columnName @@ -209,6 +217,12 @@ namespace tuplex { */ Schema schema() const; + /*! + * How many columns dataset has (at least 1) + * @return number of columns + */ + size_t numColumns() const; + /*! * join dataset with other dataset, either based on (K, V), (K, W) layout or via column names(equijoin) * @param other diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 250de48d2..b3c1ed7af 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -43,6 +43,7 @@ namespace tuplex { virtual DataSet& selectColumns(const std::vector& columnNames) override { return *this; } virtual DataSet& selectColumns(const std::vector& columnIndices) override { return *this; } virtual DataSet& renameColumn(const std::string& oldColumnName, const std::string& newColumnName) override { return *this; } + virtual DataSet& renameColumn(int index, const std::string& newColumnName) override { return *this; } virtual DataSet& withColumn(const std::string& columnName, const UDF& udf) override { return *this; } virtual void tofile(FileFormat fmt, const URI& uri, diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h index 459d59370..2f46d8638 100644 --- a/tuplex/core/include/ErrorDataSet.h +++ b/tuplex/core/include/ErrorDataSet.h @@ -51,6 +51,7 @@ namespace tuplex { DataSet& selectColumns(const std::vector& columnNames) override { return *this; } DataSet& selectColumns(const std::vector& columnIndices) override { return *this; } DataSet& renameColumn(const std::string& oldColumnName, const std::string& newColumnName) override { return *this; } + DataSet& renameColumn(int, const std::string& newColumnName) override { return *this; } DataSet& withColumn(const std::string& columnName, const UDF& udf) override { return *this; } void tofile(FileFormat fmt, const URI& uri, diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index b5cc4b870..8cd43f08c 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -378,17 +378,65 @@ namespace tuplex { assert(_context); assert(_operator); + if("" == oldColumnName) { + return _context->makeError("Can't rename \"\" to name"); + } + + if(_columnNames.empty()) { + return _context->makeError("Dataset has no column names specified, try to use position based renameColumn function"); + } + // find old column in current columns auto it = std::find(_columnNames.begin(), _columnNames.end(), oldColumnName); - if(it == _columnNames.end()) - return _context->makeError("renameColumn: could not find column '" + oldColumnName + "' in dataset's columns"); + if(it == _columnNames.end()) { + // fuzzy match against existing columns + assert(_columnNames.size() >= 1); + auto closest_index = fuzzyMatch(oldColumnName, _columnNames); + assert(closest_index >= 0 && closest_index < _columnNames.size()); + auto closest_name = _columnNames[closest_index]; + return _context->makeError("renameColumn: could not find column '" + oldColumnName + "' in dataset's columns. Did you mean \"" + closest_name + "\""); + } // position? auto idx = it - _columnNames.begin(); + return renameColumn(idx, newColumnName); + } + + size_t DataSet::numColumns() const { + assert(schema().getRowType().isTupleType()); + return this->schema().getRowType().parameters().size(); + } + + DataSet & DataSet::renameColumn(int index, const std::string &newColumnName) { + using namespace std; + + if(isError()) + return *this; + + assert(_context); + assert(_operator); + + // renaming to empty string is not allowed + if("" == newColumnName) { + return _context->makeError("Can't rename column to \"\", as it is a reserved value"); + } + + auto num_columns = numColumns(); + if(index < 0) + throw std::runtime_error("index must be non-negative number"); + if(index >= num_columns) + throw std::runtime_error("Dataset contains only " + std::to_string(num_columns) + ", can't rename the " + + ordinal(index + 1) + " column"); // make copy vector columnNames(_columnNames.begin(), _columnNames.end()); - columnNames[idx] = newColumnName; + + // are column names empty? If so, fill in with blanks! + if(columnNames.empty()) { + columnNames = vector(num_columns, ""); + } + + columnNames[index] = newColumnName; // create dummy map operator // now it is a simple map operator @@ -409,6 +457,15 @@ namespace tuplex { return _context->makeError("job aborted (signal received)"); } + // emit warning if non-unique names anymore + // ==> only for non-empty strings + std::vector non_empty_names; + std::copy_if(columnNames.begin(), columnNames.end(), std::back_inserter(non_empty_names), [](const std::string& name) { return name != ""; }); + std::set unique_names(non_empty_names.begin(), non_empty_names.end()); + if(unique_names.size() != non_empty_names.size()) { + Logger::instance().defaultLogger().info("Found duplicate column names. Note that this can negatively impact UDFs and subsequent operators."); + } + return ds; } diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 1f4f561b6..44df71eeb 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -90,6 +90,8 @@ namespace tuplex { PythonDataSet renameColumn(const std::string& oldName, const std::string& newName); + PythonDataSet renameColumn(int index, const std::string& newName); + PythonDataSet ignore(const int64_t exceptionCode); PythonDataSet join(const PythonDataSet& right, const std::string& leftKeyColumn, const std::string& rightKeyColumn, diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 8e079ed67..3a2d3b608 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -559,6 +559,43 @@ namespace tuplex { return pds; } + PythonDataSet PythonDataSet::renameColumn(int index, const std::string &newName) { + assert(_dataset); + if (_dataset->isError()) { + PythonDataSet pds; + pds.wrap(this->_dataset); + return pds; + } + + PythonDataSet pds; + // GIL release & reacquire + assert(PyGILState_Check()); // make sure this thread holds the GIL! + python::unlockGIL(); + DataSet *ds = nullptr; + std::string err_message = ""; + try { + ds = &_dataset->renameColumn(index, newName); + } catch(const std::exception& e) { + err_message = e.what(); + Logger::instance().defaultLogger().error(err_message); + } catch(...) { + err_message = "unknown C++ exception occurred, please change type."; + Logger::instance().defaultLogger().error(err_message); + } + + python::lockGIL(); + + // nullptr? then error dataset! + if(!ds || !err_message.empty()) { + Logger::instance().flushAll(); + assert(_dataset->getContext()); + ds = &_dataset->getContext()->makeError(err_message); + } + pds.wrap(ds); + Logger::instance().flushAll(); + return pds; + } + PythonDataSet PythonDataSet::selectColumns(boost::python::list L) { // check dataset is valid & perform error check. assert(this->_dataset); diff --git a/tuplex/test/core/DataFrameOperations.cc b/tuplex/test/core/DataFrameOperations.cc index c9d48c28c..c3510a100 100644 --- a/tuplex/test/core/DataFrameOperations.cc +++ b/tuplex/test/core/DataFrameOperations.cc @@ -382,4 +382,15 @@ TEST_F(DataFrameTest, FolderOutput) { // load file from disk auto content = fileToString(URI("output/part0.csv")); EXPECT_EQ(content, "A,B\n11,20\n11,40\n"); +} + +TEST_F(DataFrameTest, RenameColumns) { + using namespace tuplex; + Context c(microTestOptions()); + + // rename test, position based: + auto& ds = c.parallelize({Row(1, 2), Row(3, 4)}); + auto cols_before_rename = ds.columns(); + + printf("test"); } \ No newline at end of file diff --git a/tuplex/utils/include/Utils.h b/tuplex/utils/include/Utils.h index 95ddf4022..4273a6653 100644 --- a/tuplex/utils/include/Utils.h +++ b/tuplex/utils/include/Utils.h @@ -43,6 +43,7 @@ namespace std { } #endif +#include "third_party/levenshtein-sse.h" // helper code to allow tuples in maps. #include @@ -299,6 +300,37 @@ namespace tuplex { return std::to_string(i) + "th"; } + /*! + * match needle against dictionary and find closest match based on Levenshtein distance + * @param needle string + * @param dictionary strings + * @return position in dictionary, or -1 in error case + */ + inline int fuzzyMatch(const std::string& needle, const std::vector& dictionary) { + using namespace std; + using namespace levenshteinSSE; + + if(dictionary.empty()) + return -1; + + if(dictionary.size() == 1) + return 0; + + int best_index = 0; + int lowest_score = numeric_limits::max(); + for(int i = 1; i < dictionary.size(); ++i) { + const auto& word = dictionary[i]; + auto score = levenshtein(needle, word); + + if(score < lowest_score) { + best_index = i; + lowest_score = score; + } + } + return best_index; + } + + template std::ostream &operator <<(std::ostream &os, const std::vector &v) { using namespace std; diff --git a/tuplex/utils/include/third_party/levenshtein-sse.h b/tuplex/utils/include/third_party/levenshtein-sse.h new file mode 100644 index 000000000..418b764d5 --- /dev/null +++ b/tuplex/utils/include/third_party/levenshtein-sse.h @@ -0,0 +1,957 @@ +// from https://github.com/addaleax/levenshtein-sse +/** + * Copyright (c) 2016 Anna Henningsen + * + * MIT License + */ + +#ifndef LSTSSE_LEVENSHTEIN_SSE_HPP +#define LSTSSE_LEVENSHTEIN_SSE_HPP + +#include +#include +#include +#include +#include +#ifdef __SSSE3__ +#include +#endif +#ifdef __SSE4_1__ +#include +#endif +#ifdef __AVX2__ +#include +#endif + +namespace levenshteinSSE { + +/** + * Public methods + */ + +/** + * Compute the Levenshtein distances of [a, aEnd) and [b, bEnd). + * Both types of iterators need to be bidirectional, e.g. + * fulfill the requirements of BidirectionalIterator, but may just be + * pointers. + * + * For pointers to types where SIMD instructions make sense, these are + * used when available. + */ + template + std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd); + +/** + * Compute the Levenshtein distances of a and b. + * Both containers need to support bidirectional start and end iterators + * (via std::begin() and std::end()). + * + * If both containers provide .data() and .size(), these are used to + * get pointers to the start and past-end elements. This holds true + * for std::string, std::vector, std::array and possibly more, but + * if you want to use this with custom container types, bear in mind that + * this also means that the available memory area needs to be contiguous. + */ + template + std::size_t levenshtein(const Container1& a, const Container2& b); + +/** + * Only implementation-specific stuff below + */ + +/** + * C++ STL allocator returning aligned memory with additional memory + * on both sides to safely allow garbage reads/writes + * + * based on https://stackoverflow.com/a/8545389/688904 + */ + template + class AlignmentAllocator; + +#ifdef __SSE2__ + template + class AlignmentAllocator { + public: + typedef T value_type; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + typedef T* pointer; + typedef const T* const_pointer; + + typedef T& reference; + typedef const T& const_reference; + + constexpr inline AlignmentAllocator () noexcept { } + + template + constexpr inline AlignmentAllocator (const AlignmentAllocator &) noexcept { } + + inline ~AlignmentAllocator () noexcept { } + + inline pointer address (reference r) { + return &r; + } + + inline const_pointer address (const_reference r) const { + return &r; + } + + inline pointer allocate (size_type n) { + // this allocator is special in that it leaves 4*N bytes before and after + // the allocated area for garbage reads/writes + return reinterpret_cast(reinterpret_cast(_mm_malloc(n * sizeof(value_type) + 8 * N, N)) + 4*N); + } + + inline void deallocate (pointer p, size_type) { + _mm_free(reinterpret_cast(p) - 4*N); + } + + inline void construct (pointer p, const value_type& value) { + new (p) value_type (value); + } + + inline void destroy (pointer p) { + p->~value_type (); + } + + constexpr inline size_type max_size () const noexcept { + return size_type (-1) / sizeof (value_type); + } + + template + struct rebind { + typedef AlignmentAllocator other; + }; + + constexpr bool operator!=(const AlignmentAllocator& other) const { + return !(*this == other); + } + + // Returns true if and only if storage allocated from *this + // can be deallocated from other, and vice versa. + // Always returns true for stateless allocators. + constexpr bool operator==(const AlignmentAllocator& other) const { + return other.usesMMAlloc; + } + + static constexpr bool usesMMAlloc = true; + }; +#endif + + template + class AlignmentAllocator : public std::allocator { + public: + static constexpr bool usesMMAlloc = false; + }; + +#ifdef __SSSE3__ + constexpr std::size_t alignment = 16; +#else + #warning "No SIMD extensions enabled" +constexpr std::size_t alignment = 1; +#endif + +/** + * Some notes on the algorithm used here: + * + * The standard Levenshtein metric is usually calculated using the + * Wagner–Fischer algorithm [1]. This essentially means calculating + * Levenshtein(a[:n], b[:m]) for all prefixes a[:n] and b[:m] of a and b, + * storing the results in a table (and possibly “forgetting” + * the previous rows to save memory). Normally, this is performed + * row by row, but this approach has the disadvantage that it is not + * easily converted to using SIMD instructions. + * + * The “diagonal” algorithm variants in this file calculate the minor + * diagonals one by one. There, the index `i` always corresponds to the + * table row in which we currently are and `j` always denotes the column. + * Note that the tables have a trivial i=0 row and j=0 column; The + * table entry [i,j] corresponds to the comparison a[i-1] == b[j-1]. + * A diagonal is defined by its index `k`, yielding the inner loop invariant + * k == i + j. + * + * The outer loop iterates over all diagonals from k = 0 to + * k = len(a)+len(b) (compare with the table in [1] to verify), meaning + * that the length of the diagonals may vary between iterations. + * The current diagonal and the previous diagonal are kept in memory, + * referred to as diag and diag2 respectively. + * + * In the inner loop, the rows are treated in descending order, + * i.e. `i` decreases from iteration to iteration, to avoid overwriting + * data needed in the next iteration. + * + * [1]: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm + */ + +/** + * Trivial implementation of one inner loop iteration. + * + * Note that i is passed as a reference; The SIMD version may decrease it + * by more than 1. + */ + template + struct LevenshteinIterationBase { + static inline void perform(const Iterator1& a, const Iterator2& b, + std::size_t& i, std::size_t j, std::size_t bLen, Vec1& diag, const Vec2& diag2) + { + std::uint32_t min = std::min(diag2[i], diag2[i-1]); + if (min < diag[i-1]) { + diag[i] = min + 1; + } + else { + diag[i] = diag[i-1] + (a[i-1] != b[j-1]); + } + --i; + } + }; + +/** + * Start SIMD-Land + * + * Currently, only SSSE3+ is supported, since we use + * _mm_shuffle_epi8 and _mm_alignr_epi8. + * + * If you are not familiar with SSE and/or the intrinsics, + * you can just believe me that they do about the same thing + * as LevenshteinIterationBase::perform. + */ + +#ifdef __SSSE3__ +#ifndef __SSE4_1__ + // straightforward _mm_min_epi32 polyfill using only SSE2 +inline __m128i _mm_min_epi32 (__m128i a, __m128i b) { + __m128i compare = _mm_cmpgt_epi32(a, b); + __m128i aIsSmaller = _mm_andnot_si128(a, compare); + __m128i bIsSmaller = _mm_and_si128 (b, compare); + return _mm_or_si128(aIsSmaller, bIsSmaller); +} +#endif // __SSE4_1__ +#endif // __SSSE3__ + +#ifdef __AVX2__ + // provide an AVX2 version of _mm256_alignr_epi32(a, b, 7) +// (i.e. move a one epi32 to the left and write the highest +// bytes of b to the lowest of a) +/*#ifndef __AVX512F__*/ +__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) { + const __m256i rotl1_256_epi32 = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6); + __m256i combined = _mm256_blend_epi32(a, b, 0x80); + return _mm256_permutevar8x32_epi32(combined, rotl1_256_epi32); +} +/* +#else +__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) { + return _mm256_alignr_epi32(a, b, 7); +} +#endif*/ +#endif + + template + struct LevenshteinIterationSIMD { +/* Decide which implementation is acceptable */ + static inline void performSIMD(const T* a, const T* b, + std::size_t& i, std::size_t j, std::size_t bLen, + std::uint32_t* diag, const std::uint32_t* diag2) + { + +#ifdef __AVX2__ + if (i >= 32 && bLen - j >= 32) { + performSSE_AVX2(a, b, i, j, bLen, diag, diag2); + return; + } +#endif + +#ifdef __SSSE3__ + if (i >= 16 && bLen - j >= 16) { + performSSE(a, b, i, j, bLen, diag, diag2); + return; + } +#endif + + LevenshteinIterationBase + ::perform(a, b, i, j, bLen, diag, diag2); + } + +#ifdef __SSSE3__ + static inline void performSSE(const T* a, const T* b, + std::size_t& i, std::size_t j, std::size_t bLen, + std::uint32_t* diag, const std::uint32_t* diag2) + { + const __m128i one128_epi32 = _mm_set1_epi32(1); + const __m128i reversedIdentity128_epi8 = _mm_setr_epi8( + 15, 14, 13, 12, + 11, 10, 9, 8, + 7, 6, 5, 4, + 3, 2, 1, 0 + ); + const __m128i reversedIdentity128_epi16 = _mm_setr_epi8( + 14, 15, 12, 13, + 10, 11, 8, 9, + 6, 7, 4, 5, + 2, 3, 0, 1 + ); + const __m128i blockwiseReversed128_epi8_4 = _mm_setr_epi8( + 3, 2, 1, 0, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12 + ); + const __m128i blockwiseReversed128_epi16_4 = _mm_setr_epi8( + 6, 7, 4, 5, + 2, 3, 0, 1, + 14, 15, 12, 13, + 10, 11, 8, 9 + ); + + __m128i substitutionCost32[4]; + std::size_t k; + + // We support 1, 2, and 4 byte objects for SSE comparison. + // We always process 16 entries at once, so we may need multiple fetches + // depending on object size. + if (sizeof(T) <= 2) { + __m128i substitutionCost16LX, substitutionCost16HX; + + if (sizeof(T) == 1) { + __m128i a_ = _mm_loadu_si128(reinterpret_cast(&a[i-16])); + __m128i b_ = _mm_loadu_si128(reinterpret_cast(&b[j-1])); + a_ = _mm_shuffle_epi8(a_, reversedIdentity128_epi8); + + // int substitutionCost = a[i-1] == b[j-1] ? -1 : 0; + + // diag/diag2 will contain the following entries from the diagonal: + // index [0]0 [0]1 [0]2 [0]3 [1]0 [1]1 [1]2 [1]3 ... [4]0 [4]1 [4]2 [4]3 + // diag+i -3 -2 -1 0 -7 -6 -5 -4 ... -19 -18 -17 -16 + + // substitutionCost8 contains the comparisons for the following indices in a and b: + // index 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // a+i -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 + // b+j -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 + // in substitutionCost8X we shuffle this so that it matches up with diag/diag2 + // (barring the offset induces by comparing i-1 against j-1) + __m128i substitutionCost8 = _mm_cmpeq_epi8(a_, b_); + __m128i substitutionCost8X = _mm_shuffle_epi8(substitutionCost8, blockwiseReversed128_epi8_4); + // unpack with self so that 00 -> 00 00 and ff -> ff ff + substitutionCost16LX = _mm_unpacklo_epi8(substitutionCost8X, substitutionCost8X); + substitutionCost16HX = _mm_unpackhi_epi8(substitutionCost8X, substitutionCost8X); + } else { + __m128i a0 = _mm_loadu_si128(reinterpret_cast(&a[i-8])); + __m128i a1 = _mm_loadu_si128(reinterpret_cast(&a[i-16])); + __m128i b0 = _mm_loadu_si128(reinterpret_cast(&b[j-1])); + __m128i b1 = _mm_loadu_si128(reinterpret_cast(&b[j+7])); + a0 = _mm_shuffle_epi8(a0, reversedIdentity128_epi16); + a1 = _mm_shuffle_epi8(a1, reversedIdentity128_epi16); + __m128i substitutionCost16L = _mm_cmpeq_epi16(a0, b0); + __m128i substitutionCost16H = _mm_cmpeq_epi16(a1, b1); + substitutionCost16LX = _mm_shuffle_epi8(substitutionCost16L, blockwiseReversed128_epi16_4); + substitutionCost16HX = _mm_shuffle_epi8(substitutionCost16H, blockwiseReversed128_epi16_4); + } + + substitutionCost32[0] = _mm_unpacklo_epi16(substitutionCost16LX, substitutionCost16LX); + substitutionCost32[1] = _mm_unpackhi_epi16(substitutionCost16LX, substitutionCost16LX); + substitutionCost32[2] = _mm_unpacklo_epi16(substitutionCost16HX, substitutionCost16HX); + substitutionCost32[3] = _mm_unpackhi_epi16(substitutionCost16HX, substitutionCost16HX); + } else { + assert(sizeof(T) == 4); + + for (k = 0; k < 4; ++k) { + __m128i a_ = _mm_loadu_si128(reinterpret_cast(&a[i-4-k*4])); + __m128i b_ = _mm_loadu_si128(reinterpret_cast(&b[j-1+k*4])); + b_ = _mm_shuffle_epi32(b_, 0x1b); // simple reverse + substitutionCost32[k] = _mm_cmpeq_epi32(a_, b_); + } + } + + __m128i diag_[5], diag2_[5]; + for (k = 0; k < 5; ++k) { + diag_ [k] = _mm_loadu_si128(reinterpret_cast(&diag [i-3-k*4])); + } + for (k = 0; k < 5; ++k) { + diag2_[k] = _mm_loadu_si128(reinterpret_cast(&diag2[i-3-k*4])); + } + + // There’s a ton of debug stuff down here. You can use it for reference + // or, of course, compile with -DLSTSSE_DEBUG if you’re feeling funny. +#ifdef LSTSSE_DEBUG + for (k = 0; k < 5; ++k) { + std::uint32_t diag0 = _mm_extract_epi32(diag_[k], 0); + std::uint32_t diag1 = _mm_extract_epi32(diag_[k], 1); + std::uint32_t diag2 = _mm_extract_epi32(diag_[k], 2); + std::uint32_t diag3 = _mm_extract_epi32(diag_[k], 3); + assert(diag0 == diag[i-k*4-3]); + assert(diag1 == diag[i-k*4-2]); + assert(diag2 == diag[i-k*4-1]); + assert(diag3 == diag[i-k*4-0]); + } + for (k = 0; k < 5; ++k) { + std::uint32_t diag20 = _mm_extract_epi32(diag2_[k], 0); + std::uint32_t diag21 = _mm_extract_epi32(diag2_[k], 1); + std::uint32_t diag22 = _mm_extract_epi32(diag2_[k], 2); + std::uint32_t diag23 = _mm_extract_epi32(diag2_[k], 3); + assert(diag20 == diag2[i-k*4-3]); + assert(diag21 == diag2[i-k*4-2]); + assert(diag22 == diag2[i-k*4-1]); + assert(diag23 == diag2[i-k*4-0]); + } + + for (k = 0; k < 4; ++k) { + int sc0 = _mm_extract_epi32(substitutionCost32[k], 0); + int sc1 = _mm_extract_epi32(substitutionCost32[k], 1); + int sc2 = _mm_extract_epi32(substitutionCost32[k], 2); + int sc3 = _mm_extract_epi32(substitutionCost32[k], 3); + assert(sc0 == ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0)); + assert(sc1 == ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0)); + assert(sc2 == ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0)); + assert(sc3 == ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0)); + } +#endif // LSTSSE_DEBUG + + // reminders: + // the arrays (diag, diag2, substitutionCost32) correspond to i: + // index [0]0 [0]1 [0]2 [0]3 [1]0 [1]1 [1]2 [1]3 ... [4]0 [4]1 [4]2 [4]3 + // i+ -3 -2 -1 0 -7 -6 -5 -4 ... -19 -18 -17 -16 + // so when we need to access ...[i-1], we need to align the entries + // in *reverse* order + + // diag[i] = min( + // diag2[i-1], + // diag2[i], + // diag[i-1] + substitutionCost + // ) + 1; + // + for (k = 0; k < 4; ++k) { + __m128i diag2_i_m1 = _mm_alignr_epi8(diag2_[k], diag2_[k+1], 12); + __m128i diag_i_m1 = _mm_alignr_epi8(diag_ [k], diag_ [k+1], 12); + + __m128i result3 = _mm_add_epi32(diag_i_m1, substitutionCost32[k]); + __m128i min = _mm_min_epi32(_mm_min_epi32(diag2_i_m1, diag2_[k]), result3); + min = _mm_add_epi32(min, one128_epi32); + +#ifdef LSTSSE_DEBUG + std::uint32_t diag_i_m10 = _mm_extract_epi32(diag_i_m1, 0); + std::uint32_t diag_i_m11 = _mm_extract_epi32(diag_i_m1, 1); + std::uint32_t diag_i_m12 = _mm_extract_epi32(diag_i_m1, 2); + std::uint32_t diag_i_m13 = _mm_extract_epi32(diag_i_m1, 3); + assert(diag_i_m10 == diag[i-k*4-4]); + assert(diag_i_m11 == diag[i-k*4-3]); + assert(diag_i_m12 == diag[i-k*4-2]); + assert(diag_i_m13 == diag[i-k*4-1]); + + std::uint32_t diag2_i_m10 = _mm_extract_epi32(diag2_i_m1, 0); + std::uint32_t diag2_i_m11 = _mm_extract_epi32(diag2_i_m1, 1); + std::uint32_t diag2_i_m12 = _mm_extract_epi32(diag2_i_m1, 2); + std::uint32_t diag2_i_m13 = _mm_extract_epi32(diag2_i_m1, 3); + assert(diag2_i_m10 == diag2[i-k*4-4]); + assert(diag2_i_m11 == diag2[i-k*4-3]); + assert(diag2_i_m12 == diag2[i-k*4-2]); + assert(diag2_i_m13 == diag2[i-k*4-1]); + + std::uint32_t result30 = _mm_extract_epi32(result3, 0); + std::uint32_t result31 = _mm_extract_epi32(result3, 1); + std::uint32_t result32 = _mm_extract_epi32(result3, 2); + std::uint32_t result33 = _mm_extract_epi32(result3, 3); + + assert(result30 == diag[i-1-k*4-3] + ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0)); + assert(result31 == diag[i-1-k*4-2] + ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0)); + assert(result32 == diag[i-1-k*4-1] + ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0)); + assert(result33 == diag[i-1-k*4-0] + ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0)); +#endif // LSTSSE_DEBUG + + _mm_storeu_si128(reinterpret_cast<__m128i*>(&diag[i-k*4-3]), min); + } + + // We just handled 16 entries at once. Yay! + i -= 16; + } +#endif // __SSSE3__ + + +#ifdef __AVX2__ + static inline void performSSE_AVX2(const T* a, const T* b, + std::size_t& i, std::size_t j, std::size_t bLen, + std::uint32_t* diag, const std::uint32_t* diag2) +{ + const __m256i one256_epi32 = _mm256_set1_epi32(1); + const __m256i reversedIdentity256_epi8 = _mm256_setr_epi8( + 15, 14, 13, 12, + 11, 10, 9, 8, + 7, 6, 5, 4, + 3, 2, 1, 0, + 15, 14, 13, 12, + 11, 10, 9, 8, + 7, 6, 5, 4, + 3, 2, 1, 0 + ); + + const __m256i blockwiseReversed256_epi8_8 = _mm256_setr_epi8( + 7, 6, 5, 4, + 3, 2, 1, 0, + 15, 14, 13, 12, + 11, 10, 9, 8, + 7, 6, 5, 4, + 3, 2, 1, 0, + 15, 14, 13, 12, + 11, 10, 9, 8 + ); + + const __m256i blockwiseReversed256_epi16_8 = _mm256_setr_epi8( + 14, 15, 12, 13, + 10, 11, 8, 9, + 6, 7, 4, 5, + 2, 3, 0, 1, + 14, 15, 12, 13, + 10, 11, 8, 9, + 6, 7, 4, 5, + 2, 3, 0, 1 + ); + + __m256i substitutionCost32[4]; + std::size_t k; + + if (sizeof(T) <= 2) { + if (sizeof(T) == 1) { + __m256i a_ = _mm256_loadu_si256(reinterpret_cast(&a[i-32])); + __m256i b_ = _mm256_loadu_si256(reinterpret_cast(&b[j-1])); + a_ = _mm256_shuffle_epi8(a_, reversedIdentity256_epi8); + a_ = _mm256_permute2x128_si256(a_, a_, 1); // swap hi/lo + + __m256i substitutionCost8 = _mm256_cmpeq_epi8(a_, b_); + __m256i substitutionCost8X = _mm256_shuffle_epi8(substitutionCost8, blockwiseReversed256_epi8_8); + __m128i sc8Xlo = _mm256_extracti128_si256(substitutionCost8X, 0); + __m128i sc8Xhi = _mm256_extracti128_si256(substitutionCost8X, 1); + substitutionCost32[0] = _mm256_cvtepi8_epi32(sc8Xlo); + substitutionCost32[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xlo, 8)); + substitutionCost32[2] = _mm256_cvtepi8_epi32(sc8Xhi); + substitutionCost32[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xhi, 8)); + } else { + __m256i a0 = _mm256_loadu_si256(reinterpret_cast(&a[i-16])); + __m256i a1 = _mm256_loadu_si256(reinterpret_cast(&a[i-32])); + __m256i b0 = _mm256_loadu_si256(reinterpret_cast(&b[j-1])); + __m256i b1 = _mm256_loadu_si256(reinterpret_cast(&b[j+15])); + a0 = _mm256_permute2x128_si256(a0, a0, 1); // swap hi/lo + a1 = _mm256_permute2x128_si256(a1, a1, 1); // swap hi/lo + b0 = _mm256_shuffle_epi8(b0, blockwiseReversed256_epi16_8); + b1 = _mm256_shuffle_epi8(b1, blockwiseReversed256_epi16_8); + __m256i substitutionCost16L = _mm256_cmpeq_epi16(a0, b0); + __m256i substitutionCost16H = _mm256_cmpeq_epi16(a1, b1); + __m128i sc16Llo = _mm256_extracti128_si256(substitutionCost16L, 0); + __m128i sc16Lhi = _mm256_extracti128_si256(substitutionCost16L, 1); + __m128i sc16Hlo = _mm256_extracti128_si256(substitutionCost16H, 0); + __m128i sc16Hhi = _mm256_extracti128_si256(substitutionCost16H, 1); + substitutionCost32[0] = _mm256_cvtepi16_epi32(sc16Llo); + substitutionCost32[1] = _mm256_cvtepi16_epi32(sc16Lhi); + substitutionCost32[2] = _mm256_cvtepi16_epi32(sc16Hlo); + substitutionCost32[3] = _mm256_cvtepi16_epi32(sc16Hhi); + } + } else { + assert(sizeof(T) == 4); + + for (k = 0; k < 4; ++k) { + __m256i a_ = _mm256_loadu_si256(reinterpret_cast(&a[i-8-k*8])); + __m256i b_ = _mm256_loadu_si256(reinterpret_cast(&b[j-1+k*8])); + b_ = _mm256_shuffle_epi32(b_, 0x1b); // simple reverse + b_ = _mm256_permute2x128_si256(b_, b_, 1); // swap hi/lo + substitutionCost32[k] = _mm256_cmpeq_epi32(a_, b_); + } + } + + __m256i diag_[5], diag2_[5]; + for (k = 0; k < 5; ++k) { + diag_ [k] = _mm256_loadu_si256(reinterpret_cast(&diag [i-7-k*8])); + } + for (k = 0; k < 5; ++k) { + diag2_[k] = _mm256_loadu_si256(reinterpret_cast(&diag2[i-7-k*8])); + } + +#ifdef LSTSSE_DEBUG + for (k = 0; k < 5; ++k) { + std::uint32_t diag0 = _mm256_extract_epi32(diag_[k], 0); + std::uint32_t diag1 = _mm256_extract_epi32(diag_[k], 1); + std::uint32_t diag2 = _mm256_extract_epi32(diag_[k], 2); + std::uint32_t diag3 = _mm256_extract_epi32(diag_[k], 3); + std::uint32_t diag4 = _mm256_extract_epi32(diag_[k], 4); + std::uint32_t diag5 = _mm256_extract_epi32(diag_[k], 5); + std::uint32_t diag6 = _mm256_extract_epi32(diag_[k], 6); + std::uint32_t diag7 = _mm256_extract_epi32(diag_[k], 7); + assert(diag0 == diag[i-k*8-7]); + assert(diag1 == diag[i-k*8-6]); + assert(diag2 == diag[i-k*8-5]); + assert(diag3 == diag[i-k*8-4]); + assert(diag4 == diag[i-k*8-3]); + assert(diag5 == diag[i-k*8-2]); + assert(diag6 == diag[i-k*8-1]); + assert(diag7 == diag[i-k*8-0]); + } + for (k = 0; k < 5; ++k) { + std::uint32_t diag20 = _mm256_extract_epi32(diag2_[k], 0); + std::uint32_t diag21 = _mm256_extract_epi32(diag2_[k], 1); + std::uint32_t diag22 = _mm256_extract_epi32(diag2_[k], 2); + std::uint32_t diag23 = _mm256_extract_epi32(diag2_[k], 3); + std::uint32_t diag24 = _mm256_extract_epi32(diag2_[k], 4); + std::uint32_t diag25 = _mm256_extract_epi32(diag2_[k], 5); + std::uint32_t diag26 = _mm256_extract_epi32(diag2_[k], 6); + std::uint32_t diag27 = _mm256_extract_epi32(diag2_[k], 7); + assert(diag20 == diag2[i-k*8-7]); + assert(diag21 == diag2[i-k*8-6]); + assert(diag22 == diag2[i-k*8-5]); + assert(diag23 == diag2[i-k*8-4]); + assert(diag24 == diag2[i-k*8-3]); + assert(diag25 == diag2[i-k*8-2]); + assert(diag26 == diag2[i-k*8-1]); + assert(diag27 == diag2[i-k*8-0]); + } + + for (k = 0; k < 4; ++k) { + int sc0 = _mm256_extract_epi32(substitutionCost32[k], 0); + int sc1 = _mm256_extract_epi32(substitutionCost32[k], 1); + int sc2 = _mm256_extract_epi32(substitutionCost32[k], 2); + int sc3 = _mm256_extract_epi32(substitutionCost32[k], 3); + int sc4 = _mm256_extract_epi32(substitutionCost32[k], 4); + int sc5 = _mm256_extract_epi32(substitutionCost32[k], 5); + int sc6 = _mm256_extract_epi32(substitutionCost32[k], 6); + int sc7 = _mm256_extract_epi32(substitutionCost32[k], 7); + assert(sc0 == ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0)); + assert(sc1 == ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0)); + assert(sc2 == ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0)); + assert(sc3 == ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0)); + assert(sc4 == ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0)); + assert(sc5 == ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0)); + assert(sc6 == ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0)); + assert(sc7 == ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0)); + } +#endif // LSTSSE_DEBUG + + for (k = 0; k < 4; ++k) { + __m256i diag2_i_m1 = _mm256_alignr_epi32_7(diag2_[k], diag2_[k+1]); + __m256i diag_i_m1 = _mm256_alignr_epi32_7(diag_ [k], diag_ [k+1]); + + __m256i result3 = _mm256_add_epi32(diag_i_m1, substitutionCost32[k]); + __m256i min = _mm256_min_epi32(_mm256_min_epi32(diag2_i_m1, diag2_[k]), result3); + min = _mm256_add_epi32(min, one256_epi32); + +#ifdef LSTSSE_DEBUG + std::uint32_t diag_i_m10 = _mm256_extract_epi32(diag_i_m1, 0); + std::uint32_t diag_i_m11 = _mm256_extract_epi32(diag_i_m1, 1); + std::uint32_t diag_i_m12 = _mm256_extract_epi32(diag_i_m1, 2); + std::uint32_t diag_i_m13 = _mm256_extract_epi32(diag_i_m1, 3); + std::uint32_t diag_i_m14 = _mm256_extract_epi32(diag_i_m1, 4); + std::uint32_t diag_i_m15 = _mm256_extract_epi32(diag_i_m1, 5); + std::uint32_t diag_i_m16 = _mm256_extract_epi32(diag_i_m1, 6); + std::uint32_t diag_i_m17 = _mm256_extract_epi32(diag_i_m1, 7); + assert(diag_i_m10 == diag[i-k*8-8]); + assert(diag_i_m11 == diag[i-k*8-7]); + assert(diag_i_m12 == diag[i-k*8-6]); + assert(diag_i_m13 == diag[i-k*8-5]); + assert(diag_i_m14 == diag[i-k*8-4]); + assert(diag_i_m15 == diag[i-k*8-3]); + assert(diag_i_m16 == diag[i-k*8-2]); + assert(diag_i_m17 == diag[i-k*8-1]); + + std::uint32_t diag2_i_m10 = _mm256_extract_epi32(diag2_i_m1, 0); + std::uint32_t diag2_i_m11 = _mm256_extract_epi32(diag2_i_m1, 1); + std::uint32_t diag2_i_m12 = _mm256_extract_epi32(diag2_i_m1, 2); + std::uint32_t diag2_i_m13 = _mm256_extract_epi32(diag2_i_m1, 3); + std::uint32_t diag2_i_m14 = _mm256_extract_epi32(diag2_i_m1, 4); + std::uint32_t diag2_i_m15 = _mm256_extract_epi32(diag2_i_m1, 5); + std::uint32_t diag2_i_m16 = _mm256_extract_epi32(diag2_i_m1, 6); + std::uint32_t diag2_i_m17 = _mm256_extract_epi32(diag2_i_m1, 7); + assert(diag2_i_m10 == diag2[i-k*8-8]); + assert(diag2_i_m11 == diag2[i-k*8-7]); + assert(diag2_i_m12 == diag2[i-k*8-6]); + assert(diag2_i_m13 == diag2[i-k*8-5]); + assert(diag2_i_m14 == diag2[i-k*8-4]); + assert(diag2_i_m15 == diag2[i-k*8-3]); + assert(diag2_i_m16 == diag2[i-k*8-2]); + assert(diag2_i_m17 == diag2[i-k*8-1]); + + std::uint32_t result30 = _mm256_extract_epi32(result3, 0); + std::uint32_t result31 = _mm256_extract_epi32(result3, 1); + std::uint32_t result32 = _mm256_extract_epi32(result3, 2); + std::uint32_t result33 = _mm256_extract_epi32(result3, 3); + std::uint32_t result34 = _mm256_extract_epi32(result3, 4); + std::uint32_t result35 = _mm256_extract_epi32(result3, 5); + std::uint32_t result36 = _mm256_extract_epi32(result3, 6); + std::uint32_t result37 = _mm256_extract_epi32(result3, 7); + + assert(result30 == diag[i-1-k*8-7] + ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0)); + assert(result31 == diag[i-1-k*8-6] + ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0)); + assert(result32 == diag[i-1-k*8-5] + ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0)); + assert(result33 == diag[i-1-k*8-4] + ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0)); + assert(result34 == diag[i-1-k*8-3] + ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0)); + assert(result35 == diag[i-1-k*8-2] + ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0)); + assert(result36 == diag[i-1-k*8-1] + ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0)); + assert(result37 == diag[i-1-k*8-0] + ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0)); +#endif // LSTSSE_DEBUG + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&diag[i-k*8-7]), min); + } + + // We just handled 32 entries at once. Yay! + i -= 32; +} +#endif // __AVX2__ + + }; + +/** + * Default: If we don’t know better, just use the trivial implementation. + * + * This is overloaded in multiple places and is the struct whose `perform` + * method will ultimately be called from the outer loop. + * + * Vec1 and Vec2 correspond to `diag` and `diag2`, respectively. + */ + template + struct LevenshteinIteration : LevenshteinIterationBase { + }; + +/** + * Use a wrapper to decay the `diag` and `diag2` entries into pointers. + */ + template + struct LevenshteinIterationSIMDWrap : private LevenshteinIterationSIMD { + static inline void perform(const T* a, const T* b, + std::size_t& i, std::size_t j, std::size_t bLen, + std::vector& diag, + const std::vector& diag2) { + return LevenshteinIterationSIMD::performSIMD(a, b, i, j, bLen, diag.data(), diag2.data()); + } + }; + +/** + * Use a wrapper to test whether we can use SSE instuctions. + * + * T needs to be a scalar of size 1, 2 or 4. + */ + template + struct LevenshteinIteration, std::vector, const T*, const T*> + : std::conditional::value && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4), + LevenshteinIterationSIMDWrap, + LevenshteinIterationBase, std::vector, const T*, const T*> + >::type + { }; + +/** + * Always decay pointers to const. + */ + template + struct LevenshteinIteration, std::vector, T*, T*> + : LevenshteinIteration, std::vector, const T*, const T*> + { }; + +/** + * Outer loop of the diagonal algorithm variant. + */ + template + T levenshteinDiagonal(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) { + const std::size_t aLen = aEnd - a; + const std::size_t bLen = bEnd - b; + + assert(0 < aLen); + assert(aLen <= bLen); + + typedef AlignmentAllocator Alloc; + std::vector diag (aLen + 1, T(0)); + std::vector diag2 (aLen + 1, T(0)); + + std::size_t i, j, k; + + k = 0; + for (k = 1; ; ++k) { + assert(k <= aLen + bLen); + + std::size_t startRow = k > bLen ? k - bLen : 1; + std::size_t endRow = k > aLen ? aLen : k - 1; + + assert(endRow >= startRow || k == 1); + + for (i = endRow; i >= startRow; ) { + assert(i < k); + j = k - i; + + assert(bLen >= j); + assert(aLen >= i); + + LevenshteinIteration, std::vector, Iterator1, Iterator2> + ::perform(a, b, i, j, bLen, diag, diag2); + } + + diag[0] = k; + + if (k <= aLen) { + diag[k] = k; + } + + if (k == aLen + bLen) { + assert(startRow == endRow); + return diag[startRow]; + } + + // switch buffers + std::swap(diag, diag2); + } + + assert(0); + } + +/** + * Outer loop of the row-based variant, used for non-random-access iterators. + * + * based on https://github.com/sindresorhus/leven + */ + template + T levenshteinRowBased(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) { + std::vector arr; + + std::size_t i = 0, j = 0; + T ret(0); + + for (Iterator1 it = a; it != aEnd; ++it) { + arr.push_back(++i); + } + + arr.shrink_to_fit(); + + for (; b != bEnd; ++b) { + T tmp = j++; + ret = j; + i = 0; + + for (Iterator1 it = a; it != aEnd; ++it, ++i) { + T tmp2 = *b == *it ? tmp : tmp + 1; + tmp = arr[i]; + ret = arr[i] = tmp > ret ? tmp2 > ret ? ret + 1 : tmp2 : tmp2 > tmp ? tmp + 1 : tmp2; + } + } + + return ret; + } + +/** + * Preable for edge cases and skipping common prefixes/suffixes, + * random access version. + */ + template + std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd, + std::random_access_iterator_tag, std::random_access_iterator_tag) { + if (aEnd - a > bEnd - b) { + return levenshtein(b, bEnd, a, aEnd); + } + + // skip common prefixes and suffixes + while (a < aEnd && a[0] == b[0]) + ++a, ++b; + + while (a < aEnd && aEnd[-1] == bEnd[-1]) + --aEnd, --bEnd; + + std::size_t aLen = aEnd - a; + std::size_t bLen = bEnd - b; + + if (aLen == 0) { + return bLen; + } + + if (aLen == 1) { + return bLen - (std::find(b, bEnd, *a) == bEnd ? 0 : 1); + } + + if (aLen + bLen <= std::numeric_limits::max()) + return levenshteinDiagonal(a, aEnd, b, bEnd); + + return levenshteinDiagonal(a, aEnd, b, bEnd); + } + +/** + * Preable for edge cases and skipping common prefixes/suffixes, + * non-random access version. + */ + template + std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd, + std::bidirectional_iterator_tag, std::bidirectional_iterator_tag) { + // skip common prefixes and suffixes + while (a != aEnd && b != bEnd && *a == *b) + ++a, ++b; + + while (a != aEnd && b != bEnd && *std::prev(aEnd) == *std::prev(bEnd)) + --aEnd, --bEnd; + + if (a == aEnd) { + return std::distance(b, bEnd); + } + + if (b == bEnd) { + return std::distance(a, aEnd); + } + + if (std::next(a) == aEnd) { + std::size_t ret = 0, found = 0; + for (; b != bEnd; ++b, ++ret) + if (*b == *a) + found = 1; + return ret - found; + } + + if (std::next(b) == bEnd) { + std::size_t ret = 0, found = 0; + for (; a != aEnd; ++a, ++ret) + if (*b == *a) + found = 1; + return ret - found; + } + + return levenshteinRowBased(a, aEnd, b, bEnd); + } + +// SFINAE checker for .data() and .size() + template + struct has_data_and_size { + private: + struct char2 { char _[2]; }; + template static auto test(U* a) -> decltype(a->data() + a->size(), char(0)); + template static auto test(const U* a) -> char2; + public: + static constexpr bool value = sizeof(test(static_cast(nullptr))) == 1; + }; + + template + struct LevenshteinContainer {}; + +/** + * Use .data(), .size() when available. + */ + template<> + struct LevenshteinContainer { + template + static inline std::size_t calc(const Container1& a, const Container2& b) { + return levenshtein(a.data(), a.data() + a.size(), b.data(), b.data() + b.size()); + } + }; + +/** + * Use std::begin(), std::end() otherwise. + */ + template<> + struct LevenshteinContainer { + template + static inline std::size_t calc(const Container1& a, const Container2& b) { + return levenshtein(std::begin(a), std::end(a), std::begin(b), std::end(b)); + } + }; + + template + std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) { + return levenshtein(a, aEnd, b, bEnd, + typename std::iterator_traits::iterator_category(), + typename std::iterator_traits::iterator_category()); + } + + template + std::size_t levenshtein(const Container1& a, const Container2& b) { + return LevenshteinContainer::value && + has_data_and_size::value>::calc(a, b); + } +} + +#endif From 580250a76103af245028531a395cea6d7ea5bef9 Mon Sep 17 00:00:00 2001 From: leonhards Date: Fri, 22 Oct 2021 13:07:40 -0400 Subject: [PATCH 2/5] adding testing for new rename syntax --- tuplex/core/src/DataSet.cc | 2 +- tuplex/python/tests/test_columns.py | 15 ++++++++++++++- tuplex/python/tuplex/dataset.py | 8 ++++---- tuplex/test/core/DataFrameOperations.cc | 16 +++++++++++++++- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 8cd43f08c..cb4989d77 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -394,7 +394,7 @@ namespace tuplex { auto closest_index = fuzzyMatch(oldColumnName, _columnNames); assert(closest_index >= 0 && closest_index < _columnNames.size()); auto closest_name = _columnNames[closest_index]; - return _context->makeError("renameColumn: could not find column '" + oldColumnName + "' in dataset's columns. Did you mean \"" + closest_name + "\""); + return _context->makeError("renameColumn: could not find column '" + oldColumnName + "' in dataset's columns. Did you mean \"" + closest_name + "\"?"); } // position? diff --git a/tuplex/python/tests/test_columns.py b/tuplex/python/tests/test_columns.py index 460f38f70..b76a5f10e 100644 --- a/tuplex/python/tests/test_columns.py +++ b/tuplex/python/tests/test_columns.py @@ -82,4 +82,17 @@ def test_withColumnUnnamed(self): .withColumn('newcol', lambda a, b: (a + b)/10) \ .collect() - self.assertEqual(res, [(1, 2, 3/10), (3, 2, 5/10)]) \ No newline at end of file + self.assertEqual(res, [(1, 2, 3/10), (3, 2, 5/10)]) + + def test_renameColumn(self): + ds = self.c.parallelize([(1, 2), (3, 2)]) + + ds2 = ds.renameColumn(0, 'first') + self.assertEqual(ds2.columns[0], 'first') + ds3 = ds2.renameColumn(1, 'second') + self.assertEqual(ds3.columns[1], 'second') + + ds4 = ds3.renameColumn("first", '1') + self.assertEqual(ds4.columns, ['1', 'second']) + ds5 = ds4.renameColumn('second', '2') + self.assertEqual(ds5.columns, ['1', '2']) \ No newline at end of file diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index a60488d58..dfa9c9d81 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -261,10 +261,10 @@ def selectColumns(self, columns): ds._dataSet = self._dataSet.selectColumns(columns) return ds - def renameColumn(self, oldColumnName, newColumnName): + def renameColumn(self, oldColumnNameOrPosition, newColumnName): """ rename a column in dataset Args: - oldColumnName: str, old column name. Must exist. + oldColumnName: str|int, old column name or (0-indexed) position. newColumnName: str, new column name Returns: @@ -273,11 +273,11 @@ def renameColumn(self, oldColumnName, newColumnName): assert self._dataSet is not None, 'internal API error, datasets must be created via context object' - assert isinstance(oldColumnName, str), 'oldColumnName must be a string' + assert isinstance(oldColumnNameOrPosition, (str, int)), 'oldColumnNameOrPosition must be a string or integer' assert isinstance(newColumnName, str), 'newColumnName must be a string' ds = DataSet() - ds._dataSet = self._dataSet.renameColumn(oldColumnName, newColumnName) + ds._dataSet = self._dataSet.renameColumn(oldColumnNameOrPosition, newColumnName) return ds def ignore(self, eclass): diff --git a/tuplex/test/core/DataFrameOperations.cc b/tuplex/test/core/DataFrameOperations.cc index c3510a100..8d018da45 100644 --- a/tuplex/test/core/DataFrameOperations.cc +++ b/tuplex/test/core/DataFrameOperations.cc @@ -392,5 +392,19 @@ TEST_F(DataFrameTest, RenameColumns) { auto& ds = c.parallelize({Row(1, 2), Row(3, 4)}); auto cols_before_rename = ds.columns(); - printf("test"); + EXPECT_EQ(cols_before_rename.size(), 0); // no columns defined + + // now rename columns + auto& ds2 = ds.renameColumn(0, "first"); + ASSERT_EQ(ds2.columns().size(), 2); + EXPECT_EQ(ds2.columns()[0], "first"); + EXPECT_EQ(ds2.columns()[1], ""); + auto& ds3 = ds2.renameColumn(1, "second"); + ASSERT_EQ(ds3.columns().size(), 2); + EXPECT_EQ(ds3.columns()[0], "first"); + EXPECT_EQ(ds3.columns()[1], "second"); + + // check now fuzzy matching + auto& err_ds = ds3.renameColumn("secund", "+1"); + EXPECT_TRUE(err_ds.isError()); } \ No newline at end of file From 794efec209e421541f4ea4099c133556f11a13cf Mon Sep 17 00:00:00 2001 From: leonhards Date: Fri, 22 Oct 2021 13:13:58 -0400 Subject: [PATCH 3/5] inline fix --- tuplex/utils/include/third_party/levenshtein-sse.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tuplex/utils/include/third_party/levenshtein-sse.h b/tuplex/utils/include/third_party/levenshtein-sse.h index 418b764d5..1164a1aee 100644 --- a/tuplex/utils/include/third_party/levenshtein-sse.h +++ b/tuplex/utils/include/third_party/levenshtein-sse.h @@ -234,14 +234,14 @@ inline __m128i _mm_min_epi32 (__m128i a, __m128i b) { // (i.e. move a one epi32 to the left and write the highest // bytes of b to the lowest of a) /*#ifndef __AVX512F__*/ -__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) { +inline __m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) { const __m256i rotl1_256_epi32 = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6); __m256i combined = _mm256_blend_epi32(a, b, 0x80); return _mm256_permutevar8x32_epi32(combined, rotl1_256_epi32); } /* #else -__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) { +inline __m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) { return _mm256_alignr_epi32(a, b, 7); } #endif*/ From 8a4111ac636f134505e0bbb198b2e53274182d30 Mon Sep 17 00:00:00 2001 From: leonhards Date: Fri, 22 Oct 2021 13:43:00 -0400 Subject: [PATCH 4/5] fixing python bindings to have unique dispatch --- tuplex/python/include/PythonDataSet.h | 2 +- tuplex/python/src/PythonBindings.cc | 1 + tuplex/python/src/PythonDataSet.cc | 2 +- tuplex/python/tuplex/dataset.py | 13 +++++++++---- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 44df71eeb..6679aea64 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -90,7 +90,7 @@ namespace tuplex { PythonDataSet renameColumn(const std::string& oldName, const std::string& newName); - PythonDataSet renameColumn(int index, const std::string& newName); + PythonDataSet renameColumnByPosition(int index, const std::string& newName); PythonDataSet ignore(const int64_t exceptionCode); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index a5c436c58..e5b4841c3 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -49,6 +49,7 @@ PYMODULE { .def("withColumn", &tuplex::PythonDataSet::withColumn) .def("selectColumns", &tuplex::PythonDataSet::selectColumns) .def("renameColumn", &tuplex::PythonDataSet::renameColumn) + .def("renameColumnByPosition", &tuplex::PythonDataSet::renameColumnByPosition) .def("join", &tuplex::PythonDataSet::join) .def("leftJoin", &tuplex::PythonDataSet::leftJoin) .def("columns", &tuplex::PythonDataSet::columns) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 3a2d3b608..7554d4cf5 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -559,7 +559,7 @@ namespace tuplex { return pds; } - PythonDataSet PythonDataSet::renameColumn(int index, const std::string &newName) { + PythonDataSet PythonDataSet::renameColumnByPosition(int index, const std::string &newName) { assert(_dataset); if (_dataset->isError()) { PythonDataSet pds; diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index dfa9c9d81..a2b8c0b33 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -261,10 +261,10 @@ def selectColumns(self, columns): ds._dataSet = self._dataSet.selectColumns(columns) return ds - def renameColumn(self, oldColumnNameOrPosition, newColumnName): + def renameColumn(self, key, newColumnName): """ rename a column in dataset Args: - oldColumnName: str|int, old column name or (0-indexed) position. + key: str|int, old column name or (0-indexed) position. newColumnName: str, new column name Returns: @@ -273,11 +273,16 @@ def renameColumn(self, oldColumnNameOrPosition, newColumnName): assert self._dataSet is not None, 'internal API error, datasets must be created via context object' - assert isinstance(oldColumnNameOrPosition, (str, int)), 'oldColumnNameOrPosition must be a string or integer' + assert isinstance(key, (str, int)), 'key must be a string or integer' assert isinstance(newColumnName, str), 'newColumnName must be a string' ds = DataSet() - ds._dataSet = self._dataSet.renameColumn(oldColumnNameOrPosition, newColumnName) + if isinstance(key, str): + ds._dataSet = self._dataSet.renameColumn(key, newColumnName) + elif isinstance(key, int): + ds._dataSet = self._dataSet.renameColumnByPosition(key, newColumnName) + else: + raise TypeError('key must be int or str') return ds def ignore(self, eclass): From 5070a1119240adfa6a0c52cb5ce020c0aea52c47 Mon Sep 17 00:00:00 2001 From: leonhards Date: Fri, 22 Oct 2021 15:19:51 -0400 Subject: [PATCH 5/5] allow "" again --- tuplex/core/src/DataSet.cc | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index cb4989d77..f9bc9d8e3 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -378,10 +378,6 @@ namespace tuplex { assert(_context); assert(_operator); - if("" == oldColumnName) { - return _context->makeError("Can't rename \"\" to name"); - } - if(_columnNames.empty()) { return _context->makeError("Dataset has no column names specified, try to use position based renameColumn function"); } @@ -407,7 +403,7 @@ namespace tuplex { return this->schema().getRowType().parameters().size(); } - DataSet & DataSet::renameColumn(int index, const std::string &newColumnName) { + DataSet& DataSet::renameColumn(int index, const std::string &newColumnName) { using namespace std; if(isError()) @@ -416,16 +412,11 @@ namespace tuplex { assert(_context); assert(_operator); - // renaming to empty string is not allowed - if("" == newColumnName) { - return _context->makeError("Can't rename column to \"\", as it is a reserved value"); - } - auto num_columns = numColumns(); if(index < 0) - throw std::runtime_error("index must be non-negative number"); + return _context->makeError("index must be non-negative number"); if(index >= num_columns) - throw std::runtime_error("Dataset contains only " + std::to_string(num_columns) + ", can't rename the " + + return _context->makeError("Dataset contains only " + std::to_string(num_columns) + ", can't rename the " + ordinal(index + 1) + " column"); // make copy