diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md
index e5d2e85dbe90..c1fbfc050110 100644
--- a/docs/en/sql-reference/functions/string-functions.md
+++ b/docs/en/sql-reference/functions/string-functions.md
@@ -680,6 +680,42 @@ Result:
 └───────────┘
 ```
 
+## isValidASCII {#isvalidascii}
+
+Returns 1 if every byte of the input is in the ASCII range (0x00–0x7F), otherwise 0. An empty string is considered valid.
+
+**Syntax**
+
+```sql
+isValidASCII(input)
+```
+
+Alias: `isASCII`
+
+**Parameters**
+
+- `input` — Input string. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
+
+**Returned value**
+
+- `1` if every byte of `input` is in the ASCII range (0x00–0x7F), otherwise `0`. [UInt8](../data-types/int-uint.md).
+
+**Example**
+
+Query:
+
+```sql
+SELECT isValidASCII('\x7F\x00') AS valid, isValidASCII('\xc3\xb1') AS invalid;
+```
+
+Result:
+
+```response
+┌─valid─┬─invalid─┐
+│     1 │       0 │
+└───────┴─────────┘
+```
+
 ## isValidUTF8 {#isvalidutf8}
 
 Returns 1, if the set of bytes constitutes valid UTF-8-encoded text, otherwise 0.
diff --git a/src/Client/BuzzHouse/Generator/SQLFuncs.h b/src/Client/BuzzHouse/Generator/SQLFuncs.h
index ab2da61516ee..aefe9e400c37 100644
--- a/src/Client/BuzzHouse/Generator/SQLFuncs.h
+++ b/src/Client/BuzzHouse/Generator/SQLFuncs.h
@@ -884,6 +884,7 @@ const std::vector CHFuncs = {
     CHFunction(SQLFunc::FUNCtryBase32Decode, 0, 0, 1, 1),
     CHFunction(SQLFunc::FUNCbech32Encode, 0, 0, 1, 3),
     CHFunction(SQLFunc::FUNCbech32Decode, 0, 0, 1, 1),
+    CHFunction(SQLFunc::FUNCisValidASCII, 0, 0, 1, 1),
     /// Timeseries
     CHFunction(SQLFunc::FUNCseriesOutliersDetectTukey, 0, 0, 1, 1),
     CHFunction(SQLFunc::FUNCseriesPeriodDetectFFT, 0, 0, 1, 1),
diff --git a/src/Client/BuzzHouse/Proto/SQLGrammar.proto b/src/Client/BuzzHouse/Proto/SQLGrammar.proto
index 441d3bfb6faf..75cd7587177e 100644
--- a/src/Client/BuzzHouse/Proto/SQLGrammar.proto
+++ b/src/Client/BuzzHouse/Proto/SQLGrammar.proto
@@ -1570,6 +1570,7 @@ enum SQLFunc {
     FUNCYYYYMMDDToDate = 1565;
     FUNCYYYYMMDDToDate32 = 1566;
     FUNCzookeeperSessionUptime = 1567;
+    FUNCisValidASCII = 1568;
 }
 
 enum SQLTableFunc {
diff --git a/src/Functions/isValidASCII.cpp b/src/Functions/isValidASCII.cpp
new file mode 100644
index 000000000000..99d445f93f55
--- /dev/null
+++ b/src/Functions/isValidASCII.cpp
@@ -0,0 +1,299 @@
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#if defined(__AVX512F__) || defined(__AVX__) && defined(__AVX2__)
+#    include
+#elif defined(__SSE2__)
+#    include
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+#    include
+#    pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int NOT_IMPLEMENTED;
+}
+
+/// inspired by https://github.com/cyb70289/utf8/
+/*
+MIT License
+
+Copyright (c) 2019 Yibo Cai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+namespace
+{
+/*
+* Checks whether a data buffer contains only ASCII bytes (0x00..0x7F):
+* - OR successive chunks of the data into 2 SIMD accumulator registers.
+* - Merge the 2 accumulators into one and check whether any byte has its most
+*   significant bit (0x80) set; return 0 if this is the case, 1 otherwise.
+* - `data` and `len` are passed by reference and advanced past the consumed chunks.
+* Only whole chunks (2 registers wide) are consumed; the remaining tail is left
+* for the caller to check with isValidASCIIWithoutSIMD.
+*/
+#if defined(__AVX512F__)
+UInt8 isValidASCIIWithSIMD(const UInt8 *& data, UInt64 & len)
+{
+    if (len >= 128)
+    {
+        __m512i first_mask = _mm512_setzero_si512();
+        __m512i second_mask = _mm512_setzero_si512();
+
+        while (len >= 128)
+        {
+            __m512i first_input = _mm512_loadu_si512(reinterpret_cast(data));
+            __m512i second_input = _mm512_loadu_si512(reinterpret_cast(data + 64));
+
+            first_mask = _mm512_or_si512(first_mask, first_input);
+            second_mask = _mm512_or_si512(second_mask, second_input);
+
+            data += 128;
+            len -= 128;
+        }
+
+        first_mask = _mm512_or_si512(first_mask, second_mask);
+        /// _mm512_cmplt_epi8_mask requires AVX512BW, but this branch is guarded by AVX512F only,
+        /// so the high bit of every byte is tested with an AVX512F intrinsic instead.
+        if (_mm512_test_epi32_mask(first_mask, _mm512_set1_epi8(static_cast<char>(0x80))))
+        {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+#elif defined(__AVX__) && defined(__AVX2__)
+UInt8 isValidASCIIWithSIMD(const UInt8 *& data, UInt64 & len)
+{
+    if (len >= 64)
+    {
+        __m256i first_mask = _mm256_setzero_si256();
+        __m256i second_mask = _mm256_setzero_si256();
+
+        while (len >= 64)
+        {
+            __m256i first_input = _mm256_loadu_si256(reinterpret_cast(data));
+            __m256i second_input = _mm256_loadu_si256(reinterpret_cast(data + 32));
+
+            first_mask = _mm256_or_si256(first_mask, first_input);
+            second_mask = _mm256_or_si256(second_mask, second_input);
+
+            data += 64;
+            len -= 64;
+        }
+
+        first_mask = _mm256_or_si256(first_mask, second_mask);
+        if (_mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_set1_epi8(0), first_mask)))
+        {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+#elif defined(__SSE2__)
+UInt8 isValidASCIIWithSIMD(const UInt8 *& data, UInt64 & len)
+{
+    if (len >= 32)
+    {
+        __m128i first_mask = _mm_set1_epi8(0);
+        __m128i second_mask = _mm_set1_epi8(0);
+
+        while (len >= 32)
+        {
+            __m128i first_input = _mm_loadu_si128(reinterpret_cast(data));
+            __m128i second_input = _mm_loadu_si128(reinterpret_cast(data + 16));
+
+            first_mask = _mm_or_si128(first_mask, first_input);
+            second_mask = _mm_or_si128(second_mask, second_input);
+
+            data += 32;
+            len -= 32;
+        }
+
+        first_mask = _mm_or_si128(first_mask, second_mask);
+        if (_mm_movemask_epi8(_mm_cmplt_epi8(first_mask, _mm_set1_epi8(0))))
+        {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+UInt8 isValidASCIIWithSIMD(const UInt8 *& data, UInt64 & len)
+{
+    if (len >= 32)
+    {
+        uint8x16_t first_mask = vdupq_n_u8(0);
+        uint8x16_t second_mask = vdupq_n_u8(0);
+
+        while (len >= 32)
+        {
+            const uint8x16_t first_input = vld1q_u8(reinterpret_cast(data));
+            const uint8x16_t second_input = vld1q_u8(reinterpret_cast(data + 16));
+
+            first_mask = vorrq_u8(first_mask, first_input);
+            second_mask = vorrq_u8(second_mask, second_input);
+
+            data += 32;
+            len -= 32;
+        }
+
+        first_mask = vorrq_u8(first_mask, second_mask);
+        if (vmaxvq_u8(first_mask) >= 0x80)
+        {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+#endif
+
+/// Scalar check that none of the `len` bytes at `data` has the 0x80 bit set.
+/// Processes 16 bytes per iteration as two 64-bit words, then the remaining tail byte by byte.
+/// Returns 1 for an empty buffer.
+UInt8 isValidASCIIWithoutSIMD(const UInt8 * data, UInt64 len)
+{
+    UInt8 all_mask = 0;
+
+    if (len >= 16)
+    {
+        UInt64 first_mask = 0;
+        UInt64 second_mask = 0;
+
+        do
+        {
+            UInt64 first_input;
+            std::memcpy(&first_input, data, sizeof(first_input));
+            first_mask |= first_input;
+
+            UInt64 second_input;
+            std::memcpy(&second_input, data + 8, sizeof(second_input));
+            second_mask |= second_input;
+
+            data += 16;
+            len -= 16;
+        } while (len >= 16);
+
+        // if any byte has a set high bit, the result will be !(non zero) - 1 = 0 - 1 = 0xFF.
+        // if all bytes have a clear high bit, the result will be !(zero) - 1 = 1 - 1 = 0x00.
+        all_mask = !((first_mask | second_mask) & 0x8080808080808080ULL) - 1;
+    }
+
+    // iterate through remaining bytes.
+    std::for_each(data, data + len, [&](UInt8 byte) { all_mask |= byte; });
+
+    return all_mask < 0x80;
+}
+}
+
+/// Implementation callbacks for the isValidASCII function; only String and FixedString inputs are supported.
+struct ValidASCIIImpl
+{
+    /// Returns 1 if all `len` bytes at `data` are ASCII (0x00..0x7F), otherwise 0.
+    static UInt8 isValidASCII(const UInt8 * data, UInt64 len)
+    {
+#if defined(__AVX512F__) || defined(__AVX__) && defined(__AVX2__) || defined(__SSE2__) || defined(__aarch64__) && defined(__ARM_NEON)
+        // advances the data pointer and updates the length.
+        if (!isValidASCIIWithSIMD(data, len))
+        {
+            return 0;
+        }
+#endif
+        return isValidASCIIWithoutSIMD(data, len);
+    }
+
+    static constexpr bool is_fixed_to_constant = false;
+
+    static void
+    vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res, size_t input_rows_count)
+    {
+        size_t prev_offset = 0;
+        for (size_t i = 0; i < input_rows_count; ++i)
+        {
+            /// NOTE(review): `- 1` assumes each row is followed by one terminator byte in `data` - confirm against the ColumnString layout.
+            res[i] = isValidASCII(data.data() + prev_offset, offsets[i] - 1 - prev_offset);
+            prev_offset = offsets[i];
+        }
+    }
+
+    [[noreturn]] static void vectorFixedToConstant(const ColumnString::Chars &, size_t, UInt8 &, size_t)
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "vectorFixedToConstant not implemented for function isValidASCII");
+    }
+
+    static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray & res, size_t input_rows_count)
+    {
+        for (size_t i = 0; i < input_rows_count; ++i)
+            res[i] = isValidASCII(data.data() + i * n, n);
+    }
+
+    [[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray &, size_t)
+    {
+        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidASCII to Array argument");
+    }
+
+    [[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray &, size_t)
+    {
+        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidASCII to UUID argument");
+    }
+
+    [[noreturn]] static void ipv6(const ColumnIPv6::Container &, size_t &, PaddedPODArray &, size_t)
+    {
+        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidASCII to IPv6 argument");
+    }
+
+    [[noreturn]] static void ipv4(const ColumnIPv4::Container &, size_t &, PaddedPODArray &, size_t)
+    {
+        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidASCII to IPv4 argument");
+    }
+};
+
+struct NameIsValidASCII
+{
+    static constexpr auto name = "isValidASCII";
+};
+using FunctionValidASCII = FunctionStringOrArrayToT;
+
+REGISTER_FUNCTION(IsValidASCII)
+{
+    factory.registerFunction();
+    factory.registerAlias("isASCII", "isValidASCII", FunctionFactory::Case::Sensitive);
+}
+
+}
diff --git a/tests/performance/is_valid_ascii.xml b/tests/performance/is_valid_ascii.xml
new file mode 100644
index 000000000000..d357c8ada929
--- /dev/null
+++ b/tests/performance/is_valid_ascii.xml
@@ -0,0 +1,6 @@
+
+    CREATE TABLE long_text (`id` UInt64, `text` String) ENGINE = MergeTree ORDER BY id;
+    INSERT INTO long_text SELECT 1, randomPrintableASCII(500000000);
+    select isValidASCII(text) from long_text;
+    DROP TABLE IF EXISTS long_text;
+
\ No newline at end of file
diff --git a/tests/queries/0_stateless/03594_is_valid_ascii.reference b/tests/queries/0_stateless/03594_is_valid_ascii.reference
new file mode 100644
index 000000000000..7d58ac06835f
--- /dev/null
+++ b/tests/queries/0_stateless/03594_is_valid_ascii.reference
@@ -0,0 +1,42 @@
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+0
+0
+1
+1
+1
+0
+0
+0
+0
diff --git a/tests/queries/0_stateless/03594_is_valid_ascii.sql b/tests/queries/0_stateless/03594_is_valid_ascii.sql
new file mode 100644
index 000000000000..423dd62d970c
--- /dev/null
+++ b/tests/queries/0_stateless/03594_is_valid_ascii.sql
@@ -0,0 +1,63 @@
+-- constant input
+SELECT 1 = isValidASCII('');
+SELECT 1 = isValidASCII('some text');
+SELECT 0 = isValidASCII('какой-то текст');
+SELECT 1 = isValidASCII('\x00');
+SELECT 1 = isValidASCII('\x7F');
+SELECT 0 = isValidASCII('\x80');
+SELECT 1 = isValidASCII('\x00\x7F');
+SELECT 1 = isValidASCII('\x7F\x00');
+SELECT 0 = isValidASCII('\xC2\x80');
+SELECT 0 = isValidASCII('\x7F\x80');
+SELECT 0 = isValidASCII('\x70\x70\x80');
+SELECT 0 = isValidASCII('\x66\x80\x00');
+SELECT 1 = isValidASCII(repeat('\x7F\x00', 200));
+SELECT 0 = isValidASCII(repeat('\x7F\x80', 200));
+
+-- fixed size constant input
+SELECT 1 = isValidASCII(toFixedString('', 1));
+SELECT 1 = isValidASCII(toFixedString('some text', 9));
+SELECT 0 = isValidASCII(toFixedString('какой-то текст', 26));
+SELECT 1 = isValidASCII(toFixedString('\x00', 1));
+SELECT 1 = isValidASCII(toFixedString('\x7F', 1));
+SELECT 0 = isValidASCII(toFixedString('\x80', 1));
+SELECT 1 = isValidASCII(toFixedString('\x00\x7F', 2));
+SELECT 1 = isValidASCII(toFixedString('\x7F\x00', 2));
+SELECT 0 = isValidASCII(toFixedString('\xC2\x80', 2));
+SELECT 0 = isValidASCII(toFixedString('\x7F\x80', 2));
+SELECT 0 = isValidASCII(toFixedString('\x70\x70\x80', 3));
+SELECT 0 = isValidASCII(toFixedString('\x66\x80\x00', 3));
+SELECT 1 = isValidASCII(toFixedString(repeat('\x7F\x00', 200), 400));
+SELECT 0 = isValidASCII(toFixedString(repeat('\x7F\x80', 200), 400));
+
+-- alias
+SELECT 1 = isASCII('some text');
+SELECT 1 = isASCII(toFixedString('some text', 9));
+
+-- column input
+DROP TABLE IF EXISTS asciis;
+
+CREATE TABLE asciis (val String) ENGINE = SummingMergeTree ORDER BY val;
+
+INSERT INTO asciis (val) VALUES ('');
+INSERT INTO asciis (val) VALUES ('some text');
+INSERT INTO asciis (val) VALUES ('какой-то текст');
+INSERT INTO asciis (val) VALUES ('\x00');
+INSERT INTO asciis (val) VALUES ('\x7F');
+INSERT INTO asciis (val) VALUES ('\x80');
+INSERT INTO asciis (val) VALUES ('\x00\x7F');
+INSERT INTO asciis (val) VALUES ('\x7F\x00');
+INSERT INTO asciis (val) VALUES ('\xC2\x80');
+INSERT INTO asciis (val) VALUES ('\x7F\x80');
+INSERT INTO asciis (val) VALUES ('\x70\x70\x80');
+INSERT INTO asciis (val) VALUES ('\x66\x80\x00');
+
+SELECT isValidASCII(val) FROM asciis ORDER BY val;
+
+DROP TABLE asciis;
+
+-- unsupported arguments
+SELECT isValidASCII(['\x00', '\x7F']); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT isValidASCII(toUUID('00000000-0000-0000-0000-000000000000')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT isValidASCII(toIPv6('127.0.0.1')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT isValidASCII(toIPv4('127.0.0.1')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }