zlib-ng · Dead2 · Nov 20, 2024 · Sep 25, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -981,8 +981,10 @@ if(WITH_OPTIM)
                 add_definitions(-DX86_AVX512)
                 list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
                 add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
-                list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+                list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
+                add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
                 list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
+                list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
                 set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
             else()
                 set(WITH_AVX512 OFF)

diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
@@ -8,8 +8,8 @@ SFLAGS=
 INCLUDES=
 SUFFIX=
 
-AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
-AVX512VNNIFLAG=-mavx512vnni
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
+AVX512VNNIFLAG=-mavx512vnni -mbmi2
 AVX2FLAG=-mavx2
 SSE2FLAG=-msse2
 SSSE3FLAG=-mssse3
@@ -31,6 +31,7 @@ all: \
 	adler32_sse42.o adler32_sse42.lo \
 	adler32_ssse3.o adler32_ssse3.lo \
 	chunkset_avx2.o chunkset_avx2.lo \
+	chunkset_avx512.o chunkset_avx512.lo \
 	chunkset_sse2.o chunkset_sse2.lo \
 	chunkset_ssse3.o chunkset_ssse3.lo \
 	compare256_avx2.o compare256_avx2.lo \
@@ -52,6 +53,12 @@ chunkset_avx2.o:
 chunkset_avx2.lo:
 	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
 
+chunkset_avx512.o:
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_avx512.lo:
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
 chunkset_sse2.o:
 	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
 

diff --git a/arch/x86/avx2_tables.h b/arch/x86/avx2_tables.h
@@ -0,0 +1,44 @@
+#ifndef AVX2_TABLES_H_
+#define AVX2_TABLES_H_
+
+#include "../generic/chunk_permute_table.h"
+
+/* Direct-lookup table indexed by (dist - 3), since dist can never be 0 - 2. Each entry pairs an offset into the
+ * permute table with a remainder value; the "don't care" slots exist only so the index needs no further mapping */
+static const lut_rem_pair perm_idx_lut[29] = {
+    { 0, 2},                /* 3 */
+    { 0, 0},                /* don't care */
+    { 1 * 32, 2},           /* 5 */
+    { 2 * 32, 2},           /* 6 */
+    { 3 * 32, 4},           /* 7 */
+    { 0 * 32, 0},           /* don't care */
+    { 4 * 32, 5},           /* 9 */
+    { 5 * 32, 22},          /* 10 */
+    { 6 * 32, 21},          /* 11 */
+    { 7 * 32, 20},          /* 12 */
+    { 8 * 32, 6},           /* 13 */
+    { 9 * 32, 4},           /* 14 */
+    {10 * 32, 2},           /* 15 */
+    { 0 * 32, 0},           /* don't care */
+    {11 * 32, 15},          /* 17 */
+    {11 * 32 + 16, 14},     /* 18 */
+    {11 * 32 + 16 * 2, 13}, /* 19 */
+    {11 * 32 + 16 * 3, 12}, /* 20 */
+    {11 * 32 + 16 * 4, 11}, /* 21 */
+    {11 * 32 + 16 * 5, 10}, /* 22 */
+    {11 * 32 + 16 * 6,  9}, /* 23 */
+    {11 * 32 + 16 * 7,  8}, /* 24 */
+    {11 * 32 + 16 * 8,  7}, /* 25 */
+    {11 * 32 + 16 * 9,  6}, /* 26 */
+    {11 * 32 + 16 * 10, 5}, /* 27 */
+    {11 * 32 + 16 * 11, 4}, /* 28 */
+    {11 * 32 + 16 * 12, 3}, /* 29 */
+    {11 * 32 + 16 * 13, 2}, /* 30 */
+    {11 * 32 + 16 * 14, 1}  /* 31 */
+};
+
+static const uint16_t half_rem_vals[13] = { /* indexed by dist - 3; each value equals 16 % dist for dist 3..15 */
+    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
+};
+
+#endif
diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c
@@ -4,8 +4,8 @@
 #include "zbuild.h"
 
 #ifdef X86_AVX2
+#include "avx2_tables.h"
 #include <immintrin.h>
-#include "../generic/chunk_permute_table.h"
 #include "x86_intrins.h"
 
 typedef __m256i chunk_t;
@@ -19,44 +19,6 @@ typedef __m128i halfchunk_t;
 #define HAVE_CHUNK_MAG
 #define HAVE_HALF_CHUNK
 
-/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
- * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
-static const lut_rem_pair perm_idx_lut[29] = {
-    { 0, 2},                /* 3 */
-    { 0, 0},                /* don't care */
-    { 1 * 32, 2},           /* 5 */
-    { 2 * 32, 2},           /* 6 */
-    { 3 * 32, 4},           /* 7 */
-    { 0 * 32, 0},           /* don't care */
-    { 4 * 32, 5},           /* 9 */
-    { 5 * 32, 22},          /* 10 */
-    { 6 * 32, 21},          /* 11 */
-    { 7 * 32, 20},          /* 12 */
-    { 8 * 32, 6},           /* 13 */
-    { 9 * 32, 4},           /* 14 */
-    {10 * 32, 2},           /* 15 */
-    { 0 * 32, 0},           /* don't care */
-    {11 * 32, 15},          /* 17 */
-    {11 * 32 + 16, 14},     /* 18 */
-    {11 * 32 + 16 * 2, 13}, /* 19 */
-    {11 * 32 + 16 * 3, 12}, /* 20 */
-    {11 * 32 + 16 * 4, 11}, /* 21 */
-    {11 * 32 + 16 * 5, 10}, /* 22 */
-    {11 * 32 + 16 * 6,  9}, /* 23 */
-    {11 * 32 + 16 * 7,  8}, /* 24 */
-    {11 * 32 + 16 * 8,  7}, /* 25 */
-    {11 * 32 + 16 * 9,  6}, /* 26 */
-    {11 * 32 + 16 * 10, 5}, /* 27 */
-    {11 * 32 + 16 * 11, 4}, /* 28 */
-    {11 * 32 + 16 * 12, 3}, /* 29 */
-    {11 * 32 + 16 * 13, 2}, /* 30 */
-    {11 * 32 + 16 * 14, 1}  /* 31 */
-};
-
-static const uint16_t half_rem_vals[13] = {
-    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
-};
-
 static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
     int16_t tmp;
     memcpy(&tmp, from, sizeof(tmp));

diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c
@@ -0,0 +1,189 @@
+/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+
+#ifdef X86_AVX512
+
+#include "avx2_tables.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+typedef __mmask32 mask_t;
+typedef __mmask16 halfmask_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+#define HAVE_MASKED_READWRITE
+#define HAVE_CHUNKCOPY
+#define HAVE_HALFCHUNKCOPY
+
+static inline halfmask_t gen_half_mask(unsigned len) { /* BZHI keeps only the low `len` bits of 0xFFFF */
+    return (halfmask_t)_bzhi_u32(0xFFFFu, len);
+}
+
+static inline mask_t gen_mask(unsigned len) { /* BZHI keeps only the low `len` bits of 0xFFFFFFFF */
+    return (mask_t)_bzhi_u32(0xFFFFFFFFu, len);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { /* broadcast the 2 bytes at `from` */
+    int16_t pattern; /* filled via memcpy to avoid a potentially misaligned int16_t load */
+    memcpy(&pattern, from, sizeof(pattern));
+    *chunk = _mm256_set1_epi16(pattern);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { /* broadcast the 4 bytes at `from` */
+    int32_t pattern; /* filled via memcpy to avoid a potentially misaligned int32_t load */
+    memcpy(&pattern, from, sizeof(pattern));
+    *chunk = _mm256_set1_epi32(pattern);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { /* broadcast the 8 bytes at `from` */
+    int64_t pattern; /* filled via memcpy to avoid a potentially misaligned int64_t load */
+    memcpy(&pattern, from, sizeof(pattern));
+    *chunk = _mm256_set1_epi64x(pattern);
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { /* same 16 bytes in both 128-bit halves */
+    *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { /* unaligned 32-byte load from s */
+    *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) { /* unaligned 32-byte store to out */
+    _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline void storechunk_mask(uint8_t *out, mask_t mask, chunk_t *chunk) { /* writes only bytes whose mask bit is set */
+    _mm256_mask_storeu_epi8(out, mask, *chunk);
+}
+
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+
+    /* Bytes left over once len is split into whole chunks; written first via a masked store */
+    const unsigned head = len % sizeof(chunk_t);
+    chunk_t vec;
+
+    /* This is only ever called if dist >= a chunk, so the load itself can stay unmasked;
+     * only the low `head` bytes of what was loaded are actually stored */
+    loadchunk(from, &vec);
+    _mm256_mask_storeu_epi8(out, gen_mask(head), vec);
+    out += head;
+    from += head;
+
+    /* The remaining length is an exact multiple of the chunk size (possibly zero) */
+    for (len -= head; len > 0; len -= sizeof(chunk_t)) {
+        loadchunk(from, &vec);
+        storechunk(out, &vec);
+        out += sizeof(chunk_t);
+        from += sizeof(chunk_t);
+    }
+
+    return out;
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; /* table is biased by 3: entry 0 is dist == 3 */
+    __m256i ret_vec;
+    *chunk_rem = lut_rem.remval; /* remainder is handed to the caller straight from the table */
+
+    /* Same scheme as the AVX2 implementation (see it for the detailed comments), except that masked
+     * loads are used so bytes beyond `dist` are never read, avoiding an out of bounds read on the heap */
+
+    if (dist < 16) { /* the dist pattern bytes fit in one 128-bit register */
+        const __m256i permute_xform = /* adds 16 to every shuffle index of the upper 128-bit lane */
+            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+        __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+        halfmask_t load_mask = gen_half_mask(dist); /* read exactly dist bytes, zero the rest */
+        __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
+        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); /* same 16 bytes in both lanes */
+        ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+    }  else {
+        halfmask_t load_mask = gen_half_mask(dist - 16); /* read only dist - 16 bytes starting at buf + 16 */
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
+        __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+        halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE); /* lanes whose index is <= 15 */
+        __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1); /* those lanes shuffle ret_vec0, the rest keep ret_vec1 */
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1); /* low lane = ret_vec0, high lane = latter_half */
+    }
+
+    return ret_vec;
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) { /* unaligned 16-byte load from s */
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { /* unaligned 16-byte store to out */
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) { /* widen to a full chunk, upper 128 bits zeroed */
+    /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+     * unlikely to be actually written or read from */
+    return _mm256_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    const uint32_t slot = dist - 3; /* both lookup tables start at dist == 3 */
+    /* Masked load: only the `dist` pattern bytes are read, the remaining lanes are zero-filled */
+    const __m128i pattern = _mm_maskz_loadu_epi8(gen_half_mask(dist), buf);
+    *chunk_rem = half_rem_vals[slot];
+
+    /* Shuffle control for this distance comes from the shared permute table */
+    const __m128i control = _mm_load_si128((__m128i*)(permute_table + perm_idx_lut[slot].idx));
+
+    /* Rearrange the loaded bytes as dictated by that table entry */
+    return _mm_shuffle_epi8(pattern, control);
+}
+
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+
+    /* Bytes left over once len is split into whole half chunks; written first via a masked store */
+    const unsigned head = len % sizeof(halfchunk_t);
+    halfchunk_t vec;
+
+    /* The load is unmasked and always reads a full half chunk from `from`; per the original note the
+     * caller only gets here when dist is large enough for that. Only the low `head` bytes are stored */
+    loadhalfchunk(from, &vec);
+    _mm_mask_storeu_epi8(out, gen_half_mask(head), vec);
+    out += head;
+    from += head;
+
+    /* The remaining length is an exact multiple of the half chunk size (possibly zero) */
+    for (len -= head; len > 0; len -= sizeof(halfchunk_t)) {
+        loadhalfchunk(from, &vec);
+        storehalfchunk(out, &vec);
+        out += sizeof(halfchunk_t);
+        from += sizeof(halfchunk_t);
+    }
+
+    return out;
+}
+
+#define CHUNKSIZE        chunksize_avx512
+#define CHUNKUNROLL      chunkunroll_avx512
+#define CHUNKMEMSET      chunkmemset_avx512
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_avx512
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/arch/x86/x86_features.c b/arch/x86/x86_features.c
@@ -97,6 +97,8 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
             features->has_avx2 = ebx & 0x20;
         }
 
+        features->has_bmi2 = ebx & 0x100; // CPUID leaf 7 EBX bit 8 is BMI2; bit 3 (0x8) is BMI1
+
         // check AVX512 bits if the OS supports saving ZMM registers
         if (features->has_os_save_zmm) {
             features->has_avx512f = ebx & 0x00010000;
@@ -108,7 +110,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
                 features->has_avx512vl = ebx & 0x80000000;
             }
             features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
-              && features->has_avx512vl;
+              && features->has_avx512vl && features->has_bmi2;
             features->has_avx512vnni = ecx & 0x800;
         }
     }

diff --git a/arch/x86/x86_features.h b/arch/x86/x86_features.h
@@ -14,6 +14,7 @@ struct x86_cpu_features {
     int has_avx512vl;
     int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
     int has_avx512vnni;
+    int has_bmi2;
     int has_sse2;
     int has_ssse3;
     int has_sse42;

diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
@@ -46,6 +46,9 @@ uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsign
 #ifdef X86_AVX512
 uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
 uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint32_t chunksize_avx512(void);
+uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
 #endif
 #ifdef X86_AVX512VNNI
 uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
@@ -146,6 +149,12 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #    define native_adler32 adler32_avx512
 #    undef native_adler32_fold_copy
 #    define native_adler32_fold_copy adler32_fold_copy_avx512
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_avx512
+#    undef native_chunksize
+#    define native_chunksize chunksize_avx512
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_avx512
 // X86 - AVX512 (VNNI)
 #    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
 #      undef native_adler32