From 061aca5c8ba29622a3cf47ec94e984ab7aeed498 Mon Sep 17 00:00:00 2001 From: Mika Lindqvist Date: Fri, 18 Jun 2021 23:15:28 +0300 Subject: [PATCH 1/3] [chunkset_neon] Don't use signed vector types. * There is no need to convert between unsigned and signed vector types. All relevant intrinsics have versions for all unsigned vector types. --- arch/arm/chunkset_neon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 22c3785c11..b1fcb241d0 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -25,15 +25,15 @@ static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { } static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - int16_t tmp; + uint16_t tmp; memcpy(&tmp, from, 2); - *chunk = vreinterpretq_u8_s16(vdupq_n_s16(tmp)); + *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp)); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - int32_t tmp; + uint32_t tmp; memcpy(&tmp, from, 4); - *chunk = vreinterpretq_u8_s32(vdupq_n_s32(tmp)); + *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp)); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { From 618c481d2358597b90905be2614272af7900b545 Mon Sep 17 00:00:00 2001 From: Mika Lindqvist Date: Sat, 19 Jun 2021 00:10:44 +0300 Subject: [PATCH 2/3] [chunkset_neon] Use vdupq_n_u64. * Using vdupq_n_u64 duplicates the unsigned 64-bit integer to two consecutive aligned memory locations on the stack so the compiler can use wider load instructions. All different-sized general-purpose registers overlay on ARM/AArch64, so any vector cast is a no-op in assembly. 
--- arch/arm/chunkset_neon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index b1fcb241d0..e0ad3e04ea 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -37,7 +37,9 @@ static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - *chunk = vcombine_u8(vld1_u8(from), vld1_u8(from)); + uint64_t tmp; + memcpy(&tmp, from, 8); + *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp)); } #define CHUNKSIZE chunksize_neon From 858ec3eecc50bb6bb614a7b01ae17b99ee41f6a4 Mon Sep 17 00:00:00 2001 From: Mika Lindqvist Date: Sat, 19 Jun 2021 02:08:20 +0300 Subject: [PATCH 3/3] [chunkcopy_safe] Don't call chunkcopy(). * chunkcopy() can read or write more than the safe length if the length is not a multiple of the chunk size. --- chunkset_tpl.h | 65 ++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/chunkset_tpl.h b/chunkset_tpl.h index 2026ff37cc..256475a641 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -39,37 +39,46 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { /* Behave like chunkcopy, but avoid writing beyond of legal output. 
*/ Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) { len = MIN(len, safe - out + 1); - if (len < sizeof(chunk_t)) { -#if CHUNK_SIZE > 16 - if (len & 16) { - memcpy(out, from, 16); - out += 16; - from += 16; - } +#if CHUNK_SIZE >= 32 + while (len >= 32) { + memcpy(out, from, 32); + out += 32; + from += 32; + len -= 32; + } #endif -#if CHUNK_SIZE > 8 - if (len & 8) { - memcpy(out, from, 8); - out += 8; - from += 8; - } +#if CHUNK_SIZE >= 16 + while (len >= 16) { + memcpy(out, from, 16); + out += 16; + from += 16; + len -= 16; + } #endif - if (len & 4) { - memcpy(out, from, 4); - out += 4; - from += 4; - } - if (len & 2) { - memcpy(out, from, 2); - out += 2; - from += 2; - } - if (len & 1) { - *out++ = *from++; - } - return out; +#if CHUNK_SIZE >= 8 + while (len >= 8) { + memcpy(out, from, 8); + out += 8; + from += 8; + len -= 8; + } +#endif + if (len >= 4) { + memcpy(out, from, 4); + out += 4; + from += 4; + len -= 4; } - return CHUNKCOPY(out, from, len); + if (len >= 2) { + memcpy(out, from, 2); + out += 2; + from += 2; + len -= 2; + } + if (len == 1) { + *out++ = *from++; + } + return out; } /* Perform short copies until distance can be rewritten as being at least