diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 22c3785c11..e0ad3e04ea 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -25,19 +25,21 @@ static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { } static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - int16_t tmp; + uint16_t tmp; memcpy(&tmp, from, 2); - *chunk = vreinterpretq_u8_s16(vdupq_n_s16(tmp)); + *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp)); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - int32_t tmp; + uint32_t tmp; memcpy(&tmp, from, 4); - *chunk = vreinterpretq_u8_s32(vdupq_n_s32(tmp)); + *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp)); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - *chunk = vcombine_u8(vld1_u8(from), vld1_u8(from)); + uint64_t tmp; + memcpy(&tmp, from, 8); + *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp)); } #define CHUNKSIZE chunksize_neon diff --git a/chunkset_tpl.h b/chunkset_tpl.h index 2026ff37cc..256475a641 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -39,37 +39,46 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { /* Behave like chunkcopy, but avoid writing beyond of legal output. */ Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) { len = MIN(len, safe - out + 1); - if (len < sizeof(chunk_t)) { -#if CHUNK_SIZE > 16 - if (len & 16) { - memcpy(out, from, 16); - out += 16; - from += 16; - } +#if CHUNK_SIZE >= 32 + while (len >= 32) { + memcpy(out, from, 32); + out += 32; + from += 32; + len -= 32; + } #endif -#if CHUNK_SIZE > 8 - if (len & 8) { - memcpy(out, from, 8); - out += 8; - from += 8; - } +#if CHUNK_SIZE >= 16 + while (len >= 16) { + memcpy(out, from, 16); + out += 16; + from += 16; + len -= 16; + } #endif - if (len & 4) { - memcpy(out, from, 4); - out += 4; - from += 4; - } - if (len & 2) { - memcpy(out, from, 2); - out += 2; - from += 2; - } - if (len & 1) { - *out++ = *from++; - } - return out; +#if CHUNK_SIZE >= 8 + while (len >= 8) { + memcpy(out, from, 8); + out += 8; + from += 8; + len -= 8; + } +#endif + if (len >= 4) { + memcpy(out, from, 4); + out += 4; + from += 4; + len -= 4; } - return CHUNKCOPY(out, from, len); + if (len >= 2) { + memcpy(out, from, 2); + out += 2; + from += 2; + len -= 2; + } + if (len == 1) { + *out++ = *from++; + } + return out; } /* Perform short copies until distance can be rewritten as being at least