Espresso 0.0.1f
This commit is contained in:
@ -99,41 +99,43 @@ void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, siz
|
||||
/*
 * Copy n bytes from src to dst, moving the bulk in 16-byte SSE2 blocks.
 *
 * dst: destination buffer, at least n bytes.
 * src: source buffer, at least n bytes.
 * n:   number of bytes to copy; 0 is allowed and copies nothing.
 *
 * Returns the original dst pointer.
 *
 * The copy always runs front to back, so it does not give memmove-style
 * results when dst overlaps the tail of src.
 */
__attribute__((force_align_arg_pointer))
void *sse2_memcpy(void *dst, const void *src, uint32_t n)
{
    uint8_t *d = (uint8_t *)dst;
    const uint8_t *s = (const uint8_t *)src;

    /* Head: copy single bytes until the destination is 16-byte aligned. */
    while (((uintptr_t)d & 15) && n > 0)
    {
        *d++ = *s++;
        n--;
    }

    /*
     * Bulk: one 16-byte block per iteration. Only the destination was
     * aligned above; the source can sit at any offset, so the load must be
     * the unaligned form (movdqu). movdqa faults on a misaligned operand.
     * The store keeps movdqa because d is known to be aligned here.
     */
    while (n >= 16)
    {
        __asm__ __volatile__ (
            "movdqu (%1), %%xmm0\n\t"
            "movdqa %%xmm0, (%0)\n\t"
            :
            : "r"(d), "r"(s)
            : "xmm0", "memory"
        );
        d += 16;
        s += 16;
        n -= 16;
    }

    /* Tail: fewer than 16 bytes are left. */
    while (n > 0)
    {
        *d++ = *s++;
        n--;
    }

    return dst;
}
|
||||
|
||||
|
||||
__attribute__((force_align_arg_pointer))
|
||||
char *sse2_strncpy(char *dest, const char *src, uint32_t n)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user