Espresso 0.0.2a
This commit is contained in:
@ -1,6 +1,8 @@
|
||||
#include <types.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <emmintrin.h> /* SSE2 intrinsics, TODO: use these in all functions, currently only used in memclr_sse2 */
|
||||
|
||||
#include <vector_extensions/sse.h>
|
||||
|
||||
|
||||
@ -20,7 +22,7 @@ void enable_sse(void)
|
||||
}
|
||||
|
||||
/* Basic SSE test: add two arrays of 4 floats using xmm registers */
|
||||
__attribute__((force_align_arg_pointer))
|
||||
__attribute__((force_align_arg_pointer, target("sse2")))
|
||||
int32_t test_sse(void)
|
||||
{
|
||||
float a[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
|
||||
@ -45,7 +47,7 @@ int32_t test_sse(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
__attribute__((force_align_arg_pointer))
|
||||
__attribute__((force_align_arg_pointer, target("sse2")))
|
||||
void sse2_add_double_arrays(double *dst, const double *a, const double *b, size_t count)
|
||||
{
|
||||
for (size_t i = 0; i < count; i += 2)
|
||||
@ -62,7 +64,7 @@ void sse2_add_double_arrays(double *dst, const double *a, const double *b, size_
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((force_align_arg_pointer))
|
||||
__attribute__((force_align_arg_pointer, target("sse2")))
|
||||
void sse2_add_int64_arrays(int64_t *dst, const int64_t *a, const int64_t *b, size_t count)
|
||||
{
|
||||
for (size_t i = 0; i < count; i += 2)
|
||||
@ -79,7 +81,7 @@ void sse2_add_int64_arrays(int64_t *dst, const int64_t *a, const int64_t *b, siz
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((force_align_arg_pointer))
|
||||
__attribute__((force_align_arg_pointer, target("sse2")))
|
||||
void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, size_t count)
|
||||
{
|
||||
for (size_t i = 0; i < count; i += 4)
|
||||
@ -96,7 +98,7 @@ void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, siz
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((force_align_arg_pointer))
|
||||
__attribute__((force_align_arg_pointer, target("sse2")))
|
||||
void *sse2_memcpy(void *dst, const void *src, uint32_t n)
|
||||
{
|
||||
uint8_t *d = (uint8_t *)dst;
|
||||
@ -136,7 +138,7 @@ void *sse2_memcpy(void *dst, const void *src, uint32_t n)
|
||||
}
|
||||
|
||||
|
||||
__attribute__((force_align_arg_pointer))
|
||||
__attribute__((force_align_arg_pointer, target("sse2")))
|
||||
char *sse2_strncpy(char *dest, const char *src, uint32_t n)
|
||||
{
|
||||
uint32_t i = 0;
|
||||
@ -218,48 +220,31 @@ void int_vector_to_double_vector(const int32_t *src, double *dst)
|
||||
);
|
||||
}
|
||||
|
||||
void * memclr_sse2(const void * const m_start, const size_t m_count)
|
||||
__attribute__((force_align_arg_pointer, target("sse2")))
|
||||
void* memclr_sse2(void *m_start, size_t m_count)
|
||||
{
|
||||
|
||||
/* "i" is our counter of how many bytes we've cleared */
|
||||
size_t i;
|
||||
unsigned char *dst = m_start;
|
||||
size_t i = 0;
|
||||
|
||||
/* find out if "m_start" is aligned on a SSE_XMM_SIZE boundary */
|
||||
if ((size_t)m_start & (SSE_XMM_SIZE - 1))
|
||||
while ((uintptr_t)(dst + i) & 15 && i < m_count)
|
||||
{
|
||||
i = 0;
|
||||
|
||||
/* we need to clear byte-by-byte until "m_start" is aligned on an SSE_XMM_SIZE boundary */
|
||||
/* ... and let's make sure we don't clear too many bytes (i < m_count) */
|
||||
while (((size_t)m_start + i) & (SSE_XMM_SIZE - 1) && i < m_count)
|
||||
{
|
||||
asm volatile ("stosb;" :: "D"((size_t)m_start + i), "a"(0));
|
||||
i++;
|
||||
}
|
||||
dst[i++] = 0;
|
||||
}
|
||||
else
|
||||
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
|
||||
for (; i + 64 <= m_count; i += 64)
|
||||
{
|
||||
/* if "m_start" was aligned, set our count to 0 */
|
||||
i = 0;
|
||||
_mm_store_si128((__m128i *)(dst + i + 0), zero);
|
||||
_mm_store_si128((__m128i *)(dst + i + 16), zero);
|
||||
_mm_store_si128((__m128i *)(dst + i + 32), zero);
|
||||
_mm_store_si128((__m128i *)(dst + i + 48), zero);
|
||||
}
|
||||
|
||||
asm volatile ("pxor %%xmm0,%%xmm0"::); /* zero out XMM0 */
|
||||
/* clear 64-byte chunks of memory (4 16-byte operations) */
|
||||
for(; i + 64 <= m_count; i += 64)
|
||||
|
||||
for (; i < m_count; ++i)
|
||||
{
|
||||
asm volatile (" movdqa %%xmm0, 0(%0); " /* move 16 bytes from XMM0 to %0 + 0 */
|
||||
" movdqa %%xmm0, 16(%0); "
|
||||
" movdqa %%xmm0, 32(%0); "
|
||||
" movdqa %%xmm0, 48(%0); "
|
||||
:: "r"((size_t)m_start + i));
|
||||
dst[i] = 0;
|
||||
}
|
||||
|
||||
/* clear the remaining bytes (if any) */
|
||||
asm volatile (" rep stosb; " :: "a"((size_t)(0)), "D"(((size_t)m_start) + i), "c"(m_count - i));
|
||||
|
||||
/* "i" will contain the total number of bytes that were actually transferred */
|
||||
i += m_count - i;
|
||||
|
||||
/* we return "m_start" + the number of bytes that were transferred */
|
||||
return (void *)(((size_t)m_start) + i);
|
||||
|
||||
return m_start;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user