diff --git a/lib/vector_extentions/sse.c b/lib/vector_extentions/sse.c
new file mode 100644
index 0000000..02ece9a
--- /dev/null
+++ b/lib/vector_extentions/sse.c
@@ -0,0 +1,263 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+#include "sse.h" /* assumed companion header declaring these routines */
+
+#ifndef SSE_XMM_SIZE
+#define SSE_XMM_SIZE 16 /* width of an XMM register in bytes */
+#endif
+
+
+void enable_sse(void) {
+    uint32_t cr0, cr4;
+
+    __asm__ volatile ("mov %%cr0, %0" : "=r"(cr0));
+    cr0 &= ~(1 << 2); // EM = 0: no x87 emulation
+    cr0 |= (1 << 1);  // MP = 1: monitor coprocessor
+    __asm__ volatile ("mov %0, %%cr0" :: "r"(cr0));
+
+    __asm__ volatile ("mov %%cr4, %0" : "=r"(cr4));
+    cr4 |= (1 << 9);  // OSFXSR = 1: enable FXSAVE/FXRSTOR and SSE
+    cr4 |= (1 << 10); // OSXMMEXCPT = 1: OS handles SIMD FP exceptions
+    __asm__ volatile ("mov %0, %%cr4" :: "r"(cr4));
+}
+
+// Basic SSE test: add two arrays of 4 floats using xmm registers
+__attribute__((force_align_arg_pointer))
+int32_t test_sse(void) {
+    float a[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
+    float b[4] __attribute__((aligned(16))) = {5.0f, 6.0f, 7.0f, 8.0f};
+    float result[4] __attribute__((aligned(16)));
+
+    asm volatile (
+        "movaps %1, %%xmm0\n\t"
+        "movaps %2, %%xmm2\n\t"
+        "addps %%xmm2, %%xmm0\n\t"
+        "movaps %%xmm0, %0\n\t"
+        : "=m" (result)
+        : "m" (a), "m" (b)
+        : "xmm0", "xmm2"
+    );
+
+    if (result[0] != 6.0f || result[1] != 8.0f || result[2] != 10.0f || result[3] != 12.0f)
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+/* dst, a and b must be 16-byte aligned; count must be a multiple of 2 */
+__attribute__((force_align_arg_pointer))
+void sse2_add_double_arrays(double *dst, const double *a, const double *b, size_t count)
+{
+    for (size_t i = 0; i < count; i += 2)
+    {
+        asm volatile (
+            "movapd (%1), %%xmm0\n\t"
+            "movapd (%2), %%xmm1\n\t"
+            "addpd %%xmm1, %%xmm0\n\t"
+            "movapd %%xmm0, (%0)\n\t"
+            :
+            : "r"(dst + i), "r"(a + i), "r"(b + i)
+            : "xmm0", "xmm1", "memory"
+        );
+    }
+}
+
+/* dst, a and b must be 16-byte aligned; count must be a multiple of 2 */
+__attribute__((force_align_arg_pointer))
+void sse2_add_int64_arrays(int64_t *dst, const int64_t *a, const int64_t *b, size_t count)
+{
+    for (size_t i = 0; i < count; i += 2)
+    {
+        asm volatile (
+            "movdqa (%1), %%xmm0\n\t"
+            "movdqa (%2), %%xmm1\n\t"
+            "paddq %%xmm1, %%xmm0\n\t"
+            "movdqa %%xmm0, (%0)\n\t"
+            :
+            : "r"(dst + i), "r"(a + i), "r"(b + i)
+            : "xmm0", "xmm1", "memory"
+        );
+    }
+}
+
+/* dst, a and b must be 16-byte aligned; count must be a multiple of 4 */
+__attribute__((force_align_arg_pointer))
+void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, size_t count)
+{
+    for (size_t i = 0; i < count; i += 4)
+    {
+        asm volatile (
+            "movdqa (%1), %%xmm0\n\t"
+            "movdqa (%2), %%xmm1\n\t"
+            "paddd %%xmm1, %%xmm0\n\t"
+            "movdqa %%xmm0, (%0)\n\t"
+            :
+            : "r"(dst + i), "r"(a + i), "r"(b + i)
+            : "xmm0", "xmm1", "memory"
+        );
+    }
+}
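+
+/*
+ * Minimal usage sketch for the array-add helpers above. The caller name
+ * test_sse2_add is hypothetical and not referenced elsewhere; the point is
+ * the calling contract: all three buffers must be 16-byte aligned (movdqa
+ * faults on misaligned operands) and count must be a multiple of the lane
+ * width (4 for int32, 2 for int64/double).
+ */
+static int32_t test_sse2_add(void)
+{
+    int32_t a[4] __attribute__((aligned(16))) = {1, 2, 3, 4};
+    int32_t b[4] __attribute__((aligned(16))) = {10, 20, 30, 40};
+    int32_t r[4] __attribute__((aligned(16)));
+
+    sse2_add_int32_arrays(r, a, b, 4);
+
+    for (int i = 0; i < 4; ++i)
+    {
+        if (r[i] != a[i] + b[i])
+        {
+            return -1;
+        }
+    }
+
+    return 0;
+}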
+
+__attribute__((force_align_arg_pointer))
+void *sse2_memcpy(void *dst, const void *src, uint32_t n)
+{
+    uint8_t *d = (uint8_t *)dst;
+    const uint8_t *s = (const uint8_t *)src;
+    uintptr_t i;
+
+    /* Copy byte-by-byte until the destination is 16-byte aligned */
+    while (((uintptr_t)d & 15) && n > 0)
+    {
+        *d++ = *s++;
+        n--;
+    }
+
+    /* Copy 16-byte chunks with SSE2; the source may still be misaligned
+       relative to the destination, so load it with movdqu while the
+       aligned destination store uses movdqa */
+    for (i = 0; i + 15 < n; i += 16)
+    {
+        asm volatile (
+            "movdqu (%1), %%xmm0\n\t" /* Load 16 bytes from source into xmm0 */
+            "movdqa %%xmm0, (%0)\n\t" /* Store 16 bytes to destination */
+            :
+            : "r"(d + i), "r"(s + i)
+            : "xmm0", "memory"
+        );
+    }
+    d += i;
+    s += i;
+    n -= i;
+
+    /* Handle remaining bytes (less than 16) */
+    while (n > 0)
+    {
+        *d++ = *s++;
+        n--;
+    }
+
+    return dst; /* Return the original destination, as memcpy does */
+}
+
+/* Note: the 16-byte loads below may read up to 15 bytes past the string's
+   null terminator, so the source must not end at the edge of a mapped page. */
+__attribute__((force_align_arg_pointer))
+char *sse2_strncpy(char *dest, const char *src, uint32_t n)
+{
+    uint32_t i = 0;
+
+    /* Copy byte-by-byte until the destination is 16-byte aligned */
+    while (((uintptr_t)(dest + i) & 15) && i < n && src[i]) {
+        dest[i] = src[i];
+        i++;
+    }
+
+    /* Bulk copy in 16-byte blocks */
+    for (; i + 15 < n; i += 16)
+    {
+        asm volatile (
+            "movdqu (%1), %%xmm0\n\t"
+            "movdqu %%xmm0, (%0)\n\t"
+            :
+            : "r"(dest + i), "r"(src + i)
+            : "xmm0", "memory"
+        );
+
+        /* Check for a null in the just-copied block */
+        for (int j = 0; j < 16; ++j)
+        {
+            if (src[i + j] == '\0')
+            {
+                /* Null found: the bytes copied after it are garbage from
+                   the 16-byte move, so jump straight to the zero padding */
+                i += j + 1;
+                goto pad;
+            }
+        }
+    }
+
+    /* Final bytes */
+    for (; i < n && src[i]; ++i)
+    {
+        dest[i] = src[i];
+    }
+
+pad:
+    /* strncpy semantics: pad the remainder of dest with zeros */
+    for (; i < n; ++i)
+    {
+        dest[i] = '\0';
+    }
+
+    return dest;
+}
+
+
+/* Truncate 2 doubles to 2 int32s; src must be 16-byte aligned,
+   and only 8 bytes are written to dst */
+__attribute__((force_align_arg_pointer))
+void double_vector_to_int_vector(const double *src, int32_t *dst)
+{
+    asm volatile (
+        "cvttpd2dq (%1), %%xmm0\n\t" /* truncate 2 doubles to 2 int32s in the low half */
+        "movq %%xmm0, (%0)\n\t"      /* store only the low 64 bits (the 2 int32s) */
+        :
+        : "r"(dst), "r"(src)
+        : "xmm0", "memory"
+    );
+}
+
+/* Convert 2 int32s to 2 doubles; dst must be 16-byte aligned */
+__attribute__((force_align_arg_pointer))
+void int_vector_to_double_vector(const int32_t *src, double *dst)
+{
+    asm volatile (
+        "movq (%1), %%xmm0\n\t"       /* Load 2 int32s (64 bits) into xmm0 */
+        "cvtdq2pd %%xmm0, %%xmm1\n\t" /* Convert to 2 doubles */
+        "movapd %%xmm1, (%0)\n\t"     /* Store to the aligned destination */
+        :
+        : "r"(dst), "r"(src)
+        : "xmm0", "xmm1", "memory"
+    );
+}
+
+void *memclr_sse2(void * const m_start, const size_t m_count)
+{
+    /* "i" counts how many bytes we've cleared so far */
+    size_t i = 0;
+
+    /* clear byte-by-byte until "m_start" is aligned on an SSE_XMM_SIZE boundary,
+       making sure we don't clear 'too' many bytes (i < m_count) */
+    while ((((size_t)m_start + i) & (SSE_XMM_SIZE - 1)) && i < m_count)
+    {
+        ((uint8_t *)m_start)[i] = 0;
+        i++;
+    }
+
+    asm volatile ("pxor %%xmm0, %%xmm0" ::: "xmm0"); /* zero out XMM0 */
+
+    /* clear 64-byte chunks of memory (4 16-byte operations) */
+    for (; i + 64 <= m_count; i += 64)
+    {
+        asm volatile ("movdqa %%xmm0,  0(%0)\n\t" /* store 16 zero bytes at %0 + 0 */
+                      "movdqa %%xmm0, 16(%0)\n\t"
+                      "movdqa %%xmm0, 32(%0)\n\t"
+                      "movdqa %%xmm0, 48(%0)\n\t"
+                      :
+                      : "r"((size_t)m_start + i)
+                      : "memory");
+    }
+
+    /* clear the remaining bytes (if any); rep stosb advances EDI and
+       decrements ECX itself, so both must be declared as in-out operands */
+    {
+        void  *d   = (void *)((size_t)m_start + i);
+        size_t rem = m_count - i;
+        asm volatile ("rep stosb"
+                      : "+D"(d), "+c"(rem)
+                      : "a"(0)
+                      : "memory");
+    }
+
+    /* we return "m_start" + the number of bytes that were cleared */
+    return (void *)((size_t)m_start + m_count);
+}
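+
+/*
+ * Sketch of a CPUID feature probe (the helper name cpu_has_sse2 is
+ * hypothetical and not used by the rest of this file): enable_sse()
+ * writes CR0/CR4 unconditionally, so a caller would normally confirm
+ * first that CPUID leaf 1 reports SSE (EDX bit 25) and SSE2 (EDX bit 26).
+ */
+static inline bool cpu_has_sse2(void)
+{
+    uint32_t eax, ebx, ecx, edx;
+
+    /* CPUID leaf 1: feature flags are returned in ECX/EDX */
+    __asm__ volatile ("cpuid"
+                      : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+                      : "a"(1), "c"(0));
+
+    return (edx & (1u << 26)) != 0; /* EDX bit 26 = SSE2 */
+}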