#include <stdint.h>
#include <stddef.h>

/* Size of one XMM register in bytes (SSE operates on 128-bit registers). */
#define SSE_XMM_SIZE 16

/* Enable SSE by clearing CR0.EM, setting CR0.MP, and setting
 * CR4.OSFXSR and CR4.OSXMMEXCPT. */
void enable_sse(void)
{
    uint32_t cr0, cr4;

    __asm__ volatile ("mov %%cr0, %0" : "=r"(cr0));
    cr0 &= ~(1 << 2);   /* EM = 0: no x87 emulation, allow SSE instructions */
    cr0 |= (1 << 1);    /* MP = 1: monitor coprocessor */
    __asm__ volatile ("mov %0, %%cr0" :: "r"(cr0));

    __asm__ volatile ("mov %%cr4, %0" : "=r"(cr4));
    cr4 |= (1 << 9);    /* OSFXSR = 1: OS supports FXSAVE/FXRSTOR */
    cr4 |= (1 << 10);   /* OSXMMEXCPT = 1: OS handles SIMD FP exceptions */
    __asm__ volatile ("mov %0, %%cr4" :: "r"(cr4));
}

/* Basic SSE test: add two arrays of 4 floats using xmm registers */
__attribute__((force_align_arg_pointer))
int32_t test_sse(void)
{
    float a[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] __attribute__((aligned(16))) = {5.0f, 6.0f, 7.0f, 8.0f};
    float result[4] __attribute__((aligned(16)));

    asm volatile (
        "movaps %1, %%xmm0\n\t"
        "movaps %2, %%xmm2\n\t"
        "addps %%xmm2, %%xmm0\n\t"
        "movaps %%xmm0, %0\n\t"
        : "=m" (result)
        : "m" (a), "m" (b)
        : "xmm0", "xmm2"
    );

    if (result[0] != 6.0f || result[1] != 8.0f ||
        result[2] != 10.0f || result[3] != 12.0f) {
        return -1;
    }
    return 0;
}

/* Add two arrays of doubles, two elements per iteration.
 * All pointers must be 16-byte aligned and count a multiple of 2. */
__attribute__((force_align_arg_pointer))
void sse2_add_double_arrays(double *dst, const double *a,
                            const double *b, size_t count)
{
    for (size_t i = 0; i < count; i += 2) {
        asm volatile (
            "movapd (%1), %%xmm0\n\t"
            "movapd (%2), %%xmm1\n\t"
            "addpd %%xmm1, %%xmm0\n\t"
            "movapd %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}

/* Add two arrays of int64_t, two elements per iteration.
 * All pointers must be 16-byte aligned and count a multiple of 2. */
__attribute__((force_align_arg_pointer))
void sse2_add_int64_arrays(int64_t *dst, const int64_t *a,
                           const int64_t *b, size_t count)
{
    for (size_t i = 0; i < count; i += 2) {
        asm volatile (
            "movdqa (%1), %%xmm0\n\t"
            "movdqa (%2), %%xmm1\n\t"
            "paddq %%xmm1, %%xmm0\n\t"
            "movdqa %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}

/* Add two arrays of int32_t, four elements per iteration.
 * All pointers must be 16-byte aligned and count a multiple of 4. */
__attribute__((force_align_arg_pointer))
void sse2_add_int32_arrays(int32_t *dst, const int32_t *a,
                           const int32_t *b, size_t count)
{
    for (size_t i = 0; i < count; i += 4) {
        asm volatile (
            "movdqa (%1), %%xmm0\n\t"
            "movdqa (%2), %%xmm1\n\t"
            "paddd %%xmm1, %%xmm0\n\t"
            "movdqa %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}

__attribute__((force_align_arg_pointer))
void *sse2_memcpy(void *dst, const void *src, uint32_t n)
{
    uint8_t *d = (uint8_t *)dst;
    const uint8_t *s = (const uint8_t *)src;
    uintptr_t i;

    /* Copy byte-by-byte until the destination is 16-byte aligned */
    while (((uintptr_t)d & 15) && n > 0) {
        *d++ = *s++;
        n--;
    }

    /* Copy 16-byte chunks with SSE2; the source may still be unaligned,
     * so use movdqu for the load and movdqa for the aligned store. */
    for (i = 0; i + 15 < n; i += 16) {
        asm volatile (
            "movdqu (%1), %%xmm0\n\t"  /* Load 16 bytes from the source into xmm0 */
            "movdqa %%xmm0, (%0)\n\t"  /* Store 16 bytes to the aligned destination */
            :
            : "r"(d + i), "r"(s + i)
            : "xmm0", "memory"
        );
    }
    d += i;
    s += i;
    n -= i;

    /* Handle remaining bytes (fewer than 16) */
    while (n > 0) {
        *d++ = *s++;
        n--;
    }

    return dst; /* Return the original destination pointer, like memcpy */
}

__attribute__((force_align_arg_pointer))
char *sse2_strncpy(char *dest, const char *src, uint32_t n)
{
    uint32_t i = 0;

    /* Copy byte-by-byte until the destination is 16-byte aligned */
    while (((uintptr_t)(dest + i) & 15) && i < n && src[i]) {
        dest[i] = src[i];
        i++;
    }

    /* Bulk copy in 16-byte blocks */
    for (; i + 15 < n; i += 16) {
        asm volatile (
            "movdqu (%1), %%xmm0\n\t"
            "movdqu %%xmm0, (%0)\n\t"
            :
            : "r"(dest + i), "r"(src + i)
            : "xmm0", "memory"
        );

        /* Manually check if any nulls are in the just-copied block */
        for (int j = 0; j < 16; ++j) {
            if (src[i + j] == '\0') {
                /* Null found: zero the rest of this block, then pad to n */
                for (int k = j + 1; k < 16; ++k) {
                    dest[i + k] = '\0';
                }
                i += 16;
                goto pad;
            }
        }
    }

    /* Final bytes: copy until the terminator or the limit */
    for (; i < n && src[i]; ++i) {
        dest[i] = src[i];
    }

pad:
    /* Zero-fill the remainder, as strncpy requires */
    for (; i < n; ++i) {
        dest[i] = '\0';
    }
    return dest;
}
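/*
 * Usage sketch (not part of the original file): exercises the array-add and
 * copy helpers above. The function name sse2_selftest() and the local buffers
 * are illustrative assumptions; the only requirements taken from the code are
 * 16-byte alignment and element counts that fill whole xmm registers
 * (2 doubles or 4 int32s per register). Assumes enable_sse() has already run.
 */
__attribute__((force_align_arg_pointer))
int32_t sse2_selftest(void)
{
    double a[4] __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};
    double b[4] __attribute__((aligned(16))) = {5.0, 6.0, 7.0, 8.0};
    double sum[4] __attribute__((aligned(16)));
    int32_t ints[4] __attribute__((aligned(16))) = {10, 20, 30, 40};
    int32_t copy[4] __attribute__((aligned(16)));

    sse2_add_double_arrays(sum, a, b, 4);    /* expect {6, 8, 10, 12} */
    if (sum[0] != 6.0 || sum[3] != 12.0) {
        return -1;
    }

    sse2_memcpy(copy, ints, sizeof(ints));   /* 16 bytes, both buffers aligned */
    if (copy[0] != 10 || copy[3] != 40) {
        return -1;
    }

    return 0;
}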
/* Convert two packed doubles to two truncated int32s and store a 16-byte
 * result (the upper two lanes are zero). src and dst must be 16-byte
 * aligned, and dst must have room for four int32s. */
__attribute__((force_align_arg_pointer))
void double_vector_to_int_vector(const double *src, int32_t *dst)
{
    asm volatile (
        "pxor %%xmm0, %%xmm0\n\t"      /* zero xmm0 */
        "cvttpd2dq (%1), %%xmm1\n\t"   /* truncate-convert src to int32s */
        "movq %%xmm1, %%xmm0\n\t"      /* move low 64 bits (2 ints) to xmm0 */
        "movdqa %%xmm0, (%0)\n\t"      /* store the 16-byte result */
        :
        : "r"(dst), "r"(src)
        : "xmm0", "xmm1", "memory"
    );
}

/* Convert two int32s to two packed doubles.
 * dst must be 16-byte aligned and have room for two doubles. */
__attribute__((force_align_arg_pointer))
void int_vector_to_double_vector(const int32_t *src, double *dst)
{
    asm volatile (
        "movq (%1), %%xmm0\n\t"        /* Load 2 int32s (64 bits) into xmm0 */
        "cvtdq2pd %%xmm0, %%xmm1\n\t"  /* Convert to 2 doubles */
        "movapd %%xmm1, (%0)\n\t"      /* Store to the destination */
        :
        : "r"(dst), "r"(src)
        : "xmm0", "xmm1", "memory"
    );
}

void *memclr_sse2(const void * const m_start, const size_t m_count)
{
    /* "i" counts how many bytes have been cleared so far */
    size_t i = 0;

    /* If "m_start" is not aligned on an SSE_XMM_SIZE boundary, clear
     * byte-by-byte until it is, without clearing more than m_count bytes. */
    if ((size_t)m_start & (SSE_XMM_SIZE - 1)) {
        while ((((size_t)m_start + i) & (SSE_XMM_SIZE - 1)) && i < m_count) {
            size_t edi = (size_t)m_start + i;
            asm volatile ("stosb"
                          : "+D"(edi)     /* stosb advances EDI, so it is in/out */
                          : "a"(0)
                          : "memory");
            i++;
        }
    }

    /* zero out XMM0 */
    asm volatile ("pxor %%xmm0, %%xmm0" ::: "xmm0");

    /* clear 64-byte chunks of memory (4 16-byte stores per iteration) */
    for (; i + 64 <= m_count; i += 64) {
        asm volatile ("movdqa %%xmm0, 0(%0)\n\t"   /* store 16 bytes from XMM0 at %0 + 0 */
                      "movdqa %%xmm0, 16(%0)\n\t"
                      "movdqa %%xmm0, 32(%0)\n\t"
                      "movdqa %%xmm0, 48(%0)\n\t"
                      :
                      : "r"((size_t)m_start + i)
                      : "memory");
    }

    /* clear the remaining bytes (if any); rep stosb advances EDI and
     * decrements ECX, so both are declared as in/out operands */
    size_t edi = (size_t)m_start + i;
    size_t ecx = m_count - i;
    asm volatile ("rep stosb"
                  : "+D"(edi), "+c"(ecx)
                  : "a"((size_t)0)
                  : "memory");

    /* "i" now equals the total number of bytes that were cleared */
    i = m_count;

    /* return "m_start" plus the number of bytes that were cleared */
    return (void *)(((size_t)m_start) + i);
}
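/*
 * Initialization sketch (not part of the original file): shows the intended
 * ordering, namely enable SSE via CR0/CR4 first, verify it with the xmm
 * self-test, and only then rely on the SSE2 helpers. The function name
 * sse_init() and the 64-byte scratch buffer are illustrative assumptions.
 */
void sse_init(void)
{
    static uint8_t scratch[64] __attribute__((aligned(16)));

    enable_sse();        /* clear CR0.EM, set CR0.MP, CR4.OSFXSR, CR4.OSXMMEXCPT */

    if (test_sse() != 0) {
        return;          /* packed-float add failed; do not use the SSE2 helpers */
    }

    /* memclr_sse2 clears the buffer and returns a pointer just past it */
    memclr_sse2(scratch, sizeof(scratch));
}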