Upload files to "lib/vector_extentions"
lib/vector_extentions/sse.c | 263 lines | Normal file
@@ -0,0 +1,263 @@
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

#include <stdio.h>

#include <vector_extentions/sse.h>


void enable_sse(void) {
    uint32_t cr0, cr4;

    __asm__ volatile ("mov %%cr0, %0" : "=r"(cr0));
    cr0 &= ~(1 << 2);   // EM = 0
    cr0 |= (1 << 1);    // MP = 1
    __asm__ volatile ("mov %0, %%cr0" :: "r"(cr0));

    __asm__ volatile ("mov %%cr4, %0" : "=r"(cr4));
    cr4 |= (1 << 9);    // OSFXSR = 1
    cr4 |= (1 << 10);   // OSXMMEXCPT = 1
    __asm__ volatile ("mov %0, %%cr4" :: "r"(cr4));
}

// Basic SSE test: add two arrays of 4 floats using xmm registers
__attribute__((force_align_arg_pointer))
int32_t test_sse(void) {
    float a[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] __attribute__((aligned(16))) = {5.0f, 6.0f, 7.0f, 8.0f};
    float result[4] __attribute__((aligned(16)));

    asm volatile (
        "movaps %1, %%xmm0\n\t"
        "movaps %2, %%xmm2\n\t"
        "addps %%xmm2, %%xmm0\n\t"
        "movaps %%xmm0, %0\n\t"
        : "=m" (result)
        : "m" (a), "m" (b)
        : "xmm0", "xmm2"
    );

    if (result[0] != 6.0 || result[1] != 8.0 || result[2] != 10.0 || result[3] != 12.0)
    {
        return -1;
    }

    return 0;
}

__attribute__((force_align_arg_pointer))
void sse2_add_double_arrays(double *dst, const double *a, const double *b, size_t count)
{
    /* movapd requires 16-byte aligned pointers; count must be a multiple of 2 */
    for (size_t i = 0; i < count; i += 2)
    {
        asm volatile (
            "movapd (%1), %%xmm0\n\t"
            "movapd (%2), %%xmm1\n\t"
            "addpd %%xmm1, %%xmm0\n\t"
            "movapd %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}

__attribute__((force_align_arg_pointer))
void sse2_add_int64_arrays(int64_t *dst, const int64_t *a, const int64_t *b, size_t count)
{
    /* movdqa requires 16-byte aligned pointers; count must be a multiple of 2 */
    for (size_t i = 0; i < count; i += 2)
    {
        asm volatile (
            "movdqa (%1), %%xmm0\n\t"
            "movdqa (%2), %%xmm1\n\t"
            "paddq %%xmm1, %%xmm0\n\t"
            "movdqa %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}

__attribute__((force_align_arg_pointer))
void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, size_t count)
{
    /* movdqa requires 16-byte aligned pointers; count must be a multiple of 4 */
    for (size_t i = 0; i < count; i += 4)
    {
        asm volatile (
            "movdqa (%1), %%xmm0\n\t"
            "movdqa (%2), %%xmm1\n\t"
            "paddd %%xmm1, %%xmm0\n\t"
            "movdqa %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}

__attribute__((force_align_arg_pointer))
void *sse2_memcpy(void *dst, const void *src, uint32_t n)
{
    uintptr_t i;

    /* Copy byte-by-byte until the destination is 16-byte aligned */
    while (((uintptr_t)dst & 15) && n > 0)
    {
        *((uint8_t*)dst) = *((const uint8_t*)src);
        dst = (uint8_t*)dst + 1;
        src = (const uint8_t*)src + 1;
        n--;
    }

    /* Copy 16-byte chunks with SSE2; the source may still be unaligned, so it is
       loaded with movdqu while the aligned destination is stored with movdqa */
    for (i = 0; i + 15 < n; i += 16)
    {
        asm volatile (
            "movdqu (%1), %%xmm0\n\t"   /* Load 16 bytes from source into xmm0 */
            "movdqa %%xmm0, (%0)\n\t"   /* Store 16 bytes to the aligned destination */
            :
            : "r"((uint8_t*)dst + i), "r"((const uint8_t*)src + i)
            : "xmm0", "memory"
        );
    }

    /* Skip past the chunk-copied bytes so they are not copied twice */
    dst = (uint8_t*)dst + i;
    src = (const uint8_t*)src + i;
    n -= i;

    /* Handle remaining bytes (less than 16) */
    while (n > 0)
    {
        *((uint8_t*)dst) = *((const uint8_t*)src);
        dst = (uint8_t*)dst + 1;
        src = (const uint8_t*)src + 1;
        n--;
    }

    return dst; /* Points just past the last byte written */
}

__attribute__((force_align_arg_pointer))
char *sse2_strncpy(char *dest, const char *src, uint32_t n)
{
    uint32_t i = 0;

    /* Align initial copy: byte-by-byte until dest + i is 16-byte aligned,
       or until the string or the limit ends */
    while (((uintptr_t)(dest + i) & 15) && i < n && src[i]) {
        dest[i] = src[i];
        i++;
    }

    /* Bulk copy in 16-byte blocks */
    for (; i + 15 < n; i += 16)
    {
        asm volatile (
            "movdqu (%1), %%xmm0\n\t"
            "movdqu %%xmm0, (%0)\n\t"
            :
            : "r"(dest + i), "r"(src + i)
            : "xmm0", "memory"
        );

        /* Manually check if any nulls are in the just-copied block */
        for (int j = 0; j < 16; ++j)
        {
            if (src[i + j] == '\0')
            {
                /* Null found: pad the rest of the block with zeros */
                for (int k = j + 1; k < 16 && i + k < n; ++k)
                {
                    dest[i + k] = '\0';
                }
                i += 16;
                /* Everything after the null must be zero-filled, never copied */
                goto pad;
            }
        }
    }

    /* Final bytes */
    for (; i < n && src[i]; ++i)
    {
        dest[i] = src[i];
    }

pad:
    /* strncpy semantics: zero-fill the remainder of the buffer */
    for (; i < n; ++i)
    {
        dest[i] = '\0';
    }

    return dest;
}


__attribute__((force_align_arg_pointer))
void double_vector_to_int_vector(const double *src, int32_t *dst)
{
    /* src and dst must be 16-byte aligned; dst receives a full 16-byte store
       (two converted int32s in the low lanes, zeros in the high lanes) */
    asm volatile (
        "pxor %%xmm0, %%xmm0\n\t"        /* zero xmm0 */
        "cvttpd2dq (%1), %%xmm1\n\t"     /* truncate-convert 2 doubles to 2 int32s */
        "movq %%xmm1, %%xmm0\n\t"        /* move low 64 bits (2 ints) into zeroed xmm0 */
        "movdqa %%xmm0, (%0)\n\t"        /* store all 16 bytes to dst */
        :
        : "r"(dst), "r"(src)
        : "xmm0", "xmm1", "memory"
    );
}

__attribute__((force_align_arg_pointer))
void int_vector_to_double_vector(const int32_t *src, double *dst)
{
    /* dst must be 16-byte aligned (movapd); src only needs 8 readable bytes */
    asm volatile (
        "movq (%1), %%xmm0\n\t"          /* Load 2 int32s (64 bits) into xmm0 */
        "cvtdq2pd %%xmm0, %%xmm1\n\t"    /* Convert to 2 doubles */
        "movapd %%xmm1, (%0)\n\t"        /* Store 16 bytes to destination */
        :
        : "r"(dst), "r"(src)
        : "xmm0", "xmm1", "memory"
    );
}

void * memclr_sse2(const void * const m_start, const size_t m_count)
{
    /* "i" is our counter of how many bytes we've cleared */
    size_t i;

    /* find out if "m_start" is aligned on a SSE_XMM_SIZE boundary */
    if ((size_t)m_start & (SSE_XMM_SIZE - 1))
    {
        i = 0;

        /* we need to clear byte-by-byte until "m_start" is aligned on an SSE_XMM_SIZE boundary */
        /* ... and let's make sure we don't clear too many bytes (i < m_count) */
        while (((size_t)m_start + i) & (SSE_XMM_SIZE - 1) && i < m_count)
        {
            /* stosb writes AL to [EDI] and advances EDI, so EDI is a read/write operand */
            size_t d = (size_t)m_start + i;
            asm volatile ("stosb" : "+D"(d) : "a"(0) : "memory");
            i++;
        }
    }
    else
    {
        /* if "m_start" was aligned, set our count to 0 */
        i = 0;
    }

    asm volatile ("pxor %%xmm0, %%xmm0" ::: "xmm0"); /* zero out XMM0 */

    /* clear 64-byte chunks of memory (4 16-byte operations) */
    for (; i + 64 <= m_count; i += 64)
    {
        asm volatile ("movdqa %%xmm0,  0(%0);"   /* move 16 bytes from XMM0 to %0 + 0 */
                      "movdqa %%xmm0, 16(%0);"
                      "movdqa %%xmm0, 32(%0);"
                      "movdqa %%xmm0, 48(%0);"
                      :: "r"((size_t)m_start + i) : "memory");
    }

    /* clear the remaining bytes (if any); rep stosb modifies EDI and ECX, so both are read/write */
    {
        size_t d = (size_t)m_start + i;
        size_t c = m_count - i;
        asm volatile ("rep stosb" : "+D"(d), "+c"(c) : "a"((size_t)0) : "memory");
    }

    /* "i" will contain the total amount of bytes that were actually transferred */
    i += m_count - i;

    /* we return "m_start" + the amount of bytes that were transferred */
    return (void *)(((size_t)m_start) + i);
}
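
For context, a minimal caller might look like the sketch below. It is not part of this commit: the name kernel_main is a placeholder, and it assumes the functions in sse.c are declared in vector_extentions/sse.h and that SSE2 hardware is present.

/* Hypothetical usage sketch, not part of this commit; kernel_main is a placeholder name */
#include <vector_extentions/sse.h>

void kernel_main(void)
{
    enable_sse();                           /* must run before any XMM instruction executes */

    if (test_sse() != 0) {
        for (;;) { }                        /* SSE add did not produce the expected sums; halt */
    }

    /* 16-byte aligned buffers, element count matching the helper's stride (multiple of 2) */
    double a[4]   __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};
    double b[4]   __attribute__((aligned(16))) = {0.5, 0.5, 0.5, 0.5};
    double sum[4] __attribute__((aligned(16)));

    sse2_add_double_arrays(sum, a, b, 4);
}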