Espresso 0.0.2a

2026-02-12 20:33:46 -06:00
parent c0dc95e255
commit 021fdbbcef
26 changed files with 452 additions and 27315 deletions
--- a/lib/vector_extensions/sse.c
+++ b/lib/vector_extensions/sse.c
@@ -1,6 +1,8 @@
 #include <types.h>
 #include <stdio.h>

+#include <emmintrin.h> /* SSE2 intrinsics, TODO: use these in all functions, currently only used in memclr_sse2 */
+
 #include <vector_extensions/sse.h>


@@ -20,7 +22,7 @@ void enable_sse(void)
 }

 /* Basic SSE test: add two arrays of 4 floats using xmm registers */
-__attribute__((force_align_arg_pointer))
+__attribute__((force_align_arg_pointer, target("sse2")))
 int32_t test_sse(void)
 {
  float a[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
@@ -45,7 +47,7 @@ int32_t test_sse(void)
  return 0;
 }

-__attribute__((force_align_arg_pointer))
+__attribute__((force_align_arg_pointer, target("sse2")))
 void sse2_add_double_arrays(double *dst, const double *a, const double *b, size_t count)
 {
  for (size_t i = 0; i < count; i += 2)
@@ -62,7 +64,7 @@ void sse2_add_double_arrays(double *dst, const double *a, const double *b, size_
  }
 }

-__attribute__((force_align_arg_pointer))
+__attribute__((force_align_arg_pointer, target("sse2")))
 void sse2_add_int64_arrays(int64_t *dst, const int64_t *a, const int64_t *b, size_t count)
 {
  for (size_t i = 0; i < count; i += 2)
@@ -79,7 +81,7 @@ void sse2_add_int64_arrays(int64_t *dst, const int64_t *a, const int64_t *b, siz
  }
 }

-__attribute__((force_align_arg_pointer))
+__attribute__((force_align_arg_pointer, target("sse2")))
 void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, size_t count)
 {
  for (size_t i = 0; i < count; i += 4)
@@ -96,7 +98,7 @@ void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, siz
  }
 }

-__attribute__((force_align_arg_pointer))
+__attribute__((force_align_arg_pointer, target("sse2")))
 void *sse2_memcpy(void *dst, const void *src, uint32_t n)
 {
  uint8_t *d = (uint8_t *)dst;
@@ -136,7 +138,7 @@ void *sse2_memcpy(void *dst, const void *src, uint32_t n)
 }


-__attribute__((force_align_arg_pointer))
+__attribute__((force_align_arg_pointer, target("sse2")))
 char *sse2_strncpy(char *dest, const char *src, uint32_t n)
 {
  uint32_t i = 0;
@@ -218,48 +220,31 @@ void int_vector_to_double_vector(const int32_t *src, double *dst)
  );
 }

-void * memclr_sse2(const void * const m_start, const size_t m_count)
+__attribute__((force_align_arg_pointer, target("sse2")))
+void* memclr_sse2(void *m_start, size_t m_count) /* zeroes m_count bytes starting at m_start; returns m_start */
 {
-	
-  /* "i" is our counter of how many bytes we've cleared */
-  size_t i;
+  unsigned char *dst = m_start;
+  size_t i = 0; /* number of bytes cleared so far */

-  /* find out if "m_start" is aligned on a SSE_XMM_SIZE boundary */
-  if ((size_t)m_start & (SSE_XMM_SIZE - 1))
+  while ((uintptr_t)(dst + i) & 15 && i < m_count) /* byte stores until dst + i is 16-byte aligned or m_count is exhausted */
  {
-    i = 0;
-
-    /* we need to clear byte-by-byte until "m_start" is aligned on an SSE_XMM_SIZE boundary */
-    /* ... and lets make sure we don't copy 'too' many bytes (i < m_count) */
-    while (((size_t)m_start + i) & (SSE_XMM_SIZE - 1) && i < m_count)
-    {
-      asm volatile ("stosb;" :: "D"((size_t)m_start + i), "a"(0));
-      i++;
-    }
+    dst[i++] = 0;
  }
-  else
+
+  __m128i zero = _mm_setzero_si128(); /* all-zero 128-bit value written by the stores below */
+  
+  for (; i + 64 <= m_count; i += 64) /* clear 64 bytes per iteration as four 16-byte stores */
  {
-    /* if "m_start" was aligned, set our count to 0 */
-    i = 0;
+    _mm_store_si128((__m128i *)(dst + i + 0), zero);
+    _mm_store_si128((__m128i *)(dst + i + 16), zero);
+    _mm_store_si128((__m128i *)(dst + i + 32), zero);
+    _mm_store_si128((__m128i *)(dst + i + 48), zero);
  }
- 
-  asm volatile ("pxor %%xmm0,%%xmm0"::); /* zero out XMM0 */
-  /* clear 64-byte chunks of memory (4 16-byte operations) */
-  for(; i + 64 <= m_count; i += 64)
+
+  for (; i < m_count; ++i) /* clear the remaining tail (fewer than 64 bytes) one byte at a time */
  {
-    asm volatile (" movdqa %%xmm0, 0(%0);	"    /* move 16 bytes from XMM0 to %0 + 0 */
-                  " movdqa %%xmm0, 16(%0);	"
-                  " movdqa %%xmm0, 32(%0);	"
-                  " movdqa %%xmm0, 48(%0);	"
-                  :: "r"((size_t)m_start + i));
+    dst[i] = 0;
  }
- 
-  /* copy the remaining bytes (if any) */
-  asm volatile (" rep stosb; " :: "a"((size_t)(0)), "D"(((size_t)m_start) + i), "c"(m_count - i));
-
-  /* "i" will contain the total amount of bytes that were actually transfered */
-  i += m_count - i;
-
-  /* we return "m_start" + the amount of bytes that were transfered */
-  return (void *)(((size_t)m_start) + i);
+  
+  return m_start; /* NOTE: the removed version returned m_start + m_count (the end pointer) instead */
 }