/* Espresso/lib/vector_extentions/sse.c */
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>
#include <vector_extentions/sse.h>
void enable_sse(void)
{
    uint32_t cr0, cr4;

    /* CR0: clear EM (bit 2) so SSE/x87 instructions execute natively instead of
       faulting, and set MP (bit 1) so WAIT/FWAIT honours the TS flag. */
    __asm__ volatile ("mov %%cr0, %0" : "=r"(cr0));
    cr0 &= ~(1 << 2); /* EM = 0 */
    cr0 |= (1 << 1);  /* MP = 1 */
    __asm__ volatile ("mov %0, %%cr0" :: "r"(cr0));

    /* CR4: advertise OS support for FXSAVE/FXRSTOR and unmasked SIMD FP exceptions. */
    __asm__ volatile ("mov %%cr4, %0" : "=r"(cr4));
    cr4 |= (1 << 9);  /* OSFXSR = 1 */
    cr4 |= (1 << 10); /* OSXMMEXCPT = 1 */
    __asm__ volatile ("mov %0, %%cr4" :: "r"(cr4));
}
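/*
 * A minimal sketch (not part of the original file): before calling
 * enable_sse(), a caller may want to confirm that the CPU actually reports
 * SSE/SSE2 support. CPUID leaf 1 exposes SSE in EDX bit 25 and SSE2 in
 * EDX bit 26.
 */
static inline bool cpu_has_sse2(void)
{
    uint32_t eax, ebx, ecx, edx;
    __asm__ volatile ("cpuid"
                      : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                      : "a"(1), "c"(0));
    (void)eax; (void)ebx; (void)ecx; /* only EDX is inspected here */
    return (edx & (1u << 26)) != 0;  /* bit 25 = SSE, bit 26 = SSE2 */
}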
// Basic SSE test: add two arrays of 4 floats using xmm registers
__attribute__((force_align_arg_pointer))
int32_t test_sse(void)
{
    float a[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] __attribute__((aligned(16))) = {5.0f, 6.0f, 7.0f, 8.0f};
    float result[4] __attribute__((aligned(16)));

    asm volatile (
        "movaps %1, %%xmm0\n\t"    /* xmm0 = a */
        "movaps %2, %%xmm2\n\t"    /* xmm2 = b */
        "addps %%xmm2, %%xmm0\n\t" /* xmm0 += xmm2, four packed floats at once */
        "movaps %%xmm0, %0\n\t"    /* result = xmm0 */
        : "=m" (result)
        : "m" (a), "m" (b)
        : "xmm0", "xmm2"
    );

    if (result[0] != 6.0f || result[1] != 8.0f || result[2] != 10.0f || result[3] != 12.0f)
    {
        return -1;
    }
    return 0;
}
/* Adds two arrays of doubles, two elements per iteration.
   All three pointers must be 16-byte aligned (movapd) and count must be a
   multiple of 2; there is no scalar tail. */
__attribute__((force_align_arg_pointer))
void sse2_add_double_arrays(double *dst, const double *a, const double *b, size_t count)
{
    for (size_t i = 0; i < count; i += 2)
    {
        asm volatile (
            "movapd (%1), %%xmm0\n\t"  /* xmm0 = a[i..i+1] */
            "movapd (%2), %%xmm1\n\t"  /* xmm1 = b[i..i+1] */
            "addpd %%xmm1, %%xmm0\n\t" /* packed double add */
            "movapd %%xmm0, (%0)\n\t"  /* dst[i..i+1] = xmm0 */
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}
/* Adds two arrays of int64s, two elements per iteration.
   Pointers must be 16-byte aligned (movdqa) and count a multiple of 2. */
__attribute__((force_align_arg_pointer))
void sse2_add_int64_arrays(int64_t *dst, const int64_t *a, const int64_t *b, size_t count)
{
    for (size_t i = 0; i < count; i += 2)
    {
        asm volatile (
            "movdqa (%1), %%xmm0\n\t"  /* xmm0 = a[i..i+1] */
            "movdqa (%2), %%xmm1\n\t"  /* xmm1 = b[i..i+1] */
            "paddq %%xmm1, %%xmm0\n\t" /* packed 64-bit integer add */
            "movdqa %%xmm0, (%0)\n\t"  /* dst[i..i+1] = xmm0 */
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}
/* Adds two arrays of int32s, four elements per iteration.
   Pointers must be 16-byte aligned (movdqa) and count a multiple of 4. */
__attribute__((force_align_arg_pointer))
void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, size_t count)
{
    for (size_t i = 0; i < count; i += 4)
    {
        asm volatile (
            "movdqa (%1), %%xmm0\n\t"  /* xmm0 = a[i..i+3] */
            "movdqa (%2), %%xmm1\n\t"  /* xmm1 = b[i..i+3] */
            "paddd %%xmm1, %%xmm0\n\t" /* packed 32-bit integer add */
            "movdqa %%xmm0, (%0)\n\t"  /* dst[i..i+3] = xmm0 */
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}
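/*
 * Example usage (a sketch, not part of the original file): the packed-add
 * helpers above use movapd/movdqa and have no scalar tail, so all three
 * buffers must be 16-byte aligned and count must be a multiple of the vector
 * width (2 for double/int64, 4 for int32).
 *
 *     double a[4]   __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};
 *     double b[4]   __attribute__((aligned(16))) = {5.0, 6.0, 7.0, 8.0};
 *     double sum[4] __attribute__((aligned(16)));
 *     sse2_add_double_arrays(sum, a, b, 4);  // sum = {6.0, 8.0, 10.0, 12.0}
 */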
__attribute__((force_align_arg_pointer))
void *sse2_memcpy(void *dst, const void *src, uint32_t n)
{
    void *ret = dst; /* memcpy conventionally returns the original destination */
    uint8_t *d = (uint8_t *)dst;
    const uint8_t *s = (const uint8_t *)src;

    /* Copy byte-by-byte until the destination is 16-byte aligned */
    while (((uintptr_t)d & 15) && n > 0)
    {
        *d++ = *s++;
        n--;
    }

    /* Copy 16-byte chunks with SSE2: the destination is now aligned, but the
       source may not be, so it is loaded with the unaligned movdqu. */
    while (n >= 16)
    {
        asm volatile (
            "movdqu (%1), %%xmm0\n\t" /* load 16 bytes from the source */
            "movdqa %%xmm0, (%0)\n\t" /* store 16 bytes to the aligned destination */
            :
            : "r"(d), "r"(s)
            : "xmm0", "memory"
        );
        d += 16;
        s += 16;
        n -= 16;
    }

    /* Handle remaining bytes (fewer than 16) */
    while (n > 0)
    {
        *d++ = *s++;
        n--;
    }
    return ret; /* Return pointer to the original destination */
}
__attribute__((force_align_arg_pointer))
char *sse2_strncpy(char *dest, const char *src, uint32_t n)
{
    uint32_t i = 0;

    /* Copy byte-by-byte until the destination is 16-byte aligned */
    while (((uintptr_t)(dest + i) & 15) && i < n && src[i])
    {
        dest[i] = src[i];
        i++;
    }

    /* Bulk copy in 16-byte blocks. Note: each block reads a full 16 bytes
       from src, so src must have at least n readable bytes even if the
       string terminates earlier. */
    for (; i + 15 < n; i += 16)
    {
        asm volatile (
            "movdqu (%1), %%xmm0\n\t"
            "movdqu %%xmm0, (%0)\n\t"
            :
            : "r"(dest + i), "r"(src + i)
            : "xmm0", "memory"
        );
        /* Check whether the just-copied block contained the terminator */
        for (int j = 0; j < 16; ++j)
        {
            if (src[i + j] == '\0')
            {
                /* Null found: zero the rest of this block, then zero-fill
                   the remainder of dest as strncpy requires. */
                for (int k = j + 1; k < 16 && i + k < n; ++k)
                {
                    dest[i + k] = '\0';
                }
                i += 16;
                goto pad;
            }
        }
    }
    /* Final bytes up to the terminator */
    for (; i < n && src[i]; ++i)
    {
        dest[i] = src[i];
    }
pad:
    /* Zero-fill the rest of dest */
    for (; i < n; ++i)
    {
        dest[i] = '\0';
    }
    return dest;
}
__attribute__((force_align_arg_pointer))
void double_vector_to_int_vector(const double *src, int32_t *dst)
{
    /* Truncates 2 doubles to 2 int32s. src and dst must be 16-byte aligned;
       dst needs room for 4 int32s because the full 16-byte store writes the
       upper two lanes as zero. */
    asm volatile (
        "pxor %%xmm0, %%xmm0\n\t"    /* zero xmm0 */
        "cvttpd2dq (%1), %%xmm1\n\t" /* truncate 2 doubles at src to 2 int32s in the low half of xmm1 */
        "movq %%xmm1, %%xmm0\n\t"    /* move the low 64 bits (2 ints) into the zeroed xmm0 */
        "movdqa %%xmm0, (%0)\n\t"    /* store all 16 bytes to dst */
        :
        : "r"(dst), "r"(src)
        : "xmm0", "xmm1", "memory"
    );
}
__attribute__((force_align_arg_pointer))
void int_vector_to_double_vector(const int32_t *src, double *dst)
{
    /* Converts 2 int32s to 2 doubles. src may be unaligned (movq), but dst
       must be 16-byte aligned for the movapd store. */
    asm volatile (
        "movq (%1), %%xmm0\n\t"       /* load 2 int32s (64 bits) into xmm0 */
        "cvtdq2pd %%xmm0, %%xmm1\n\t" /* convert to 2 doubles */
        "movapd %%xmm1, (%0)\n\t"     /* store to destination */
        :
        : "r"(dst), "r"(src)
        : "xmm0", "xmm1", "memory"
    );
}
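/*
 * Example usage (a sketch, not part of the original file): both conversion
 * helpers work on 2-element vectors, and the memory operands of cvttpd2dq
 * and movapd/movdqa require 16-byte aligned pointers.
 *
 *     double d[2]    __attribute__((aligned(16))) = {3.7, -1.2};
 *     int32_t out[4] __attribute__((aligned(16)));
 *     double_vector_to_int_vector(d, out);  // out[0] = 3, out[1] = -1, out[2..3] = 0
 */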
void * memclr_sse2(const void * const m_start, const size_t m_count)
{
    /* "i" counts how many bytes have been cleared so far */
    size_t i = 0;

    /* Clear byte-by-byte until "m_start" + i is aligned on an SSE_XMM_SIZE
       boundary, taking care not to clear more than m_count bytes. */
    while ((((size_t)m_start + i) & (SSE_XMM_SIZE - 1)) && i < m_count)
    {
        ((uint8_t *)(size_t)m_start)[i] = 0;
        i++;
    }

    /* Zero out XMM0 */
    asm volatile ("pxor %%xmm0, %%xmm0" ::: "xmm0");

    /* Clear 64-byte chunks of memory (4 aligned 16-byte stores per pass) */
    for (; i + 64 <= m_count; i += 64)
    {
        asm volatile (
            "movdqa %%xmm0, 0(%0)\n\t"  /* bytes  0..15 */
            "movdqa %%xmm0, 16(%0)\n\t" /* bytes 16..31 */
            "movdqa %%xmm0, 32(%0)\n\t" /* bytes 32..47 */
            "movdqa %%xmm0, 48(%0)\n\t" /* bytes 48..63 */
            :
            : "r"((size_t)m_start + i)
            : "memory"
        );
    }

    /* Clear the remaining bytes (if any) with rep stosb */
    {
        size_t dst = (size_t)m_start + i;
        size_t rem = m_count - i;
        asm volatile ("rep stosb"
                      : "+D"(dst), "+c"(rem)
                      : "a"(0)
                      : "memory");
    }

    /* All m_count bytes have now been cleared */
    i = m_count;

    /* Return "m_start" plus the number of bytes that were cleared */
    return (void *)((size_t)m_start + i);
}