Espresso/lib/vector_extensions/sse.c
#include <types.h>
#include <stdio.h>
#include <emmintrin.h> /* SSE2 intrinsics, TODO: use these in all functions, currently only used in memclr_sse2 */
#include <vector_extensions/sse.h>
void enable_sse(void)
{
    uint32_t cr0, cr4;
    __asm__ volatile ("mov %%cr0, %0" : "=r"(cr0));
    cr0 &= ~(1 << 2);  /* clear CR0.EM: no coprocessor emulation */
    cr0 |= (1 << 1);   /* set CR0.MP: monitor coprocessor */
    __asm__ volatile ("mov %0, %%cr0" :: "r"(cr0));
    __asm__ volatile ("mov %%cr4, %0" : "=r"(cr4));
    cr4 |= (1 << 9);   /* set CR4.OSFXSR: enable FXSAVE/FXRSTOR and SSE */
    cr4 |= (1 << 10);  /* set CR4.OSXMMEXCPT: enable unmasked SIMD FP exceptions */
    __asm__ volatile ("mov %0, %%cr4" :: "r"(cr4));
}
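/*
 * Hypothetical helper sketch (not part of the original file): check for SSE2
 * support via CPUID before calling enable_sse().  CPUID leaf 1, EDX bit 26 is
 * the architectural SSE2 feature flag; the helper name is an assumption.
 */
static inline int cpu_has_sse2(void)
{
    uint32_t eax, ebx, ecx, edx;
    __asm__ volatile ("cpuid"
                      : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                      : "a"(1), "c"(0));
    return (edx >> 26) & 1;   /* EDX bit 26 = SSE2 */
}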
/* Basic SSE test: add two arrays of 4 floats using xmm registers */
__attribute__((force_align_arg_pointer, target("sse2")))
int32_t test_sse(void)
{
    float a[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] __attribute__((aligned(16))) = {5.0f, 6.0f, 7.0f, 8.0f};
    float result[4] __attribute__((aligned(16)));
    asm volatile (
        "movaps %1, %%xmm0\n\t"
        "movaps %2, %%xmm2\n\t"
        "addps %%xmm2, %%xmm0\n\t"
        "movaps %%xmm0, %0\n\t"
        : "=m" (result)
        : "m" (a), "m" (b)
        : "xmm0", "xmm2"
    );
    if (result[0] != 6.0f || result[1] != 8.0f || result[2] != 10.0f || result[3] != 12.0f)
    {
        return -1;
    }
    return 0;
}
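/*
 * Hypothetical boot-time sketch (not part of the original file): the ordering
 * is the important part -- enable_sse() must run before any SSE instruction,
 * including those inside test_sse().  The caller name, the use of
 * cpu_has_sse2() from the sketch above, and the availability of printf are
 * assumptions.
 */
static void vector_extensions_init(void)
{
    if (!cpu_has_sse2())
    {
        return;                 /* no SSE2: leave CR0/CR4 untouched */
    }
    enable_sse();               /* set the CR0/CR4 bits first */
    if (test_sse() != 0)
    {
        printf("SSE self-test failed\n");   /* assumes the kernel's printf */
    }
}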
__attribute__((force_align_arg_pointer, target("sse2")))
void sse2_add_double_arrays(double *dst, const double *a, const double *b, size_t count)
{
    /* Assumes 16-byte aligned operands and count a multiple of 2 (movapd faults on unaligned addresses) */
    for (size_t i = 0; i < count; i += 2)
    {
        asm volatile (
            "movapd (%1), %%xmm0\n\t"
            "movapd (%2), %%xmm1\n\t"
            "addpd %%xmm1, %%xmm0\n\t"
            "movapd %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}
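/*
 * Per the TODO at the top of the file, the same operation expressed with the
 * emmintrin.h intrinsics instead of inline asm.  A sketch only: the function
 * name is an assumption, and it keeps the same requirements as above
 * (16-byte aligned pointers, count a multiple of 2).
 */
__attribute__((force_align_arg_pointer, target("sse2")))
void sse2_add_double_arrays_intrin(double *dst, const double *a, const double *b, size_t count)
{
    for (size_t i = 0; i < count; i += 2)
    {
        __m128d va = _mm_load_pd(a + i);              /* aligned 16-byte load */
        __m128d vb = _mm_load_pd(b + i);
        _mm_store_pd(dst + i, _mm_add_pd(va, vb));    /* packed add, aligned store */
    }
}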
__attribute__((force_align_arg_pointer, target("sse2")))
void sse2_add_int64_arrays(int64_t *dst, const int64_t *a, const int64_t *b, size_t count)
{
    /* Assumes 16-byte aligned operands and count a multiple of 2 */
    for (size_t i = 0; i < count; i += 2)
    {
        asm volatile (
            "movdqa (%1), %%xmm0\n\t"
            "movdqa (%2), %%xmm1\n\t"
            "paddq %%xmm1, %%xmm0\n\t"
            "movdqa %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}
__attribute__((force_align_arg_pointer, target("sse2")))
void sse2_add_int32_arrays(int32_t *dst, const int32_t *a, const int32_t *b, size_t count)
{
    /* Assumes 16-byte aligned operands and count a multiple of 4 */
    for (size_t i = 0; i < count; i += 4)
    {
        asm volatile (
            "movdqa (%1), %%xmm0\n\t"
            "movdqa (%2), %%xmm1\n\t"
            "paddd %%xmm1, %%xmm0\n\t"
            "movdqa %%xmm0, (%0)\n\t"
            :
            : "r"(dst + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
}
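/*
 * Hypothetical usage sketch: the three array-add routines above require
 * 16-byte aligned operands and a count that is a multiple of the vector
 * width (2 for double/int64_t, 4 for int32_t).  The function and variable
 * names in this example are illustrative only.
 */
static void example_add_int32(void)
{
    int32_t xs[4]   __attribute__((aligned(16))) = {1, 2, 3, 4};
    int32_t ys[4]   __attribute__((aligned(16))) = {10, 20, 30, 40};
    int32_t sums[4] __attribute__((aligned(16)));

    sse2_add_int32_arrays(sums, xs, ys, 4);   /* sums = {11, 22, 33, 44} */
}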
__attribute__((force_align_arg_pointer, target("sse2")))
void *sse2_memcpy(void *dst, const void *src, uint32_t n)
{
    uint8_t *d = (uint8_t *)dst;
    const uint8_t *s = (const uint8_t *)src;
    uintptr_t i = 0;
    /* Align destination to 16 bytes */
    while (((uintptr_t)d & 15) && n > 0)
    {
        *d++ = *s++;
        n--;
    }
    /* Bulk copy: aligned stores to the destination; the source is loaded with
     * movdqu because it is not necessarily 16-byte aligned after the
     * destination has been aligned */
    for (; i + 15 < n; i += 16)
    {
        asm volatile (
            "movdqu (%1), %%xmm0\n\t"
            "movdqa %%xmm0, (%0)\n\t"
            :
            : "r"(d + i), "r"(s + i)
            : "xmm0", "memory"
        );
    }
    d += i;
    s += i;
    n -= i;
    /* Copy any remaining bytes */
    while (n--)
    {
        *d++ = *s++;
    }
    return dst;
}
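/*
 * Per the TODO at the top of the file, the aligned bulk-copy loop in
 * sse2_memcpy() could also be written with the emmintrin.h intrinsics instead
 * of inline asm.  A sketch of just that inner loop, under the same conditions
 * (d 16-byte aligned, s possibly unaligned):
 *
 *     for (; i + 15 < n; i += 16)
 *     {
 *         __m128i v = _mm_loadu_si128((const __m128i *)(s + i));
 *         _mm_store_si128((__m128i *)(d + i), v);
 *     }
 */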
__attribute__((force_align_arg_pointer, target("sse2")))
char *sse2_strncpy(char *dest, const char *src, uint32_t n)
{
    uint32_t i = 0;
    /* Copy byte-by-byte until the destination is 16-byte aligned */
    while (((uintptr_t)(dest + i) & 15) && i < n && src[i])
    {
        dest[i] = src[i];
        i++;
    }
    /* Bulk copy in 16-byte blocks.  Note: movdqu reads a full 16 bytes from
     * src even when the string terminates inside the block, so src must
     * remain readable up to the next 16-byte boundary. */
    for (; i + 15 < n; i += 16)
    {
        asm volatile (
            "movdqu (%1), %%xmm0\n\t"
            "movdqu %%xmm0, (%0)\n\t"
            :
            : "r"(dest + i), "r"(src + i)
            : "xmm0", "memory"
        );
        /* Check whether the just-copied block contained the terminator */
        for (int j = 0; j < 16; ++j)
        {
            if (src[i + j] == '\0')
            {
                /* Null found: zero the rest of the block, then jump straight
                 * to the zero-fill loop so no bytes past the terminator are
                 * copied from src */
                for (int k = j + 1; k < 16 && i + k < n; ++k)
                {
                    dest[i + k] = '\0';
                }
                i += 16;
                goto pad;
            }
        }
    }
    /* Copy any remaining bytes up to n or the terminator */
    for (; i < n && src[i]; ++i)
    {
        dest[i] = src[i];
    }
pad:
    /* strncpy semantics: zero-fill the remainder of dest */
    for (; i < n; ++i)
    {
        dest[i] = '\0';
    }
    return dest;
}
__attribute__((force_align_arg_pointer, target("sse2")))
void double_vector_to_int_vector(const double *src, int32_t *dst)
{
    /* Both pointers must be 16-byte aligned; the movdqa store writes a full
     * 16 bytes, so dst needs room for four int32_t (the upper two are zero) */
    asm volatile (
        "pxor %%xmm0, %%xmm0\n\t"      /* zero xmm0 */
        "cvttpd2dq (%1), %%xmm1\n\t"   /* truncate 2 doubles at src to 2 int32s */
        "movq %%xmm1, %%xmm0\n\t"      /* move low 64 bits (2 ints) to xmm0 */
        "movdqa %%xmm0, (%0)\n\t"      /* store 16 bytes to dst */
        :
        : "r"(dst), "r"(src)
        : "xmm0", "xmm1", "memory"
    );
}
__attribute__((force_align_arg_pointer, target("sse2")))
void int_vector_to_double_vector(const int32_t *src, double *dst)
{
    /* dst must be 16-byte aligned for the movapd store; src may be unaligned
     * since movq has no alignment requirement */
    asm volatile (
        "movq (%1), %%xmm0\n\t"        /* load 2 int32s (64 bits) into xmm0 */
        "cvtdq2pd %%xmm0, %%xmm1\n\t"  /* convert to 2 doubles */
        "movapd %%xmm1, (%0)\n\t"      /* store to destination */
        :
        : "r"(dst), "r"(src)
        : "xmm0", "xmm1", "memory"
    );
}
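/*
 * Hypothetical usage sketch: both conversion helpers operate on fixed-size,
 * 16-byte aligned buffers.  double_vector_to_int_vector() stores a full 16
 * bytes, so its dst must hold four int32_t even though only two carry data.
 * The function and variable names in this example are illustrative only.
 */
static void example_convert(void)
{
    double d[2]    __attribute__((aligned(16))) = {3.7, -1.2};
    int32_t i4[4]  __attribute__((aligned(16)));
    double back[2] __attribute__((aligned(16)));

    double_vector_to_int_vector(d, i4);    /* i4[0] = 3, i4[1] = -1 (truncated) */
    int_vector_to_double_vector(i4, back); /* back[0] = 3.0, back[1] = -1.0 */
}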
__attribute__((force_align_arg_pointer, target("sse2")))
void* memclr_sse2(void *m_start, size_t m_count)
{
    unsigned char *dst = m_start;
    size_t i = 0;
    /* Clear byte-by-byte until dst + i is 16-byte aligned */
    while (((uintptr_t)(dst + i) & 15) && i < m_count)
    {
        dst[i++] = 0;
    }
    /* Clear 64 bytes per iteration with four aligned 16-byte stores */
    __m128i zero = _mm_setzero_si128();
    for (; i + 64 <= m_count; i += 64)
    {
        _mm_store_si128((__m128i *)(dst + i + 0), zero);
        _mm_store_si128((__m128i *)(dst + i + 16), zero);
        _mm_store_si128((__m128i *)(dst + i + 32), zero);
        _mm_store_si128((__m128i *)(dst + i + 48), zero);
    }
    /* Clear the remaining tail bytes */
    for (; i < m_count; ++i)
    {
        dst[i] = 0;
    }
    return m_start;
}
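/*
 * Hypothetical usage sketch: memclr_sse2() handles arbitrary alignment and
 * size (the unaligned head and sub-64-byte tail are cleared byte by byte), so
 * it can stand in for memset(p, 0, n).  The page-sized buffer and the
 * function name below are illustrative only.
 */
static uint8_t page_buf[4096] __attribute__((aligned(16)));

static void example_clear_page(void)
{
    memclr_sse2(page_buf, sizeof(page_buf));
}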