AK+LibC: Always use REP MOVSB/STOSB for memcpy()/memset()

There's no great advantage to using MMX instructions here on modern
processors, since REP MOVSB/STOSB are optimized in microcode anyway
and tend to run much faster than MMX/SSE/AVX variants.

This also makes it much easier to implement high-level emulation of
memcpy/memset in UserspaceEmulator once we get there. :^)
Andreas Kling 2020-07-27 15:54:39 +02:00
parent 272dbb82ff
commit 1366557094
2 changed files with 9 additions and 100 deletions
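
Before the diffs, a self-contained sketch of the approach this commit switches to: a single REP MOVSB / REP STOSB inline-asm statement does all the work, with read-write ("+") constraints telling the compiler that EDI, ESI and ECX are consumed. The function names and the little test driver below are illustrative only (they are not the LibC symbols this commit touches); the real definitions appear in the second diff. Note that original_dest has to be saved up front, because REP MOVSB leaves the destination register pointing one past the end of the buffer.

    // Sketch only: mirrors the post-change rep movsb / rep stosb approach.
    // x86 or x86-64, GCC/Clang inline asm; names are illustrative, not LibC's.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static void* rep_movsb_copy(void* dest_ptr, const void* src_ptr, std::size_t n)
    {
        void* original_dest = dest_ptr;
        // REP MOVSB copies (E/R)CX bytes from [(E/R)SI] to [(E/R)DI], advancing all
        // three registers as it goes; "+" marks them as read-write operands.
        asm volatile(
            "rep movsb"
            : "+D"(dest_ptr), "+S"(src_ptr), "+c"(n)
            :
            : "memory");
        return original_dest;
    }

    static void* rep_stosb_fill(void* dest_ptr, int c, std::size_t n)
    {
        void* original_dest = dest_ptr;
        // REP STOSB stores AL into (E/R)CX consecutive bytes starting at [(E/R)DI].
        asm volatile(
            "rep stosb"
            : "+D"(dest_ptr), "+c"(n)
            : "a"(c)
            : "memory");
        return original_dest;
    }

    int main()
    {
        char src[16] = "hello, friends!";
        char dst[16];
        rep_movsb_copy(dst, src, sizeof src);
        std::printf("%s\n", dst); // prints: hello, friends!

        std::uint8_t buf[8];
        rep_stosb_fill(buf, 0xAB, sizeof buf);
        std::printf("%02x %02x\n", buf[0], buf[7]); // prints: ab ab
        return 0;
    }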


@@ -35,23 +35,11 @@
 #    include <string.h>
 #endif
 
-#if defined(__serenity__) && !defined(KERNEL)
-extern "C" void* mmx_memcpy(void* to, const void* from, size_t);
-#endif
-
 ALWAYS_INLINE void fast_u32_copy(u32* dest, const u32* src, size_t count)
 {
-#if defined(__serenity__) && !defined(KERNEL)
-    if (count >= 256) {
-        mmx_memcpy(dest, src, count * sizeof(count));
-        return;
-    }
-#endif
     asm volatile(
         "rep movsl\n"
-        : "=S"(src), "=D"(dest), "=c"(count)
-        : "S"(src), "D"(dest), "c"(count)
-        : "memory");
+        : "+S"(src), "+D"(dest), "+c"(count)::"memory");
 }
 
 ALWAYS_INLINE void fast_u32_fill(u32* dest, u32 value, size_t count)
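
For readability, here is the whole helper as it reads once the hunk above is applied. The u32 and ALWAYS_INLINE stand-ins are assumptions added so the snippet compiles on its own; they are not AK's actual definitions.

    #include <cstddef>
    #include <cstdint>

    using u32 = std::uint32_t;                                   // stand-in for AK's u32
    #define ALWAYS_INLINE inline __attribute__((always_inline))  // stand-in for AK's macro

    ALWAYS_INLINE void fast_u32_copy(u32* dest, const u32* src, std::size_t count)
    {
        // REP MOVSL copies `count` 32-bit words from src to dest; the single
        // "+S"/"+D"/"+c" list replaces the old matched "=S".../"S"... pairs.
        asm volatile(
            "rep movsl\n"
            : "+S"(src), "+D"(dest), "+c"(count)::"memory");
    }

The "+" (read-write) form expresses the old output/input constraint lists more tersely; the mmx_memcpy fast path for large copies is simply gone.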


@@ -139,103 +139,24 @@ int memcmp(const void* v1, const void* v2, size_t n)
 }
 
 #if ARCH(I386)
-void* mmx_memcpy(void* dest, const void* src, size_t len)
-{
-    ASSERT(len >= 1024);
-
-    auto* dest_ptr = (u8*)dest;
-    auto* src_ptr = (const u8*)src;
-
-    if ((u32)dest_ptr & 7) {
-        u32 prologue = 8 - ((u32)dest_ptr & 7);
-        len -= prologue;
-        asm volatile(
-            "rep movsb\n"
-            : "=S"(src_ptr), "=D"(dest_ptr), "=c"(prologue)
-            : "0"(src_ptr), "1"(dest_ptr), "2"(prologue)
-            : "memory");
-    }
-    for (u32 i = len / 64; i; --i) {
-        asm volatile(
-            "movq (%0), %%mm0\n"
-            "movq 8(%0), %%mm1\n"
-            "movq 16(%0), %%mm2\n"
-            "movq 24(%0), %%mm3\n"
-            "movq 32(%0), %%mm4\n"
-            "movq 40(%0), %%mm5\n"
-            "movq 48(%0), %%mm6\n"
-            "movq 56(%0), %%mm7\n"
-            "movq %%mm0, (%1)\n"
-            "movq %%mm1, 8(%1)\n"
-            "movq %%mm2, 16(%1)\n"
-            "movq %%mm3, 24(%1)\n"
-            "movq %%mm4, 32(%1)\n"
-            "movq %%mm5, 40(%1)\n"
-            "movq %%mm6, 48(%1)\n"
-            "movq %%mm7, 56(%1)\n" ::"r"(src_ptr),
-            "r"(dest_ptr)
-            : "memory");
-        src_ptr += 64;
-        dest_ptr += 64;
-    }
-    asm volatile("emms" ::
-                 : "memory");
-    // Whatever remains we'll have to memcpy.
-    len %= 64;
-    if (len)
-        memcpy(dest_ptr, src_ptr, len);
-    return dest;
-}
-
 void* memcpy(void* dest_ptr, const void* src_ptr, size_t n)
 {
-    if (n >= 1024)
-        return mmx_memcpy(dest_ptr, src_ptr, n);
-
-    u32 dest = (u32)dest_ptr;
-    u32 src = (u32)src_ptr;
-    // FIXME: Support starting at an unaligned address.
-    if (!(dest & 0x3) && !(src & 0x3) && n >= 12) {
-        size_t u32s = n / sizeof(u32);
-        asm volatile(
-            "rep movsl\n"
-            : "=S"(src), "=D"(dest)
-            : "S"(src), "D"(dest), "c"(u32s)
-            : "memory");
-        n -= u32s * sizeof(u32);
-        if (n == 0)
-            return dest_ptr;
-    }
+    void* original_dest = dest_ptr;
     asm volatile(
-        "rep movsb\n" ::"S"(src), "D"(dest), "c"(n)
-        : "memory");
-    return dest_ptr;
+        "rep movsb"
+        : "+D"(dest_ptr), "+S"(src_ptr), "+c"(n)::"memory");
+    return original_dest;
 }
 
 void* memset(void* dest_ptr, int c, size_t n)
 {
-    u32 dest = (u32)dest_ptr;
-    // FIXME: Support starting at an unaligned address.
-    if (!(dest & 0x3) && n >= 12) {
-        size_t u32s = n / sizeof(u32);
-        u32 expanded_c = (u8)c;
-        expanded_c |= expanded_c << 8;
-        expanded_c |= expanded_c << 16;
-        asm volatile(
-            "rep stosl\n"
-            : "=D"(dest)
-            : "D"(dest), "c"(u32s), "a"(expanded_c)
-            : "memory");
-        n -= u32s * sizeof(u32);
-        if (n == 0)
-            return dest_ptr;
-    }
+    void* original_dest = dest_ptr;
     asm volatile(
         "rep stosb\n"
-        : "=D"(dest), "=c"(n)
-        : "0"(dest), "1"(n), "a"(c)
+        : "=D"(dest_ptr), "=c"(n)
+        : "0"(dest_ptr), "1"(n), "a"(c)
        : "memory");
-    return dest_ptr;
+    return original_dest;
 }
 #else
 void* memcpy(void* dest_ptr, const void* src_ptr, size_t n)
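
The #else branch that begins at the end of this hunk is the portable fallback for non-i386 builds; its body is not shown here. For orientation only, a plain byte loop with the same contract might look like the sketch below, an assumption for illustration rather than the code behind that #else.

    #include <cstddef>

    // Hypothetical portable fallback with memcpy()'s contract; illustration only.
    static void* byte_copy(void* dest_ptr, const void* src_ptr, std::size_t n)
    {
        auto* pd = static_cast<unsigned char*>(dest_ptr);
        auto* ps = static_cast<const unsigned char*>(src_ptr);
        while (n--)
            *pd++ = *ps++; // copy one byte at a time, lowest to highest address
        return dest_ptr;
    }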