audioconvert: add stereo deinterleave neon asm

This can take some shortcuts and convert twice as many samples in one
iteration as the strided stereo deinterleave one.
This commit is contained in:
Wim Taymans 2021-10-28 11:30:04 +02:00
parent 6fab8fabca
commit 0ace131d72
3 changed files with 84 additions and 0 deletions

View file

@ -28,6 +28,88 @@
#include "fmt-ops.h"
/*
 * Deinterleave stereo signed 16-bit samples into two planar float buffers.
 *
 * conv:      unused by this routine.
 * dst:       dst[0] receives the first sample of every frame, dst[1] the
 *            second; one float is written to each per frame.
 * src:       src[0] points at the interleaved int16_t input, two samples
 *            (4 bytes) per frame.
 * n_samples: number of stereo frames to convert.
 *
 * Each sample is sign-extended to 32 bits and converted as a fixed-point
 * value with 15 fractional bits, i.e. divided by 32768.
 *
 * The bulk loop handles 8 frames per iteration; the leftover (n_samples & 7)
 * frames are converted one at a time.
 */
void
conv_s16_to_f32d_2_neon(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
		uint32_t n_samples)
{
	const int16_t *s = src[0];
	float *d0 = dst[0], *d1 = dst[1];
	unsigned int remainder = n_samples & 7;

	/* n_samples now counts only the frames handled by the 8-wide loop */
	n_samples -= remainder;

#ifdef __aarch64__
	/*
	 * n_samples and remainder are 32-bit values, so every reference uses
	 * the %w modifier: the upper half of the underlying X register is not
	 * guaranteed to be zero and must not take part in cmp/subs.
	 */
	asm volatile(
		"      cmp %w[n_samples], #0\n"
		"      beq 2f\n"
		"1:"
		/* v2 = 8 x first channel, v3 = 8 x second channel */
		"      ld2 {v2.8h, v3.8h}, [%[s]], #32\n"
		"      subs %w[n_samples], %w[n_samples], #8\n"
		/* widen to s32; v2 is overwritten only after both of its halves
		 * were consumed, v3 only after its low half was consumed */
		"      sxtl v0.4s, v2.4h\n"
		"      sxtl2 v1.4s, v2.8h\n"
		"      sxtl v2.4s, v3.4h\n"
		"      sxtl2 v3.4s, v3.8h\n"
		/* fixed-point (15 fractional bits) to float */
		"      scvtf v0.4s, v0.4s, #15\n"
		"      scvtf v1.4s, v1.4s, #15\n"
		"      scvtf v2.4s, v2.4s, #15\n"
		"      scvtf v3.4s, v3.4s, #15\n"
		"      st1 {v0.4s, v1.4s}, [%[d0]], #32\n"
		"      st1 {v2.4s, v3.4s}, [%[d1]], #32\n"
		"      b.ne 1b\n"
		"2:"
		"      cmp %w[remainder], #0\n"
		"      beq 4f\n"
		"3:"
		/* one frame: lane 0 of v0/v1 gets the two channel samples */
		"      ld2 { v0.h, v1.h }[0], [%[s]], #4\n"
		"      subs %w[remainder], %w[remainder], #1\n"
		"      sshll v2.4s, v0.4h, #0\n"
		"      sshll v3.4s, v1.4h, #0\n"
		"      scvtf v0.4s, v2.4s, #15\n"
		"      scvtf v1.4s, v3.4s, #15\n"
		"      st1 { v0.s }[0], [%[d0]], #4\n"
		"      st1 { v1.s }[0], [%[d1]], #4\n"
		"      bne 3b\n"
		"4:"
		: [d0] "+r" (d0), [d1] "+r" (d1), [s] "+r" (s), [n_samples] "+r" (n_samples),
		  [remainder] "+r" (remainder)
		: : "v0", "v1", "v2", "v3", "memory", "cc");
#else
	asm volatile(
		"      cmp %[n_samples], #0\n"
		"      beq 2f\n"
		"1:"
		/* d0-d1 = 8 x first channel, d2-d3 = 8 x second channel */
		"      vld2.16 {d0-d3}, [%[s]]!\n"
		"      subs %[n_samples], #8\n"
		/* widen from the highest register down: q1 aliases d2-d3 and
		 * q0 aliases d0-d1, so each source must be read before the
		 * q register overlapping it is written */
		"      vmovl.s16 q3, d3\n"
		"      vmovl.s16 q2, d2\n"
		"      vmovl.s16 q1, d1\n"
		"      vmovl.s16 q0, d0\n"
		/* fixed-point (15 fractional bits) to float */
		"      vcvt.f32.s32 q3, q3, #15\n"
		"      vcvt.f32.s32 q2, q2, #15\n"
		"      vcvt.f32.s32 q1, q1, #15\n"
		"      vcvt.f32.s32 q0, q0, #15\n"
		"      vst1.32 {d4-d7}, [%[d1]]!\n"
		"      vst1.32 {d0-d3}, [%[d0]]!\n"
		"      bne 1b\n"
		"2:"
		"      cmp %[remainder], #0\n"
		"      beq 4f\n"
		"3:"
		/* A32 NEON structure loads have no immediate post-index form;
		 * '!' writes back the 4 bytes transferred for one frame */
		"      vld2.16 { d0[0], d1[0] }, [%[s]]!\n"
		"      subs %[remainder], %[remainder], #1\n"
		"      vmovl.s16 q1, d1\n"
		"      vmovl.s16 q0, d0\n"
		"      vcvt.f32.s32 q1, q1, #15\n"
		"      vcvt.f32.s32 q0, q0, #15\n"
		"      vst1.32 { d2[0] }, [%[d1]]!\n"
		"      vst1.32 { d0[0] }, [%[d0]]!\n"
		"      bne 3b\n"
		"4:"
		: [d0] "+r" (d0), [d1] "+r" (d1), [s] "+r" (s), [n_samples] "+r" (n_samples),
		  [remainder] "+r" (remainder)
		: : "q0", "q1", "q2", "q3", "memory", "cc");
#endif
}
static void
conv_s16_to_f32d_2s_neon(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)

View file

@ -66,6 +66,7 @@ static struct conv_info conv_table[] =
{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32, 0, 0, conv_s16_to_f32_c },
{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, 0, conv_s16d_to_f32d_c },
#if defined (HAVE_NEON)
{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 2, SPA_CPU_FLAG_NEON, conv_s16_to_f32d_2_neon },
{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 0, SPA_CPU_FLAG_NEON, conv_s16_to_f32d_neon },
#endif
#if defined (HAVE_AVX2)

View file

@ -295,6 +295,7 @@ DEFINE_FUNCTION(interleave_32, c);
DEFINE_FUNCTION(interleave_32s, c);
/* NEON variants; only present when the build defines HAVE_NEON.
 * NOTE(review): DEFINE_FUNCTION is defined outside this view; presumably it
 * declares conv_<name>_<suffix> (e.g. conv_s16_to_f32d_2_neon) - confirm. */
#if defined(HAVE_NEON)
DEFINE_FUNCTION(s16_to_f32d_2, neon);
DEFINE_FUNCTION(s16_to_f32d, neon);
DEFINE_FUNCTION(f32d_to_s16, neon);
#endif