mirror of
https://gitlab.freedesktop.org/pipewire/pipewire
synced 2024-09-19 16:01:45 +00:00
audioconvert: add stereo deinterleave neon asm
The dedicated stereo deinterleave function can take some shortcuts and converts twice as many samples per iteration as the strided stereo deinterleave function.
This commit is contained in:
parent
6fab8fabca
commit
0ace131d72
|
@ -28,6 +28,88 @@
|
|||
|
||||
#include "fmt-ops.h"
|
||||
|
||||
void
|
||||
conv_s16_to_f32d_2_neon(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
||||
uint32_t n_samples)
|
||||
{
|
||||
const int16_t *s = src[0];
|
||||
float *d0 = dst[0], *d1 = dst[1];
|
||||
unsigned int remainder = n_samples & 7;
|
||||
n_samples -= remainder;
|
||||
|
||||
#ifdef __aarch64__
|
||||
asm volatile(
|
||||
" cmp %[n_samples], #0\n"
|
||||
" beq 2f\n"
|
||||
"1:"
|
||||
" ld2 {v2.8h, v3.8h}, [%[s]], #32\n"
|
||||
" subs %w[n_samples], %w[n_samples], #8\n"
|
||||
" sxtl v0.4s, v2.4h\n"
|
||||
" sxtl2 v1.4s, v2.8h\n"
|
||||
" sxtl v2.4s, v3.4h\n"
|
||||
" sxtl2 v3.4s, v3.8h\n"
|
||||
" scvtf v0.4s, v0.4s, #15\n"
|
||||
" scvtf v1.4s, v1.4s, #15\n"
|
||||
" scvtf v2.4s, v2.4s, #15\n"
|
||||
" scvtf v3.4s, v3.4s, #15\n"
|
||||
" st1 {v0.4s, v1.4s}, [%[d0]], #32\n"
|
||||
" st1 {v2.4s, v3.4s}, [%[d1]], #32\n"
|
||||
" b.ne 1b\n"
|
||||
"2:"
|
||||
" cmp %[remainder], #0\n"
|
||||
" beq 4f\n"
|
||||
"3:"
|
||||
" ld2 { v0.h, v1.h }[0], [%[s]], #4\n"
|
||||
" subs %[remainder], %[remainder], #1\n"
|
||||
" sshll v2.4s, v0.4h, #0\n"
|
||||
" sshll v3.4s, v1.4h, #0\n"
|
||||
" scvtf v0.4s, v2.4s, #15\n"
|
||||
" scvtf v1.4s, v3.4s, #15\n"
|
||||
" st1 { v0.s }[0], [%[d0]], #4\n"
|
||||
" st1 { v1.s }[0], [%[d1]], #4\n"
|
||||
" bne 3b\n"
|
||||
"4:"
|
||||
: [d0] "+r" (d0), [d1] "+r" (d1), [s] "+r" (s), [n_samples] "+r" (n_samples),
|
||||
[remainder] "+r" (remainder)
|
||||
: : "v0", "v1", "v2", "v3", "memory", "cc");
|
||||
#else
|
||||
asm volatile(
|
||||
" cmp %[n_samples], #0\n"
|
||||
" beq 2f\n"
|
||||
"1:"
|
||||
" vld2.16 {d0-d3}, [%[s]]!\n"
|
||||
" subs %[n_samples], #8\n"
|
||||
" vmovl.s16 q3, d3\n"
|
||||
" vmovl.s16 q2, d2\n"
|
||||
" vmovl.s16 q1, d1\n"
|
||||
" vmovl.s16 q0, d0\n"
|
||||
" vcvt.f32.s32 q3, q3, #15\n"
|
||||
" vcvt.f32.s32 q2, q2, #15\n"
|
||||
" vcvt.f32.s32 q1, q1, #15\n"
|
||||
" vcvt.f32.s32 q0, q0, #15\n"
|
||||
" vst1.32 {d4-d7}, [%[d1]]!\n"
|
||||
" vst1.32 {d0-d3}, [%[d0]]!\n"
|
||||
" bne 1b\n"
|
||||
"2:"
|
||||
" cmp %[remainder], #0\n"
|
||||
" beq 4f\n"
|
||||
"3:"
|
||||
" vld2.16 { d0[0], d1[0] }, [%[s]], #4\n"
|
||||
" subs %[remainder], %[remainder], #1\n"
|
||||
" vmovl.s16 q1, d1\n"
|
||||
" vmovl.s16 q0, d0\n"
|
||||
" vcvt.f32.s32 q1, q1, #15\n"
|
||||
" vcvt.f32.s32 q0, q0, #15\n"
|
||||
" vst1.32 { d2[0] }, [%[d1]]!\n"
|
||||
" vst1.32 { d0[0] }, [%[d0]]!\n"
|
||||
" bne 3b\n"
|
||||
"4:"
|
||||
: [d0] "+r" (d0), [d1] "+r" (d1), [s] "+r" (s), [n_samples] "+r" (n_samples),
|
||||
[remainder] "+r" (remainder)
|
||||
: : "q0", "q1", "q2", "q3", "memory", "cc");
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
conv_s16_to_f32d_2s_neon(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||
uint32_t n_channels, uint32_t n_samples)
|
||||
|
|
|
@ -66,6 +66,7 @@ static struct conv_info conv_table[] =
|
|||
{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32, 0, 0, conv_s16_to_f32_c },
|
||||
{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, 0, conv_s16d_to_f32d_c },
|
||||
#if defined (HAVE_NEON)
|
||||
{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 2, SPA_CPU_FLAG_NEON, conv_s16_to_f32d_2_neon },
|
||||
{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 0, SPA_CPU_FLAG_NEON, conv_s16_to_f32d_neon },
|
||||
#endif
|
||||
#if defined (HAVE_AVX2)
|
||||
|
|
|
@ -295,6 +295,7 @@ DEFINE_FUNCTION(interleave_32, c);
|
|||
DEFINE_FUNCTION(interleave_32s, c);
|
||||
|
||||
/* Entries for the NEON variants; compiled in only when HAVE_NEON is defined.
 * NOTE(review): DEFINE_FUNCTION is defined outside this view - presumably it
 * declares conv_<name>_<arch> (e.g. conv_s16_to_f32d_2_neon); confirm against
 * the macro definition. */
#if defined(HAVE_NEON)
|
||||
DEFINE_FUNCTION(s16_to_f32d_2, neon);
|
||||
DEFINE_FUNCTION(s16_to_f32d, neon);
|
||||
DEFINE_FUNCTION(f32d_to_s16, neon);
|
||||
#endif
|
||||
|
|
Loading…
Reference in a new issue