From 2c132be6262494517bad0249ef6af9ba10e46e12 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Wed, 23 Oct 2024 12:54:23 +0200 Subject: [PATCH] audioconvert: align some buffers so that we can use aligned read and writes in SSE. --- spa/plugins/audioconvert/channelmix-ops-sse.c | 10 +++++----- spa/plugins/audioconvert/channelmix-ops.c | 6 +++++- spa/plugins/audioconvert/channelmix-ops.h | 10 ++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/spa/plugins/audioconvert/channelmix-ops-sse.c b/spa/plugins/audioconvert/channelmix-ops-sse.c index 2b04ace0a..9a128177f 100644 --- a/spa/plugins/audioconvert/channelmix-ops-sse.c +++ b/spa/plugins/audioconvert/channelmix-ops-sse.c @@ -266,7 +266,7 @@ static inline void convolver_run(const float *src, float *dst, sum[0] = _mm_setzero_ps(); for(i = 0; i < n_taps; i+=4) { t[0] = _mm_loadu_ps(&src[i]); - sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(_mm_loadu_ps(&taps[i]), t[0])); + sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(_mm_load_ps(&taps[i]), t[0])); } sum[0] = _mm_add_ps(sum[0], _mm_movehl_ps(sum[0], sum[0])); sum[0] = _mm_add_ss(sum[0], _mm_shuffle_ps(sum[0], sum[0], 0x55)); @@ -303,8 +303,8 @@ static inline void delay_convolve_run_sse(float *buffer, uint32_t *pos, w += 4; if (w >= n_buffer) { w -= n_buffer; - t[0] = _mm_loadu_ps(&buffer[n_buffer]); - _mm_storeu_ps(&buffer[0], t[0]); + t[0] = _mm_load_ps(&buffer[n_buffer]); + _mm_store_ps(&buffer[0], t[0]); } } for(; n < n_samples; n++) { @@ -326,8 +326,8 @@ static inline void delay_convolve_run_sse(float *buffer, uint32_t *pos, w += 4; if (w >= n_buffer) { w -= n_buffer; - t[0] = _mm_loadu_ps(&buffer[n_buffer]); - _mm_storeu_ps(&buffer[0], t[0]); + t[0] = _mm_load_ps(&buffer[n_buffer]); + _mm_store_ps(&buffer[0], t[0]); } } for(; n < n_samples; n++) { diff --git a/spa/plugins/audioconvert/channelmix-ops.c b/spa/plugins/audioconvert/channelmix-ops.c index f7db4c61c..d774112bc 100644 --- a/spa/plugins/audioconvert/channelmix-ops.c +++ b/spa/plugins/audioconvert/channelmix-ops.c @@ -775,7 +775,11 @@ int channelmix_init(struct channelmix *mix) mix->delay = (uint32_t)(mix->rear_delay * mix->freq / 1000.0f); mix->func_name = info->name; - spa_zero(mix->taps); + spa_zero(mix->taps_mem); + mix->taps = SPA_PTR_ALIGN(mix->taps_mem, CHANNELMIX_OPS_MAX_ALIGN, float); + mix->buffer[0] = SPA_PTR_ALIGN(&mix->buffer_mem[0], CHANNELMIX_OPS_MAX_ALIGN, float); + mix->buffer[1] = SPA_PTR_ALIGN(&mix->buffer_mem[2*BUFFER_SIZE], CHANNELMIX_OPS_MAX_ALIGN, float); + if (mix->hilbert_taps > 0) { mix->n_taps = SPA_CLAMP(mix->hilbert_taps, 15u, MAX_TAPS) | 1; blackman_window(mix->taps, mix->n_taps); diff --git a/spa/plugins/audioconvert/channelmix-ops.h b/spa/plugins/audioconvert/channelmix-ops.h index e1102c5ec..26e2efc3a 100644 --- a/spa/plugins/audioconvert/channelmix-ops.h +++ b/spa/plugins/audioconvert/channelmix-ops.h @@ -25,6 +25,8 @@ #define BUFFER_SIZE 4096 #define MAX_TAPS 255u +#define CHANNELMIX_OPS_MAX_ALIGN 16 + struct channelmix { uint32_t src_chan; uint32_t dst_chan; @@ -59,10 +61,12 @@ struct channelmix { uint32_t hilbert_taps; /* to phase shift, 0 disabled */ struct lr4 lr4[SPA_AUDIO_MAX_CHANNELS]; - float buffer[2][BUFFER_SIZE*2 + 16]; + float buffer_mem[2 * BUFFER_SIZE*2 + CHANNELMIX_OPS_MAX_ALIGN/4]; + float *buffer[2]; uint32_t pos[2]; uint32_t delay; - float taps[MAX_TAPS]; + float taps_mem[MAX_TAPS + CHANNELMIX_OPS_MAX_ALIGN/4]; + float *taps; uint32_t n_taps; void (*process) (struct channelmix *mix, void * SPA_RESTRICT dst[], @@ -104,8 +108,6 @@ void channelmix_##name##_##arch(struct channelmix *mix, \ void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \ uint32_t n_samples); -#define CHANNELMIX_OPS_MAX_ALIGN 16 - DEFINE_FUNCTION(copy, c); DEFINE_FUNCTION(f32_n_m, c); DEFINE_FUNCTION(f32_1_2, c);