diff --git a/pipewire-jack b/pipewire-jack index c404942e9..1cf3e0121 160000 --- a/pipewire-jack +++ b/pipewire-jack @@ -1 +1 @@ -Subproject commit c404942e9d15bd3340c57121753fed8d38b247c6 +Subproject commit 1cf3e01219d66f92ea655ddf5c2f4caa9b96bcf7 diff --git a/spa/include/spa/buffer/alloc.h b/spa/include/spa/buffer/alloc.h index 30a4f57f3..2ffd2d282 100644 --- a/spa/include/spa/buffer/alloc.h +++ b/spa/include/spa/buffer/alloc.h @@ -66,7 +66,7 @@ static inline int spa_buffer_alloc_fill_info(struct spa_buffer_alloc_info *info, info->skel_size += n_datas * sizeof(struct spa_data); for (i = 0, size = 0; i < n_metas; i++) - size += metas[i].size; + size += SPA_ROUND_UP_N(metas[i].size, 8); info->meta_size = size; if (SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_INLINE_META)) @@ -76,13 +76,18 @@ static inline int spa_buffer_alloc_fill_info(struct spa_buffer_alloc_info *info, if (SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_INLINE_CHUNK)) info->skel_size += info->chunk_size; - for (i = 0, size = 0; i < n_datas; i++) + for (i = 0, size = 0; i < n_datas; i++) { + size = SPA_ROUND_UP_N(size, data_aligns[i]); size += datas[i].maxsize; + } info->data_size = size; if (!SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_NO_DATA) && - SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_INLINE_DATA)) - info->skel_size += size; + SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_INLINE_DATA)) { + info->skel_size += n_datas ? data_aligns[0] - 1 : 0; + info->skel_size += info->data_size; + } + info->skel_size = SPA_ROUND_UP_N(info->skel_size, 8); return 0; } @@ -114,7 +119,7 @@ spa_buffer_alloc_layout(struct spa_buffer_alloc_info *info, struct spa_meta *m = &b->metas[i]; *m = info->metas[i]; m->data = *dp; - *dp = SPA_MEMBER(*dp, m->size, void); + *dp = SPA_MEMBER(*dp, SPA_ROUND_UP_N(m->size, 8), void); } size = info->n_datas * sizeof(struct spa_chunk); @@ -138,6 +143,7 @@ spa_buffer_alloc_layout(struct spa_buffer_alloc_info *info, *d = info->datas[i]; d->chunk = &cp[i]; if (!SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_NO_DATA)) { + *dp = SPA_PTR_ALIGN(*dp, info->data_aligns[i], void); d->data = *dp; *dp = SPA_MEMBER(*dp, d->maxsize, void); } @@ -173,8 +179,6 @@ spa_buffer_alloc_array(uint32_t n_buffers, uint32_t flags, spa_buffer_alloc_fill_info(&info, n_metas, metas, n_datas, datas, data_aligns); - info.skel_size = SPA_ROUND_UP_N(info.skel_size, 16); - buffers = (struct spa_buffer **)calloc(n_buffers, sizeof(struct spa_buffer *) + info.skel_size); skel = SPA_MEMBER(buffers, sizeof(struct spa_buffer *) * n_buffers, void); diff --git a/spa/include/spa/utils/defs.h b/spa/include/spa/utils/defs.h index 401d8fca8..912d06201 100644 --- a/spa/include/spa/utils/defs.h +++ b/spa/include/spa/utils/defs.h @@ -147,6 +147,9 @@ struct spa_fraction { #define SPA_ROUND_DOWN_N(num,align) ((num) & ~((align) - 1)) #define SPA_ROUND_UP_N(num,align) SPA_ROUND_DOWN_N((num) + ((align) - 1),align) +#define SPA_IS_ALIGNED(p,align) (((intptr_t)(p) & ((align)-1)) == 0) +#define SPA_PTR_ALIGN(p,align,type) (type*)SPA_ROUND_UP_N((intptr_t)(p), (intptr_t)(align)) + #ifndef SPA_LIKELY #ifdef __GNUC__ #define SPA_LIKELY(x) (__builtin_expect(!!(x),1)) diff --git a/spa/plugins/alsa/alsa-sink.c b/spa/plugins/alsa/alsa-sink.c index 7d2da205e..02fa3da4a 100644 --- a/spa/plugins/alsa/alsa-sink.c +++ b/spa/plugins/alsa/alsa-sink.c @@ -487,8 +487,6 @@ static int port_set_format(struct spa_node *node, info.media_subtype != SPA_MEDIA_SUBTYPE_raw) return -EINVAL; - spa_debug_pod(0, NULL, format); - if (spa_format_audio_raw_parse(format, &info.info.raw) < 0) return -EINVAL; diff --git a/spa/plugins/audioconvert/benchmark-fmt-ops.c b/spa/plugins/audioconvert/benchmark-fmt-ops.c index 1c1cc9ae9..823030328 100644 --- a/spa/plugins/audioconvert/benchmark-fmt-ops.c +++ b/spa/plugins/audioconvert/benchmark-fmt-ops.c @@ -31,52 +31,65 @@ #include "fmt-ops.c" -#define N_SAMPLES 4096 -#define N_CHANNELS 5 +#define MAX_SAMPLES 4096 +#define MAX_CHANNELS 11 #define MAX_COUNT 1000 -static uint8_t samp_in[N_SAMPLES * N_CHANNELS * 4]; -static uint8_t samp_out[N_SAMPLES * N_CHANNELS * 4]; +static uint8_t samp_in[MAX_SAMPLES * MAX_CHANNELS * 4]; +static uint8_t samp_out[MAX_SAMPLES * MAX_CHANNELS * 4]; -static void run_test(const char *name, bool in_packed, bool out_packed, convert_func_t func) +static const int sample_sizes[] = { 0, 1, 128, 513, 4096 }; +static const int channel_counts[] = { 1, 2, 4, 6, 8, 11 }; + +static void run_test1(const char *name, bool in_packed, bool out_packed, convert_func_t func, + int n_channels, int n_samples) { - const void *ip[N_CHANNELS]; - void *op[N_CHANNELS]; - int i, j, ic, oc, ns; + int i, j; + const void *ip[n_channels]; + void *op[n_channels]; struct timespec ts; - uint64_t t1, t2; - uint64_t count = 0; + uint64_t count, t1, t2; - for (j = 0; j < N_CHANNELS; j++) { - ip[j] = &samp_in[j * N_SAMPLES * 4]; - op[j] = &samp_out[j * N_SAMPLES * 4]; + for (j = 0; j < n_channels; j++) { + ip[j] = &samp_in[j * n_samples * 4]; + op[j] = &samp_out[j * n_samples * 4]; } - ic = in_packed ? 1 : N_CHANNELS; - oc = out_packed ? 1 : N_CHANNELS; - ns = (in_packed && out_packed) ? N_SAMPLES * N_CHANNELS : N_SAMPLES; - clock_gettime(CLOCK_MONOTONIC, &ts); t1 = SPA_TIMESPEC_TO_NSEC(&ts); + count = 0; for (i = 0; i < MAX_COUNT; i++) { - func(NULL, oc, op, ic, ip, ns); + func(NULL, op, ip, n_channels, n_samples); count++; } - count *= N_SAMPLES; clock_gettime(CLOCK_MONOTONIC, &ts); t2 = SPA_TIMESPEC_TO_NSEC(&ts); - fprintf(stderr, "%s: elapsed %"PRIu64" count %"PRIu64" = %"PRIu64"/sec\n", name, + fprintf(stderr, "%s: samples %d, channels %d: elapsed %"PRIu64" count %" + PRIu64" = %"PRIu64"/sec\n", name, n_samples, n_channels, t2 - t1, count, count * (uint64_t)SPA_NSEC_PER_SEC / (t2 - t1)); } +static void run_test(const char *name, bool in_packed, bool out_packed, convert_func_t func) +{ + size_t i, j; + + for (i = 0; i < SPA_N_ELEMENTS(sample_sizes); i++) { + for (j = 0; j < SPA_N_ELEMENTS(channel_counts); j++) { + run_test1(name, in_packed, out_packed, func, channel_counts[j], + (sample_sizes[i] + (channel_counts[j] -1)) / channel_counts[j]); + } + } +} + static void test_f32_u8(void) { run_test("test_f32_u8", true, true, conv_f32_to_u8); run_test("test_f32d_u8", false, true, conv_f32d_to_u8); run_test("test_f32_u8d", true, false, conv_f32_to_u8d); + run_test("test_f32d_u8d", false, false, conv_f32d_to_u8d); } static void test_u8_f32(void) diff --git a/spa/plugins/audioconvert/channelmix-ops-sse.c b/spa/plugins/audioconvert/channelmix-ops-sse.c index 749c8ca11..6cb6881c6 100644 --- a/spa/plugins/audioconvert/channelmix-ops-sse.c +++ b/spa/plugins/audioconvert/channelmix-ops-sse.c @@ -26,31 +26,34 @@ static void channelmix_copy_sse(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples = n_bytes / sizeof(float), unrolled, remain; + int i, n, unrolled; float **d = (float **)dst; float **s = (float **)src; __m128 vol = _mm_set1_ps(v); if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (i = 0; i < n_dst; i++) - memcpy(d[i], s[i], n_bytes); + memcpy(d[i], s[i], n_samples * sizeof(float)); } else { for (i = 0; i < n_dst; i++) { float *di = d[i], *si = s[i]; - unrolled = n_samples / 4; - remain = n_samples & 3; + if (SPA_IS_ALIGNED(di, 16) && + SPA_IS_ALIGNED(si, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; for(n = 0; unrolled--; n += 4) - _mm_storeu_ps(&di[n], _mm_mul_ps(_mm_loadu_ps(&si[n]), vol)); - for(; remain--; n++) + _mm_store_ps(&di[n], _mm_mul_ps(_mm_load_ps(&si[n]), vol)); + for(; n < n_samples; n++) _mm_store_ss(&di[n], _mm_mul_ss(_mm_load_ss(&si[n]), vol)); } } @@ -58,33 +61,40 @@ channelmix_copy_sse(void *data, int n_dst, void *dst[n_dst], static void channelmix_f32_2_4_sse(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples = n_bytes / sizeof(float), unrolled, remain; + int i, n, unrolled; float **d = (float **)dst; float **s = (float **)src; __m128 vol = _mm_set1_ps(v); __m128 in; - float *dFL = d[0], *dFR = d[1], *dRL = d[2], *dRR = d[3]; float *sFL = s[0], *sFR = s[1]; + float *dFL = d[0], *dFR = d[1], *dRL = d[2], *dRR = d[3]; + + if (SPA_IS_ALIGNED(sFL, 16) && + SPA_IS_ALIGNED(sFR, 16) && + SPA_IS_ALIGNED(dFL, 16) && + SPA_IS_ALIGNED(dFR, 16) && + SPA_IS_ALIGNED(dRL, 16) && + SPA_IS_ALIGNED(dRR, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { - unrolled = n_samples / 4; - remain = n_samples & 3; - for(n = 0; unrolled--; n += 4) { - in = _mm_loadu_ps(&sFL[n]); - _mm_storeu_ps(&dFL[n], in); - _mm_storeu_ps(&dRL[n], in); - in = _mm_loadu_ps(&sFR[n]); - _mm_storeu_ps(&dFR[n], in); - _mm_storeu_ps(&dRR[n], in); + in = _mm_load_ps(&sFL[n]); + _mm_store_ps(&dFL[n], in); + _mm_store_ps(&dRL[n], in); + in = _mm_load_ps(&sFR[n]); + _mm_store_ps(&dFR[n], in); + _mm_store_ps(&dRR[n], in); } - for(; remain--; n++) { + for(; n < n_samples; n++) { in = _mm_load_ss(&sFL[n]); _mm_store_ss(&dFL[n], in); _mm_store_ss(&dRL[n], in); @@ -94,18 +104,15 @@ channelmix_f32_2_4_sse(void *data, int n_dst, void *dst[n_dst], } } else { - unrolled = n_samples / 4; - remain = n_samples & 3; - for(n = 0; unrolled--; n += 4) { - in = _mm_mul_ps(_mm_loadu_ps(&sFL[n]), vol); - _mm_storeu_ps(&dFL[n], in); - _mm_storeu_ps(&dRL[n], in); - in = _mm_mul_ps(_mm_loadu_ps(&sFR[n]), vol); - _mm_storeu_ps(&dFR[n], in); - _mm_storeu_ps(&dRR[n], in); + in = _mm_mul_ps(_mm_load_ps(&sFL[n]), vol); + _mm_store_ps(&dFL[n], in); + _mm_store_ps(&dRL[n], in); + in = _mm_mul_ps(_mm_load_ps(&sFR[n]), vol); + _mm_store_ps(&dFR[n], in); + _mm_store_ps(&dRR[n], in); } - for(; remain--; n++) { + for(; n < n_samples; n++) { in = _mm_mul_ss(_mm_load_ss(&sFL[n]), vol); _mm_store_ss(&dFL[n], in); _mm_store_ss(&dRL[n], in); @@ -119,9 +126,9 @@ channelmix_f32_2_4_sse(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR -> FL+FR */ static void channelmix_f32_5p1_2_sse(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int n, n_samples = n_bytes / sizeof(float), unrolled, remain; + int n, unrolled; float **d = (float **) dst; float **s = (float **) src; float *m = matrix; @@ -130,33 +137,41 @@ channelmix_f32_5p1_2_sse(void *data, int n_dst, void *dst[n_dst], __m128 slev = _mm_set1_ps(m[4]); __m128 vol = _mm_set1_ps(v); __m128 in, ctr; - float *dFL = d[0], *dFR = d[1]; float *sFL = s[0], *sFR = s[1], *sFC = s[2], *sLFE = s[3], *sSL = s[4], *sSR = s[5]; + float *dFL = d[0], *dFR = d[1]; + + if (SPA_IS_ALIGNED(sFL, 16) && + SPA_IS_ALIGNED(sFR, 16) && + SPA_IS_ALIGNED(sFC, 16) && + SPA_IS_ALIGNED(sLFE, 16) && + SPA_IS_ALIGNED(sSL, 16) && + SPA_IS_ALIGNED(sSR, 16) && + SPA_IS_ALIGNED(dFL, 16) && + SPA_IS_ALIGNED(dFR, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; if (v <= VOLUME_MIN) { - memset(dFL, 0, n_bytes); - memset(dFR, 0, n_bytes); + memset(dFL, 0, n_samples * sizeof(float)); + memset(dFR, 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { - - unrolled = n_samples / 4; - remain = n_samples & 3; - for(n = 0; unrolled--; n += 4) { - ctr = _mm_mul_ps(_mm_loadu_ps(&sFC[n]), clev); - ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev)); - in = _mm_mul_ps(_mm_loadu_ps(&sSL[n]), slev); + ctr = _mm_mul_ps(_mm_load_ps(&sFC[n]), clev); + ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_load_ps(&sLFE[n]), llev)); + in = _mm_mul_ps(_mm_load_ps(&sSL[n]), slev); in = _mm_add_ps(in, ctr); - in = _mm_add_ps(in, _mm_loadu_ps(&sFL[n])); - _mm_storeu_ps(&dFL[n], in); - in = _mm_mul_ps(_mm_loadu_ps(&sSR[n]), slev); + in = _mm_add_ps(in, _mm_load_ps(&sFL[n])); + _mm_store_ps(&dFL[n], in); + in = _mm_mul_ps(_mm_load_ps(&sSR[n]), slev); in = _mm_add_ps(in, ctr); - in = _mm_add_ps(in, _mm_loadu_ps(&sFR[n])); - _mm_storeu_ps(&dFR[n], in); + in = _mm_add_ps(in, _mm_load_ps(&sFR[n])); + _mm_store_ps(&dFR[n], in); } - for(; remain--; n++) { + for(; n < n_samples; n++) { ctr = _mm_mul_ss(_mm_load_ss(&sFC[n]), clev); - ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev)); + ctr = _mm_add_ss(ctr, _mm_mul_ss(_mm_load_ss(&sLFE[n]), llev)); in = _mm_mul_ss(_mm_load_ss(&sSL[n]), slev); in = _mm_add_ss(in, ctr); in = _mm_add_ss(in, _mm_load_ss(&sFL[n])); @@ -168,26 +183,23 @@ channelmix_f32_5p1_2_sse(void *data, int n_dst, void *dst[n_dst], } } else { - unrolled = n_samples / 4; - remain = n_samples & 3; - for(n = 0; unrolled--; n += 4) { - ctr = _mm_mul_ps(_mm_loadu_ps(&sFC[n]), clev); - ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev)); - in = _mm_mul_ps(_mm_loadu_ps(&sSL[n]), slev); + ctr = _mm_mul_ps(_mm_load_ps(&sFC[n]), clev); + ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_load_ps(&sLFE[n]), llev)); + in = _mm_mul_ps(_mm_load_ps(&sSL[n]), slev); in = _mm_add_ps(in, ctr); - in = _mm_add_ps(in, _mm_loadu_ps(&sFL[n])); + in = _mm_add_ps(in, _mm_load_ps(&sFL[n])); in = _mm_mul_ps(in, vol); - _mm_storeu_ps(&dFL[n], in); - in = _mm_mul_ps(_mm_loadu_ps(&sSR[n]), slev); + _mm_store_ps(&dFL[n], in); + in = _mm_mul_ps(_mm_load_ps(&sSR[n]), slev); in = _mm_add_ps(in, ctr); - in = _mm_add_ps(in, _mm_loadu_ps(&sFR[n])); + in = _mm_add_ps(in, _mm_load_ps(&sFR[n])); in = _mm_mul_ps(in, vol); - _mm_storeu_ps(&dFR[n], in); + _mm_store_ps(&dFR[n], in); } - for(; remain--; n++) { + for(; n < n_samples; n++) { ctr = _mm_mul_ss(_mm_load_ss(&sFC[n]), clev); - ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev)); + ctr = _mm_add_ss(ctr, _mm_mul_ss(_mm_load_ss(&sLFE[n]), llev)); in = _mm_mul_ss(_mm_load_ss(&sSL[n]), slev); in = _mm_add_ss(in, ctr); in = _mm_add_ss(in, _mm_load_ss(&sFL[n])); @@ -205,58 +217,66 @@ channelmix_f32_5p1_2_sse(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR -> FL+FR+FC+LFE*/ static void channelmix_f32_5p1_3p1_sse(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples = n_bytes / sizeof(float), unrolled, remain; + int i, n, unrolled; float **d = (float **) dst; float **s = (float **) src; __m128 mix = _mm_set1_ps(v * 0.5f); __m128 vol = _mm_set1_ps(v); __m128 avg; - float *dFL = d[0], *dFR = d[1], *dFC = d[2], *dLFE = d[3]; float *sFL = s[0], *sFR = s[1], *sFC = s[2], *sLFE = s[3], *sSL = s[4], *sSR = s[5]; + float *dFL = d[0], *dFR = d[1], *dFC = d[2], *dLFE = d[3]; + + if (SPA_IS_ALIGNED(sFL, 16) && + SPA_IS_ALIGNED(sFR, 16) && + SPA_IS_ALIGNED(sFC, 16) && + SPA_IS_ALIGNED(sLFE, 16) && + SPA_IS_ALIGNED(sSL, 16) && + SPA_IS_ALIGNED(sSR, 16) && + SPA_IS_ALIGNED(dFL, 16) && + SPA_IS_ALIGNED(dFR, 16) && + SPA_IS_ALIGNED(dFC, 16) && + SPA_IS_ALIGNED(dLFE, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { - unrolled = n_samples / 4; - remain = n_samples & 3; - for(n = 0; unrolled--; n += 4) { - avg = _mm_add_ps(_mm_loadu_ps(&sFL[n]), _mm_loadu_ps(&sSL[n])); - _mm_storeu_ps(&dFL[n], _mm_mul_ps(avg, mix)); - avg = _mm_add_ps(_mm_loadu_ps(&sFR[n]), _mm_loadu_ps(&sSR[n])); - _mm_storeu_ps(&dFR[n], _mm_mul_ps(avg, mix)); - _mm_storeu_ps(&dFC[n], _mm_loadu_ps(&sFC[n])); - _mm_storeu_ps(&dLFE[n], _mm_loadu_ps(&sLFE[n])); + avg = _mm_add_ps(_mm_load_ps(&sFL[n]), _mm_load_ps(&sSL[n])); + _mm_store_ps(&dFL[n], _mm_mul_ps(avg, mix)); + avg = _mm_add_ps(_mm_load_ps(&sFR[n]), _mm_load_ps(&sSR[n])); + _mm_store_ps(&dFR[n], _mm_mul_ps(avg, mix)); + _mm_store_ps(&dFC[n], _mm_load_ps(&sFC[n])); + _mm_store_ps(&dLFE[n], _mm_load_ps(&sLFE[n])); } - for(; remain--; n++) { + for(; n < n_samples; n++) { avg = _mm_add_ss(_mm_load_ss(&sFL[n]), _mm_load_ss(&sSL[n])); _mm_store_ss(&dFL[n], _mm_mul_ss(avg, mix)); - avg = _mm_add_ps(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n])); + avg = _mm_add_ss(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n])); _mm_store_ss(&dFR[n], _mm_mul_ss(avg, mix)); _mm_store_ss(&dFC[n], _mm_load_ss(&sFC[n])); _mm_store_ss(&dLFE[n], _mm_load_ss(&sLFE[n])); } } else { - unrolled = n_samples / 4; - remain = n_samples & 3; - for(n = 0; unrolled--; n += 4) { - avg = _mm_add_ps(_mm_loadu_ps(&sFL[n]), _mm_loadu_ps(&sSL[n])); - _mm_storeu_ps(&dFL[n], _mm_mul_ps(avg, mix)); - avg = _mm_add_ps(_mm_loadu_ps(&sFR[n]), _mm_loadu_ps(&sSR[n])); - _mm_storeu_ps(&dFR[n], _mm_mul_ps(avg, mix)); - _mm_storeu_ps(&dFC[n], _mm_mul_ps(_mm_loadu_ps(&sFC[n]), vol)); - _mm_storeu_ps(&dLFE[n], _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), vol)); + avg = _mm_add_ps(_mm_load_ps(&sFL[n]), _mm_load_ps(&sSL[n])); + _mm_store_ps(&dFL[n], _mm_mul_ps(avg, mix)); + avg = _mm_add_ps(_mm_load_ps(&sFR[n]), _mm_load_ps(&sSR[n])); + _mm_store_ps(&dFR[n], _mm_mul_ps(avg, mix)); + _mm_store_ps(&dFC[n], _mm_mul_ps(_mm_load_ps(&sFC[n]), vol)); + _mm_store_ps(&dLFE[n], _mm_mul_ps(_mm_load_ps(&sLFE[n]), vol)); } - for(; remain--; n++) { + for(; n < n_samples; n++) { avg = _mm_add_ss(_mm_load_ss(&sFL[n]), _mm_load_ss(&sSL[n])); _mm_store_ss(&dFL[n], _mm_mul_ss(avg, mix)); - avg = _mm_add_ps(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n])); + avg = _mm_add_ss(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n])); _mm_store_ss(&dFR[n], _mm_mul_ss(avg, mix)); _mm_store_ss(&dFC[n], _mm_mul_ss(_mm_load_ss(&sFC[n]), vol)); _mm_store_ss(&dLFE[n], _mm_mul_ss(_mm_load_ss(&sLFE[n]), vol)); @@ -267,9 +287,9 @@ channelmix_f32_5p1_3p1_sse(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR -> FL+FR+RL+RR*/ static void channelmix_f32_5p1_4_sse(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples = n_bytes / sizeof(float), unrolled, remain; + int i, n, unrolled; float **d = (float **) dst; float **s = (float **) src; float *m = matrix; @@ -277,28 +297,39 @@ channelmix_f32_5p1_4_sse(void *data, int n_dst, void *dst[n_dst], __m128 llev = _mm_set1_ps(m[3]); __m128 vol = _mm_set1_ps(v); __m128 ctr; - float *dFL = d[0], *dFR = d[1], *dRL = d[2], *dRR = d[3]; float *sFL = s[0], *sFR = s[1], *sFC = s[2], *sLFE = s[3], *sSL = s[4], *sSR = s[5]; + float *dFL = d[0], *dFR = d[1], *dRL = d[2], *dRR = d[3]; + + if (SPA_IS_ALIGNED(sFL, 16) && + SPA_IS_ALIGNED(sFR, 16) && + SPA_IS_ALIGNED(sFC, 16) && + SPA_IS_ALIGNED(sLFE, 16) && + SPA_IS_ALIGNED(sSL, 16) && + SPA_IS_ALIGNED(sSR, 16) && + SPA_IS_ALIGNED(dFL, 16) && + SPA_IS_ALIGNED(dFR, 16) && + SPA_IS_ALIGNED(dRL, 16) && + SPA_IS_ALIGNED(dRR, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { - unrolled = n_samples / 4; - remain = n_samples & 3; - for(n = 0; unrolled--; n += 4) { - ctr = _mm_mul_ps(_mm_loadu_ps(&sFC[n]), clev); - ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev)); - _mm_storeu_ps(&dFL[n], _mm_add_ps(_mm_loadu_ps(&sFL[n]), ctr)); - _mm_storeu_ps(&dFR[n], _mm_add_ps(_mm_loadu_ps(&sFR[n]), ctr)); - _mm_storeu_ps(&dRL[n], _mm_loadu_ps(&sSL[n])); - _mm_storeu_ps(&dRR[n], _mm_loadu_ps(&sSR[n])); + ctr = _mm_mul_ps(_mm_load_ps(&sFC[n]), clev); + ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_load_ps(&sLFE[n]), llev)); + _mm_store_ps(&dFL[n], _mm_add_ps(_mm_load_ps(&sFL[n]), ctr)); + _mm_store_ps(&dFR[n], _mm_add_ps(_mm_load_ps(&sFR[n]), ctr)); + _mm_store_ps(&dRL[n], _mm_load_ps(&sSL[n])); + _mm_store_ps(&dRR[n], _mm_load_ps(&sSR[n])); } - for(; remain--; n++) { + for(; n < n_samples; n++) { ctr = _mm_mul_ss(_mm_load_ss(&sFC[n]), clev); - ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev)); + ctr = _mm_add_ss(ctr, _mm_mul_ss(_mm_load_ss(&sLFE[n]), llev)); _mm_store_ss(&dFL[n], _mm_add_ss(_mm_load_ss(&sFL[n]), ctr)); _mm_store_ss(&dFR[n], _mm_add_ss(_mm_load_ss(&sFR[n]), ctr)); _mm_store_ss(&dRL[n], _mm_load_ss(&sSL[n])); @@ -306,20 +337,17 @@ channelmix_f32_5p1_4_sse(void *data, int n_dst, void *dst[n_dst], } } else { - unrolled = n_samples / 4; - remain = n_samples & 3; - for(n = 0; unrolled--; n += 4) { - ctr = _mm_mul_ps(_mm_loadu_ps(&sFC[n]), clev); - ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev)); - _mm_storeu_ps(&dFL[n], _mm_mul_ps(_mm_add_ps(_mm_loadu_ps(&sFL[n]), ctr), vol)); - _mm_storeu_ps(&dFR[n], _mm_mul_ps(_mm_add_ps(_mm_loadu_ps(&sFR[n]), ctr), vol)); - _mm_storeu_ps(&dRL[n], _mm_mul_ps(_mm_loadu_ps(&sSL[n]), vol)); - _mm_storeu_ps(&dRR[n], _mm_mul_ps(_mm_loadu_ps(&sSR[n]), vol)); + ctr = _mm_mul_ps(_mm_load_ps(&sFC[n]), clev); + ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_load_ps(&sLFE[n]), llev)); + _mm_store_ps(&dFL[n], _mm_mul_ps(_mm_add_ps(_mm_load_ps(&sFL[n]), ctr), vol)); + _mm_store_ps(&dFR[n], _mm_mul_ps(_mm_add_ps(_mm_load_ps(&sFR[n]), ctr), vol)); + _mm_store_ps(&dRL[n], _mm_mul_ps(_mm_load_ps(&sSL[n]), vol)); + _mm_store_ps(&dRR[n], _mm_mul_ps(_mm_load_ps(&sSR[n]), vol)); } - for(; remain--; n++) { + for(; n < n_samples; n++) { ctr = _mm_mul_ss(_mm_load_ss(&sFC[n]), clev); - ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev)); + ctr = _mm_add_ss(ctr, _mm_mul_ss(_mm_load_ss(&sLFE[n]), llev)); _mm_store_ss(&dFL[n], _mm_mul_ss(_mm_add_ss(_mm_load_ss(&sFL[n]), ctr), vol)); _mm_store_ss(&dFR[n], _mm_mul_ss(_mm_add_ss(_mm_load_ss(&sFR[n]), ctr), vol)); _mm_store_ss(&dRL[n], _mm_mul_ss(_mm_load_ss(&sSL[n]), vol)); diff --git a/spa/plugins/audioconvert/channelmix-ops.c b/spa/plugins/audioconvert/channelmix-ops.c index 23d69c6d4..c326a4814 100644 --- a/spa/plugins/audioconvert/channelmix-ops.c +++ b/spa/plugins/audioconvert/channelmix-ops.c @@ -37,19 +37,19 @@ static void channelmix_copy(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples = n_bytes / sizeof(float); + int i, n; float **d = (float **)dst; float **s = (float **)src; if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (i = 0; i < n_dst; i++) - memcpy(d[i], s[i], n_bytes); + memcpy(d[i], s[i], n_samples * sizeof(float)); } else { for (i = 0; i < n_dst; i++) @@ -62,9 +62,9 @@ channelmix_copy(void *data, int n_dst, void *dst[n_dst], static void channelmix_f32_n_m(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, j, n, n_samples = n_bytes / sizeof(float); + int i, j, n; float **d = (float **) dst; float **s = (float **) src; float *m = matrix; @@ -84,15 +84,15 @@ channelmix_f32_n_m(void *data, int n_dst, void *dst[n_dst], static void channelmix_f32_1_2(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int n, n_samples = n_bytes / sizeof(float); + int n; float **d = (float **)dst; float **s = (float **)src; if (v <= VOLUME_MIN) { - memset(d[0], 0, n_bytes); - memset(d[1], 0, n_bytes); + memset(d[0], 0, n_samples * sizeof(float)); + memset(d[1], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (n = 0; n < n_samples; n++) @@ -106,14 +106,14 @@ channelmix_f32_1_2(void *data, int n_dst, void *dst[n_dst], static void channelmix_f32_2_1(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int n, n_samples = n_bytes / sizeof(float); + int n; float **d = (float **)dst; float **s = (float **)src; if (v <= VOLUME_MIN) { - memset(d[0], 0, n_bytes); + memset(d[0], 0, n_samples * sizeof(float)); } else { const float f = v * 0.5f; @@ -124,14 +124,14 @@ channelmix_f32_2_1(void *data, int n_dst, void *dst[n_dst], static void channelmix_f32_4_1(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int n, n_samples = n_bytes / sizeof(float); + int n; float **d = (float **)dst; float **s = (float **)src; if (v <= VOLUME_MIN) { - memset(d[0], 0, n_bytes); + memset(d[0], 0, n_samples * sizeof(float)); } else { const float f = v * 0.25f; @@ -142,14 +142,14 @@ channelmix_f32_4_1(void *data, int n_dst, void *dst[n_dst], static void channelmix_f32_3p1_1(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int n, n_samples = n_bytes / sizeof(float); + int n; float **d = (float **)dst; float **s = (float **)src; if (v <= VOLUME_MIN) { - memset(d[0], 0, n_bytes); + memset(d[0], 0, n_samples * sizeof(float)); } else { const float f = v * 0.5f; @@ -163,15 +163,15 @@ channelmix_f32_3p1_1(void *data, int n_dst, void *dst[n_dst], static void channelmix_f32_2_4(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples = n_bytes / sizeof(float); + int i, n; float **d = (float **)dst; float **s = (float **)src; if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (n = 0; n < n_samples; n++) { @@ -190,15 +190,15 @@ channelmix_f32_2_4(void *data, int n_dst, void *dst[n_dst], #define MASK_3_1 _M(FL)|_M(FR)|_M(FC)|_M(LFE) static void channelmix_f32_2_3p1(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples = n_bytes / sizeof(float); + int i, n; float **d = (float **)dst; float **s = (float **)src; if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (n = 0; n < n_samples; n++) { @@ -222,15 +222,15 @@ channelmix_f32_2_3p1(void *data, int n_dst, void *dst[n_dst], #define MASK_5_1 _M(FL)|_M(FR)|_M(FC)|_M(LFE)|_M(SL)|_M(SR)|_M(RL)|_M(RR) static void channelmix_f32_2_5p1(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples = n_bytes / sizeof(float); + int i, n; float **d = (float **)dst; float **s = (float **)src; if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (n = 0; n < n_samples; n++) { @@ -254,9 +254,9 @@ channelmix_f32_2_5p1(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR -> FL+FR */ static void channelmix_f32_5p1_2(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int n, n_samples = n_bytes / sizeof(float); + int n; float **d = (float **) dst; float **s = (float **) src; float *m = matrix; @@ -265,8 +265,8 @@ channelmix_f32_5p1_2(void *data, int n_dst, void *dst[n_dst], const float slev = m[4]; if (v <= VOLUME_MIN) { - memset(d[0], 0, n_bytes); - memset(d[1], 0, n_bytes); + memset(d[0], 0, n_samples * sizeof(float)); + memset(d[1], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (n = 0; n < n_samples; n++) { @@ -287,16 +287,15 @@ channelmix_f32_5p1_2(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR -> FL+FR+FC+LFE*/ static void channelmix_f32_5p1_3p1(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples; + int i, n; float **d = (float **) dst; float **s = (float **) src; - n_samples = n_bytes / sizeof(float); if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else { const float f1 = 0.5f * v; @@ -312,19 +311,18 @@ channelmix_f32_5p1_3p1(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR -> FL+FR+RL+RR*/ static void channelmix_f32_5p1_4(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples; + int i, n; float **d = (float **) dst; float **s = (float **) src; float *m = matrix; const float clev = m[2]; const float llev = m[3]; - n_samples = n_bytes / sizeof(float); if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (n = 0; n < n_samples; n++) { @@ -351,9 +349,9 @@ channelmix_f32_5p1_4(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR+RL+RR -> FL+FR */ static void channelmix_f32_7p1_2(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int n, n_samples = n_bytes / sizeof(float); + int n; float **d = (float **) dst; float **s = (float **) src; float *m = matrix; @@ -362,8 +360,8 @@ channelmix_f32_7p1_2(void *data, int n_dst, void *dst[n_dst], const float slev = m[4]; if (v <= VOLUME_MIN) { - memset(d[0], 0, n_bytes); - memset(d[1], 0, n_bytes); + memset(d[0], 0, n_samples * sizeof(float)); + memset(d[1], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (n = 0; n < n_samples; n++) { @@ -384,16 +382,15 @@ channelmix_f32_7p1_2(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR+RL+RR -> FL+FR+FC+LFE*/ static void channelmix_f32_7p1_3p1(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples; + int i, n; float **d = (float **) dst; float **s = (float **) src; - n_samples = n_bytes / sizeof(float); if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else { const float f1 = 0.5 * v; @@ -409,9 +406,9 @@ channelmix_f32_7p1_3p1(void *data, int n_dst, void *dst[n_dst], /* FL+FR+FC+LFE+SL+SR+RL+RR -> FL+FR+RL+RR*/ static void channelmix_f32_7p1_4(void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], void *matrix, float v, int n_bytes) + int n_src, const void *src[n_src], void *matrix, float v, int n_samples) { - int i, n, n_samples; + int i, n; float **d = (float **) dst; float **s = (float **) src; float *m = matrix; @@ -419,10 +416,9 @@ channelmix_f32_7p1_4(void *data, int n_dst, void *dst[n_dst], const float llev = m[3]; const float slev = m[4]; - n_samples = n_bytes / sizeof(float); if (v <= VOLUME_MIN) { for (i = 0; i < n_dst; i++) - memset(d[i], 0, n_bytes); + memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { for (n = 0; n < n_samples; n++) { @@ -450,7 +446,7 @@ channelmix_f32_7p1_4(void *data, int n_dst, void *dst[n_dst], typedef void (*channelmix_func_t) (void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], - void *matrix, float v, int n_bytes); + void *matrix, float v, int n_samples); #define ANY ((uint32_t)-1) diff --git a/spa/plugins/audioconvert/channelmix.c b/spa/plugins/audioconvert/channelmix.c index 6b885c27f..c01db3c8a 100644 --- a/spa/plugins/audioconvert/channelmix.c +++ b/spa/plugins/audioconvert/channelmix.c @@ -1132,27 +1132,26 @@ static int impl_node_process(struct spa_node *node) sbuf = &inport->buffers[inio->buffer_id]; { - uint32_t i, n_bytes; + uint32_t i, n_samples; struct spa_buffer *sb = sbuf->outbuf, *db = dbuf->outbuf; uint32_t n_src_datas = sb->n_datas; uint32_t n_dst_datas = db->n_datas; const void *src_datas[n_src_datas]; void *dst_datas[n_dst_datas]; - n_bytes = sb->datas[0].chunk->size; + n_samples = sb->datas[0].chunk->size / inport->stride; for (i = 0; i < n_src_datas; i++) src_datas[i] = sb->datas[i].data; for (i = 0; i < n_dst_datas; i++) { dst_datas[i] = db->datas[i].data; - db->datas[i].chunk->size = - (n_bytes / inport->stride) * outport->stride; + db->datas[i].chunk->size = n_samples * outport->stride; } this->convert(this, n_dst_datas, dst_datas, n_src_datas, src_datas, this->matrix, this->props.mute ? 0.0f : this->props.volume, - n_bytes); + n_samples); } outio->status = SPA_STATUS_HAVE_BUFFER; diff --git a/spa/plugins/audioconvert/fmt-ops-sse2.c b/spa/plugins/audioconvert/fmt-ops-sse2.c index 6428757b7..ebca10467 100644 --- a/spa/plugins/audioconvert/fmt-ops-sse2.c +++ b/spa/plugins/audioconvert/fmt-ops-sse2.c @@ -30,142 +30,148 @@ #include static void -conv_s16_to_f32d_1_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples) +conv_s16_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples) { const int16_t *s = src; float **d = (float **) dst; float *d0 = d[0]; - int n = 0, unrolled; + int n, unrolled; __m128i in; __m128 out, factor = _mm_set1_ps(1.0f / S16_SCALE); - unrolled = n_samples / 4; - n_samples = n_samples & 3; + if (SPA_IS_ALIGNED(d0, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; - for(; unrolled--; n += 4) { - in = _mm_insert_epi16(in, s[0*n_dst], 1); - in = _mm_insert_epi16(in, s[1*n_dst], 3); - in = _mm_insert_epi16(in, s[2*n_dst], 5); - in = _mm_insert_epi16(in, s[3*n_dst], 7); + for(n = 0; unrolled--; n += 4) { + in = _mm_insert_epi16(in, s[0*n_channels], 1); + in = _mm_insert_epi16(in, s[1*n_channels], 3); + in = _mm_insert_epi16(in, s[2*n_channels], 5); + in = _mm_insert_epi16(in, s[3*n_channels], 7); in = _mm_srai_epi32(in, 16); out = _mm_cvtepi32_ps(in); out = _mm_mul_ps(out, factor); - _mm_storeu_ps(&d0[n], out); - s += 4*n_dst; + _mm_store_ps(&d0[n], out); + s += 4*n_channels; } - for(; n_samples--; n++) { + for(; n < n_samples; n++) { out = _mm_cvtsi32_ss(out, s[0]); out = _mm_mul_ss(out, factor); _mm_store_ss(&d0[n], out); - s += n_dst; + s += n_channels; } } static void -conv_s16_to_f32d_2_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples) +conv_s16_to_f32d_2_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples) { const int16_t *s = src; float **d = (float **) dst; float *d0 = d[0], *d1 = d[1]; - int n = 0, unrolled; + int n, unrolled; __m128i in, t[2]; __m128 out[2], factor = _mm_set1_ps(1.0f / S16_SCALE); - if (n_dst == 2) { + if (n_channels == 2 && + SPA_IS_ALIGNED(s, 16) && + SPA_IS_ALIGNED(d0, 16) && + SPA_IS_ALIGNED(d1, 16)) unrolled = n_samples / 4; - n_samples = n_samples & 3; + else + unrolled = 0; - for(; unrolled--; n += 4) { - in = _mm_loadu_si128((__m128i*)s); + for(n = 0; unrolled--; n += 4) { + in = _mm_load_si128((__m128i*)s); - t[0] = _mm_slli_epi32(in, 16); - t[0] = _mm_srai_epi32(t[0], 16); - t[1] = _mm_srai_epi32(in, 16); + t[0] = _mm_slli_epi32(in, 16); + t[0] = _mm_srai_epi32(t[0], 16); + t[1] = _mm_srai_epi32(in, 16); - out[0] = _mm_cvtepi32_ps(t[0]); - out[0] = _mm_mul_ps(out[0], factor); - out[1] = _mm_cvtepi32_ps(t[1]); - out[1] = _mm_mul_ps(out[1], factor); + out[0] = _mm_cvtepi32_ps(t[0]); + out[0] = _mm_mul_ps(out[0], factor); + out[1] = _mm_cvtepi32_ps(t[1]); + out[1] = _mm_mul_ps(out[1], factor); - _mm_storeu_ps(&d0[n], out[0]); - _mm_storeu_ps(&d1[n], out[1]); + _mm_store_ps(&d0[n], out[0]); + _mm_store_ps(&d1[n], out[1]); - s += 4*n_dst; - } + s += 4*n_channels; } - for(; n_samples--; n++) { + for(; n < n_samples; n++) { out[0] = _mm_cvtsi32_ss(out[0], s[0]); out[0] = _mm_mul_ss(out[0], factor); out[1] = _mm_cvtsi32_ss(out[1], s[1]); out[1] = _mm_mul_ss(out[1], factor); _mm_store_ss(&d0[n], out[0]); _mm_store_ss(&d1[n], out[1]); - s += n_dst; + s += n_channels; } } static void -conv_s16_to_f32d_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s16_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int16_t *s = src[0]; int i = 0; - for(; i + 1 < n_dst; i += 2) - conv_s16_to_f32d_2_sse2(data, n_dst, &dst[i], &s[i], n_samples); - for(; i < n_dst; i++) - conv_s16_to_f32d_1_sse2(data, n_dst, &dst[i], &s[i], n_samples); + for(; i + 1 < n_channels; i += 2) + conv_s16_to_f32d_2_sse2(data, &dst[i], &s[i], n_channels, n_samples); + for(; i < n_channels; i++) + conv_s16_to_f32d_1_sse2(data, &dst[i], &s[i], n_channels, n_samples); } static void -conv_s24_to_f32d_1_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples) +conv_s24_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples) { const uint8_t *s = src; float **d = (float **) dst; float *d0 = d[0]; - int n = 0, unrolled; + int n, unrolled; __m128i in; __m128 out, factor = _mm_set1_ps(1.0f / S24_SCALE); - unrolled = n_samples / 4; - n_samples = n_samples & 3; - if (n_samples == 0) { - n_samples += 4; - unrolled--; + if (SPA_IS_ALIGNED(d0, 16) && n_samples > 4) { + unrolled = n_samples / 4; + if ((n_samples & 3) == 0) + unrolled--; } + else + unrolled = 0; - for(; unrolled--; n += 4) { + for(n = 0; unrolled--; n += 4) { in = _mm_setr_epi32( - *((uint32_t*)&s[0 * n_dst]), - *((uint32_t*)&s[3 * n_dst]), - *((uint32_t*)&s[6 * n_dst]), - *((uint32_t*)&s[9 * n_dst])); + *((uint32_t*)&s[0 * n_channels]), + *((uint32_t*)&s[3 * n_channels]), + *((uint32_t*)&s[6 * n_channels]), + *((uint32_t*)&s[9 * n_channels])); in = _mm_slli_epi32(in, 8); in = _mm_srai_epi32(in, 8); out = _mm_cvtepi32_ps(in); out = _mm_mul_ps(out, factor); - _mm_storeu_ps(&d0[n], out); - s += 12 * n_dst; + _mm_store_ps(&d0[n], out); + s += 12 * n_channels; } - for(; n_samples--; n++) { + for(; n < n_samples; n++) { out = _mm_cvtsi32_ss(out, read_s24(s)); out = _mm_mul_ss(out, factor); _mm_store_ss(&d0[n], out); - s += 3 * n_dst; + s += 3 * n_channels; } } static void -conv_s24_to_f32d_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s24_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int8_t *s = src[0]; int i = 0; - for(; i < n_dst; i++) - conv_s24_to_f32d_1_sse2(data, n_dst, &dst[i], &s[3*i], n_samples); + for(; i < n_channels; i++) + conv_s24_to_f32d_1_sse2(data, &dst[i], &s[3*i], n_channels, n_samples); } static void -conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s32_1_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; const float *s0 = s[0]; @@ -176,11 +182,13 @@ conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src] __m128 int_max = _mm_set1_ps(S24_MAX_F); __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); - unrolled = n_samples / 4; - n_samples = n_samples & 3; + if (SPA_IS_ALIGNED(s0, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; for(n = 0; unrolled--; n += 4) { - in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max); + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8); @@ -188,23 +196,23 @@ conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src] out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2)); out[3] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(2, 1, 0, 3)); - d[0*n_src] = _mm_cvtsi128_si32(out[0]); - d[1*n_src] = _mm_cvtsi128_si32(out[1]); - d[2*n_src] = _mm_cvtsi128_si32(out[2]); - d[3*n_src] = _mm_cvtsi128_si32(out[3]); - d += 4*n_src; + d[0*n_channels] = _mm_cvtsi128_si32(out[0]); + d[1*n_channels] = _mm_cvtsi128_si32(out[1]); + d[2*n_channels] = _mm_cvtsi128_si32(out[2]); + d[3*n_channels] = _mm_cvtsi128_si32(out[3]); + d += 4*n_channels; } - for(; n_samples--; n++) { + for(; n < n_samples; n++) { in[0] = _mm_load_ss(&s0[n]); in[0] = _mm_mul_ss(in[0], int_max); in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min)); *d = _mm_cvtss_si32(in[0]) << 8; - d += n_src; + d += n_channels; } } static void -conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s32_2_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; const float *s0 = s[0], *s1 = s[1]; @@ -215,12 +223,15 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src] __m128 int_max = _mm_set1_ps(S24_MAX_F); __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); - unrolled = n_samples / 4; - n_samples = n_samples & 3; + if (SPA_IS_ALIGNED(s0, 16) && + SPA_IS_ALIGNED(s1, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; for(n = 0; unrolled--; n += 4) { - in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max); - in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max); + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); + in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max); in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); in[1] = _mm_min_ps(int_max, _mm_max_ps(in[1], int_min)); @@ -233,13 +244,13 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src] t[2] = _mm_unpackhi_epi32(out[0], out[1]); t[3] = _mm_shuffle_epi32(t[2], _MM_SHUFFLE(0, 0, 2, 2)); - _mm_storel_epi64((__m128i*)(d + 0*n_src), t[0]); - _mm_storel_epi64((__m128i*)(d + 1*n_src), t[1]); - _mm_storel_epi64((__m128i*)(d + 2*n_src), t[2]); - _mm_storel_epi64((__m128i*)(d + 3*n_src), t[3]); - d += 4*n_src; + _mm_storel_epi64((__m128i*)(d + 0*n_channels), t[0]); + _mm_storel_epi64((__m128i*)(d + 1*n_channels), t[1]); + _mm_storel_epi64((__m128i*)(d + 2*n_channels), t[2]); + _mm_storel_epi64((__m128i*)(d + 3*n_channels), t[3]); + d += 4*n_channels; } - for(; n_samples--; n++) { + for(; n < n_samples; n++) { in[0] = _mm_load_ss(&s0[n]); in[1] = _mm_load_ss(&s1[n]); @@ -249,12 +260,12 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src] in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8); _mm_storel_epi64((__m128i*)d, out[0]); - d += n_src; + d += n_channels; } } static void -conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s32_4_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; const float *s0 = s[0], *s1 = s[1], *s2 = s[2], *s3 = s[3]; @@ -265,14 +276,19 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src] __m128 int_max = _mm_set1_ps(S24_MAX_F); __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); - unrolled = n_samples / 4; - n_samples = n_samples & 3; + if (SPA_IS_ALIGNED(s0, 16) && + SPA_IS_ALIGNED(s1, 16) && + SPA_IS_ALIGNED(s2, 16) && + SPA_IS_ALIGNED(s3, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; for(n = 0; unrolled--; n += 4) { - in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max); - in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max); - in[2] = _mm_mul_ps(_mm_loadu_ps(&s2[n]), int_max); - in[3] = _mm_mul_ps(_mm_loadu_ps(&s3[n]), int_max); + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); + in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max); + in[2] = _mm_mul_ps(_mm_load_ps(&s2[n]), int_max); + in[3] = _mm_mul_ps(_mm_load_ps(&s3[n]), int_max); in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); in[1] = _mm_min_ps(int_max, _mm_max_ps(in[1], int_min)); @@ -294,13 +310,13 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src] out[2] = _mm_unpacklo_epi64(t[2], t[3]); out[3] = _mm_unpackhi_epi64(t[2], t[3]); - _mm_storeu_si128((__m128i*)(d + 0*n_src), out[0]); - _mm_storeu_si128((__m128i*)(d + 1*n_src), out[1]); - _mm_storeu_si128((__m128i*)(d + 2*n_src), out[2]); - _mm_storeu_si128((__m128i*)(d + 3*n_src), out[3]); - d += 4*n_src; + _mm_storeu_si128((__m128i*)(d + 0*n_channels), out[0]); + _mm_storeu_si128((__m128i*)(d + 1*n_channels), out[1]); + _mm_storeu_si128((__m128i*)(d + 2*n_channels), out[2]); + _mm_storeu_si128((__m128i*)(d + 3*n_channels), out[3]); + d += 4*n_channels; } - for(; n_samples--; n++) { + for(; n < n_samples; n++) { in[0] = _mm_load_ss(&s0[n]); in[1] = _mm_load_ss(&s1[n]); in[2] = _mm_load_ss(&s2[n]); @@ -314,26 +330,26 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src] in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8); _mm_storeu_si128((__m128i*)d, out[0]); - d += n_src; + d += n_channels; } } static void -conv_f32d_to_s32_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s32_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int32_t *d = dst[0]; int i = 0; - for(; i + 3 < n_src; i += 4) - conv_f32d_to_s32_4_sse2(data, &d[i], n_src, &src[i], n_samples); - for(; i + 1 < n_src; i += 2) - conv_f32d_to_s32_2_sse2(data, &d[i], n_src, &src[i], n_samples); - for(; i < n_src; i++) - conv_f32d_to_s32_1_sse2(data, &d[i], n_src, &src[i], n_samples); + for(; i + 3 < n_channels; i += 4) + conv_f32d_to_s32_4_sse2(data, &d[i], &src[i], n_channels, n_samples); + for(; i + 1 < n_channels; i += 2) + conv_f32d_to_s32_2_sse2(data, &d[i], &src[i], n_channels, n_samples); + for(; i < n_channels; i++) + conv_f32d_to_s32_1_sse2(data, &d[i], &src[i], n_channels, n_samples); } static void -conv_f32d_to_s16_1_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s16_1_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; const float *s0 = s[0]; @@ -344,52 +360,59 @@ conv_f32d_to_s16_1_sse2(void *data, void *dst, int n_src, const void *src[n_src] __m128 int_max = _mm_set1_ps(S16_MAX_F); __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); - unrolled = n_samples / 8; - n_samples = n_samples & 7; + if (SPA_IS_ALIGNED(s0, 16)) + unrolled = n_samples / 8; + else + unrolled = 0; for(n = 0; unrolled--; n += 8) { - in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max); - in[1] = _mm_mul_ps(_mm_loadu_ps(&s0[n+4]), int_max); + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); + in[1] = _mm_mul_ps(_mm_load_ps(&s0[n+4]), int_max); out[0] = _mm_cvtps_epi32(in[0]); out[1] = _mm_cvtps_epi32(in[1]); out[0] = _mm_packs_epi32(out[0], out[1]); - d[0*n_src] = _mm_extract_epi16(out[0], 0); - d[1*n_src] = _mm_extract_epi16(out[0], 1); - d[2*n_src] = _mm_extract_epi16(out[0], 2); - d[3*n_src] = _mm_extract_epi16(out[0], 3); - d[4*n_src] = _mm_extract_epi16(out[0], 4); - d[5*n_src] = _mm_extract_epi16(out[0], 5); - d[6*n_src] = _mm_extract_epi16(out[0], 6); - d[7*n_src] = _mm_extract_epi16(out[0], 7); - d += 8*n_src; + d[0*n_channels] = _mm_extract_epi16(out[0], 0); + d[1*n_channels] = _mm_extract_epi16(out[0], 1); + d[2*n_channels] = _mm_extract_epi16(out[0], 2); + d[3*n_channels] = _mm_extract_epi16(out[0], 3); + d[4*n_channels] = _mm_extract_epi16(out[0], 4); + d[5*n_channels] = _mm_extract_epi16(out[0], 5); + d[6*n_channels] = _mm_extract_epi16(out[0], 6); + d[7*n_channels] = _mm_extract_epi16(out[0], 7); + d += 8*n_channels; } - for(; n_samples--; n++) { + for(; n < n_samples; n++) { + fprintf(stderr, "%p %d %d %d\n", s0, n_samples, n, n_channels); + spa_assert_not_reached(); in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_max); in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min)); *d = _mm_cvtss_si32(in[0]); - d += n_src; + d += n_channels; } } static void -conv_f32d_to_s16_2_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s16_2_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; const float *s0 = s[0], *s1 = s[1]; int16_t *d = dst; - int n = 0, unrolled; + int n, unrolled; __m128 in[2]; __m128i out[4], t[2]; __m128 int_max = _mm_set1_ps(S16_MAX_F); __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); - unrolled = n_samples / 4; - n_samples = n_samples & 3; + if (SPA_IS_ALIGNED(s0, 16) && + SPA_IS_ALIGNED(s1, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; - for(; unrolled--; n += 4) { - in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max); - in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max); + for(n = 0; unrolled--; n += 4) { + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); + in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max); t[0] = _mm_cvtps_epi32(in[0]); t[1] = _mm_cvtps_epi32(in[1]); @@ -402,31 +425,33 @@ conv_f32d_to_s16_2_sse2(void *data, void *dst, int n_src, const void *src[n_src] out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2)); out[3] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(2, 1, 0, 3)); - *((uint32_t*)(d + 0*n_src)) = _mm_cvtsi128_si32(out[0]); - *((uint32_t*)(d + 1*n_src)) = _mm_cvtsi128_si32(out[1]); - *((uint32_t*)(d + 2*n_src)) = _mm_cvtsi128_si32(out[2]); - *((uint32_t*)(d + 3*n_src)) = _mm_cvtsi128_si32(out[3]); - d += 4*n_src; + *((int32_t*)(d + 0*n_channels)) = _mm_cvtsi128_si32(out[0]); + *((int32_t*)(d + 1*n_channels)) = _mm_cvtsi128_si32(out[1]); + *((int32_t*)(d + 2*n_channels)) = _mm_cvtsi128_si32(out[2]); + *((int32_t*)(d + 3*n_channels)) = _mm_cvtsi128_si32(out[3]); + d += 4*n_channels; } - for(; n_samples--; n++) { + for(; n < n_samples; n++) { + fprintf(stderr, "%p %p %d %d %d\n", s0, s1, n_samples, n, n_channels); + spa_assert_not_reached(); in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_max); in[1] = _mm_mul_ss(_mm_load_ss(&s1[n]), int_max); in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min)); in[1] = _mm_min_ss(int_max, _mm_max_ss(in[1], int_min)); d[0] = _mm_cvtss_si32(in[0]); d[1] = _mm_cvtss_si32(in[1]); - d += n_src; + d += n_channels; } } static void -conv_f32d_to_s16_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s16_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int16_t *d = dst[0]; int i = 0; - for(; i + 1 < n_src; i += 2) - conv_f32d_to_s16_2_sse2(data, &d[i], n_src, &src[i], n_samples); - for(; i < n_src; i++) - conv_f32d_to_s16_1_sse2(data, &d[i], n_src, &src[i], n_samples); + for(; i + 1 < n_channels; i += 2) + conv_f32d_to_s16_2_sse2(data, &d[i], &src[i], n_channels, n_samples); + for(; i < n_channels; i++) + conv_f32d_to_s16_1_sse2(data, &d[i], &src[i], n_channels, n_samples); } diff --git a/spa/plugins/audioconvert/fmt-ops.c b/spa/plugins/audioconvert/fmt-ops.c index 00256a096..08be6b832 100644 --- a/spa/plugins/audioconvert/fmt-ops.c +++ b/spa/plugins/audioconvert/fmt-ops.c @@ -30,6 +30,8 @@ #include #include +#include + #define U8_MIN 0 #define U8_MAX 255 #define U8_SCALE 127.5f @@ -85,43 +87,68 @@ static inline void write_s24(void *dst, int32_t val) #endif static void -conv_copy8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_copy8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i; - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) memcpy(dst[i], src[i], n_samples); } static void -conv_copy16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_copy8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + memcpy(dst[0], src[0], n_samples * n_channels); +} + + +static void +conv_copy16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i; - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) memcpy(dst[i], src[i], n_samples * sizeof(int16_t)); } static void -conv_copy24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_copy16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + memcpy(dst[0], src[0], n_samples * sizeof(int16_t) * n_channels); +} + +static void +conv_copy24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i; - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) memcpy(dst[i], src[i], n_samples * 3); } static void -conv_copy32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_copy24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + memcpy(dst[0], src[0], n_samples * 3 * n_channels); +} + +static void +conv_copy32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i; - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) memcpy(dst[i], src[i], n_samples * sizeof(int32_t)); } static void -conv_u8_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_copy32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + memcpy(dst[0], src[0], n_samples * sizeof(int32_t) * n_channels); +} + +static void +conv_u8d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const uint8_t *s = src[i]; float *d = dst[i]; @@ -131,37 +158,43 @@ conv_u8_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *s } static void -conv_u8_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_u8_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_u8d_to_f32d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_u8_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const uint8_t *s = src[0]; float **d = (float **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = U8_TO_F32(*s++); } } static void -conv_u8d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_u8d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const uint8_t **s = (const uint8_t **) src; float *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = U8_TO_F32(s[i][j]); } } static void -conv_s16_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s16d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const int16_t *s = src[i]; float *d = dst[i]; for (j = 0; j < n_samples; j++) @@ -170,37 +203,43 @@ conv_s16_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void * } static void -conv_s16_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s16_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_s16d_to_f32d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_s16_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int16_t *s = src[0]; float **d = (float **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = S16_TO_F32(*s++); } } static void -conv_s16d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s16d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int16_t **s = (const int16_t **) src; float *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = S16_TO_F32(s[i][j]); } } static void -conv_s32_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s32d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const int32_t *s = src[i]; float *d = dst[i]; @@ -210,38 +249,43 @@ conv_s32_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void * } static void -conv_s32_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s32_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_s32d_to_f32d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_s32_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int32_t *s = src[0]; float **d = (float **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = S32_TO_F32(*s++); } } static void -conv_s32d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s32d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int32_t **s = (const int32_t **) src; float *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = S32_TO_F32(s[i][j]); } } - static void -conv_s24_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s24d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const int8_t *s = src[i]; float *d = dst[i]; @@ -253,14 +297,20 @@ conv_s24_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void * } static void -conv_s24_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s24_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_s24d_to_f32d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_s24_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const uint8_t *s = src[0]; float **d = (float **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) { + for (i = 0; i < n_channels; i++) { d[i][j] = S24_TO_F32(read_s24(s)); s += 3; } @@ -268,25 +318,25 @@ conv_s24_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void } static void -conv_s24d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s24d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const uint8_t **s = (const uint8_t **) src; float *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { *d++ = S24_TO_F32(read_s24(&s[i][j*3])); } } } static void -conv_s24_32_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s24_32d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const int32_t *s = src[i]; float *d = dst[i]; @@ -296,37 +346,43 @@ conv_s24_32_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const voi } static void -conv_s24_32_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s24_32_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_s24_32d_to_f32d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_s24_32_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int32_t *s = src[0]; float **d = (float **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = S24_TO_F32(*s++); } } static void -conv_s24_32d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_s24_32d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int32_t **s = (const int32_t **) src; float *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = S24_TO_F32(s[i][j]); } } static void -conv_f32_to_u8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_u8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const float *s = src[i]; uint8_t *d = dst[i]; @@ -336,37 +392,43 @@ conv_f32_to_u8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *s } static void -conv_f32_to_u8d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32_to_u8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_f32d_to_u8d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_f32_to_u8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float *s = src[0]; uint8_t **d = (uint8_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = F32_TO_U8(*s++); } } static void -conv_f32d_to_u8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_u8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; uint8_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = F32_TO_U8(s[i][j]); } } static void -conv_f32_to_s16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const float *s = src[i]; int16_t *d = dst[i]; @@ -376,37 +438,43 @@ conv_f32_to_s16(void *data, int n_dst, void *dst[n_dst], int n_src, const void * } static void -conv_f32_to_s16d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32_to_s16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_f32d_to_s16d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_f32_to_s16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float *s = src[0]; int16_t **d = (int16_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = F32_TO_S16(*s++); } } static void -conv_f32d_to_s16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; int16_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = F32_TO_S16(s[i][j]); } } static void -conv_f32_to_s32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const float *s = src[i]; int32_t *d = dst[i]; @@ -416,27 +484,33 @@ conv_f32_to_s32(void *data, int n_dst, void *dst[n_dst], int n_src, const void * } static void -conv_f32_to_s32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32_to_s32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_f32d_to_s32d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_f32_to_s32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float *s = src[0]; int32_t **d = (int32_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = F32_TO_S32(*s++); } } static void -conv_f32d_to_s32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; int32_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = F32_TO_S32(s[i][j]); } } @@ -444,11 +518,11 @@ conv_f32d_to_s32(void *data, int n_dst, void *dst[n_dst], int n_src, const void static void -conv_f32_to_s24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const float *s = src[i]; uint8_t *d = dst[i]; @@ -460,28 +534,34 @@ conv_f32_to_s24(void *data, int n_dst, void *dst[n_dst], int n_src, const void * } static void -conv_f32_to_s24d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32_to_s24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_f32d_to_s24d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_f32_to_s24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float *s = src[0]; uint8_t **d = (uint8_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) { + for (i = 0; i < n_channels; i++) { write_s24(&d[i][j*3], F32_TO_S24(*s++)); } } } static void -conv_f32d_to_s24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; uint8_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { write_s24(d, F32_TO_S24(s[i][j])); d += 3; } @@ -490,11 +570,11 @@ conv_f32d_to_s24(void *data, int n_dst, void *dst[n_dst], int n_src, const void static void -conv_f32_to_s24_32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s24_32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { int i, j; - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { const float *s = src[i]; int32_t *d = dst[i]; @@ -504,66 +584,72 @@ conv_f32_to_s24_32(void *data, int n_dst, void *dst[n_dst], int n_src, const voi } static void -conv_f32_to_s24_32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32_to_s24_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +{ + conv_f32d_to_s24_32d(data, dst, src, 1, n_samples * n_channels); +} + +static void +conv_f32_to_s24_32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float *s = src[0]; int32_t **d = (int32_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = F32_TO_S24(*s++); } } static void -conv_f32d_to_s24_32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +conv_f32d_to_s24_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const float **s = (const float **) src; int32_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = F32_TO_S24(s[i][j]); } } static void -deinterleave_8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +deinterleave_8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const uint8_t *s = src[0]; uint8_t **d = (uint8_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = *s++; } } static void -deinterleave_16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +deinterleave_16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const uint16_t *s = src[0]; uint16_t **d = (uint16_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = *s++; } } static void -deinterleave_24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +deinterleave_24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const uint8_t *s = src[0]; uint8_t **d = (uint8_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) { + for (i = 0; i < n_channels; i++) { write_s24(&d[i][j*3], read_s24(s)); s += 3; } @@ -571,53 +657,53 @@ deinterleave_24(void *data, int n_dst, void *dst[n_dst], int n_src, const void * } static void -deinterleave_32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +deinterleave_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const uint32_t *s = src[0]; uint32_t **d = (uint32_t **) dst; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_dst; i++) + for (i = 0; i < n_channels; i++) d[i][j] = *s++; } } static void -interleave_8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +interleave_8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int8_t **s = (const int8_t **) src; uint8_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = s[i][j]; } } static void -interleave_16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +interleave_16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int16_t **s = (const int16_t **) src; uint16_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = s[i][j]; } } static void -interleave_24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +interleave_24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int8_t **s = (const int8_t **) src; uint8_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) { + for (i = 0; i < n_channels; i++) { write_s24(d, read_s24(&s[i][j*3])); d += 3; } @@ -625,21 +711,21 @@ interleave_24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *sr } static void -interleave_32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples) +interleave_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) { const int32_t **s = (const int32_t **) src; uint32_t *d = dst[0]; int i, j; for (j = 0; j < n_samples; j++) { - for (i = 0; i < n_src; i++) + for (i = 0; i < n_channels; i++) *d++ = s[i][j]; } } -typedef void (*convert_func_t) (void *data, int n_dst, void *dst[n_dst], - int n_src, const void *src[n_src], int n_samples); +typedef void (*convert_func_t) (void *data, void *dst[], const void *src[], + int n_channels, int n_samples); static const struct conv_info { uint32_t src_fmt; @@ -652,13 +738,13 @@ static const struct conv_info { { /* to f32 */ { SPA_AUDIO_FORMAT_U8, SPA_AUDIO_FORMAT_F32, 0, conv_u8_to_f32 }, - { SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_F32P, 0, conv_u8_to_f32 }, + { SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_F32P, 0, conv_u8d_to_f32d }, { SPA_AUDIO_FORMAT_U8, SPA_AUDIO_FORMAT_F32P, 0, conv_u8_to_f32d }, { SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_F32, 0, conv_u8d_to_f32 }, { SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32, 0, conv_s16_to_f32 }, - { SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, conv_s16_to_f32 }, + { SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, conv_s16d_to_f32d }, #if defined (__SSE2__) { SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s16_to_f32d_sse2 }, #endif @@ -666,17 +752,17 @@ static const struct conv_info { { SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32, 0, conv_s16d_to_f32 }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_F32, 0, conv_copy32 }, - { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_F32P, 0, conv_copy32 }, + { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_F32P, 0, conv_copy32d }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_F32P, 0, deinterleave_32 }, { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_F32, 0, interleave_32 }, { SPA_AUDIO_FORMAT_S32, SPA_AUDIO_FORMAT_F32, 0, conv_s32_to_f32 }, - { SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_F32P, 0, conv_s32_to_f32 }, + { SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_F32P, 0, conv_s32d_to_f32d }, { SPA_AUDIO_FORMAT_S32, SPA_AUDIO_FORMAT_F32P, 0, conv_s32_to_f32d }, { SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_F32, 0, conv_s32d_to_f32 }, { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32, 0, conv_s24_to_f32 }, - { SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24_to_f32 }, + { SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24d_to_f32d }, #if defined (__SSE2__) { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s24_to_f32d_sse2 }, #endif @@ -684,18 +770,18 @@ static const struct conv_info { { SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_F32, 0, conv_s24d_to_f32 }, { SPA_AUDIO_FORMAT_S24_32, SPA_AUDIO_FORMAT_F32, 0, conv_s24_32_to_f32 }, - { SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24_32_to_f32 }, + { SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24_32d_to_f32d }, { SPA_AUDIO_FORMAT_S24_32, SPA_AUDIO_FORMAT_F32P, 0, conv_s24_32_to_f32d }, { SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_F32, 0, conv_s24_32d_to_f32 }, /* from f32 */ { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_U8, 0, conv_f32_to_u8 }, - { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_U8P, 0, conv_f32_to_u8 }, + { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_U8P, 0, conv_f32d_to_u8d }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_U8P, 0, conv_f32_to_u8d }, { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_U8, 0, conv_f32d_to_u8 }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S16, 0, conv_f32_to_s16 }, - { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16P, 0, conv_f32_to_s16 }, + { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16P, 0, conv_f32d_to_s16d }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S16P, 0, conv_f32_to_s16d }, #if defined (__SSE2__) { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16, FEATURE_SSE2, conv_f32d_to_s16_sse2 }, @@ -703,7 +789,7 @@ static const struct conv_info { { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16, 0, conv_f32d_to_s16 }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S32, 0, conv_f32_to_s32 }, - { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S32P, 0, conv_f32_to_s32 }, + { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S32P, 0, conv_f32d_to_s32d }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S32P, 0, conv_f32_to_s32d }, #if defined (__SSE2__) { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S32, FEATURE_SSE2, conv_f32d_to_s32_sse2 }, @@ -711,42 +797,42 @@ static const struct conv_info { { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S32, 0, conv_f32d_to_s32 }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S24, 0, conv_f32_to_s24 }, - { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24P, 0, conv_f32_to_s24 }, + { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24P, 0, conv_f32d_to_s24d }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S24P, 0, conv_f32_to_s24d }, { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24, 0, conv_f32d_to_s24 }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S24_32, 0, conv_f32_to_s24_32 }, - { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24_32P, 0, conv_f32_to_s24_32 }, + { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24_32P, 0, conv_f32d_to_s24_32d }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S24_32P, 0, conv_f32_to_s24_32d }, { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24_32, 0, conv_f32d_to_s24_32 }, /* u8 */ { SPA_AUDIO_FORMAT_U8, SPA_AUDIO_FORMAT_U8, 0, conv_copy8 }, - { SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_U8P, 0, conv_copy8 }, + { SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_U8P, 0, conv_copy8d }, { SPA_AUDIO_FORMAT_U8, SPA_AUDIO_FORMAT_U8P, 0, deinterleave_8 }, { SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_U8, 0, interleave_8 }, /* s16 */ { SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_S16, 0, conv_copy16 }, - { SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_S16P, 0, conv_copy16 }, + { SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_S16P, 0, conv_copy16d }, { SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_S16P, 0, deinterleave_16 }, { SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_S16, 0, interleave_16 }, /* s32 */ { SPA_AUDIO_FORMAT_S32, SPA_AUDIO_FORMAT_S32, 0, conv_copy32 }, - { SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_S32P, 0, conv_copy32 }, + { SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_S32P, 0, conv_copy32d }, { SPA_AUDIO_FORMAT_S32, SPA_AUDIO_FORMAT_S32P, 0, deinterleave_32 }, { SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_S32, 0, interleave_32 }, /* s24 */ { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_S24, 0, conv_copy24 }, - { SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_S24P, 0, conv_copy24 }, + { SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_S24P, 0, conv_copy24d }, { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_S24P, 0, deinterleave_24 }, { SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_S24, 0, interleave_24 }, /* s24_32 */ { SPA_AUDIO_FORMAT_S24_32, SPA_AUDIO_FORMAT_S24_32, 0, conv_copy32 }, - { SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_S24_32P, 0, conv_copy32 }, + { SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_S24_32P, 0, conv_copy32d }, { SPA_AUDIO_FORMAT_S24_32, SPA_AUDIO_FORMAT_S24_32P, 0, deinterleave_32 }, { SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_S24_32, 0, interleave_32 }, }; diff --git a/spa/plugins/audioconvert/fmtconvert.c b/spa/plugins/audioconvert/fmtconvert.c index a99216f5a..c49f6c1d1 100644 --- a/spa/plugins/audioconvert/fmtconvert.c +++ b/spa/plugins/audioconvert/fmtconvert.c @@ -115,8 +115,6 @@ struct impl { uint32_t cpu_flags; convert_func_t convert; - - float empty[4096]; }; #define CHECK_PORT(this,d,id) (id == 0) @@ -656,7 +654,7 @@ impl_node_port_use_buffers(struct spa_node *node, { struct impl *this; struct port *port; - uint32_t i, size = SPA_ID_INVALID; + uint32_t i, size = SPA_ID_INVALID, j; spa_return_val_if_fail(node != NULL, -EINVAL); @@ -674,6 +672,7 @@ impl_node_port_use_buffers(struct spa_node *node, for (i = 0; i < n_buffers; i++) { struct buffer *b; + uint32_t n_datas = buffers[i]->n_datas; struct spa_data *d = buffers[i]->datas; b = &port->buffers[i]; @@ -682,19 +681,35 @@ impl_node_port_use_buffers(struct spa_node *node, b->outbuf = buffers[i]; b->h = spa_buffer_find_meta_data(buffers[i], SPA_META_Header, sizeof(*b->h)); + if (n_datas != port->blocks) { + spa_log_error(this->log, NAME " %p: expected %d blocks on buffer %d", this, + port->blocks, i); + return -EINVAL; + } + if (size == SPA_ID_INVALID) size = d[0].maxsize; else - if (size != d[0].maxsize) + if (size != d[0].maxsize) { + spa_log_error(this->log, NAME " %p: expected size %d on buffer %d", this, + size, i); return -EINVAL; + } - if (!((d[0].type == SPA_DATA_MemPtr || - d[0].type == SPA_DATA_MemFd || - d[0].type == SPA_DATA_DmaBuf) && d[0].data != NULL)) { - spa_log_error(this->log, NAME " %p: invalid memory on buffer %p", this, - buffers[i]); - return -EINVAL; + for (j = 0; j < n_datas; j++) { + if (!((d[j].type == SPA_DATA_MemPtr || + d[j].type == SPA_DATA_MemFd || + d[j].type == SPA_DATA_DmaBuf) && d[j].data != NULL)) { + spa_log_error(this->log, NAME " %p: invalid memory %d on buffer %d", + this, j, i); + return -EINVAL; + } + if (!SPA_IS_ALIGNED(d[j].data, 16)) { + spa_log_warn(this->log, NAME " %p: memory %d on buffer %d not aligned", + this, j, i); + } } + if (direction == SPA_DIRECTION_OUTPUT) spa_list_append(&port->queue, &b->link); else @@ -878,7 +893,7 @@ static int impl_node_process(struct spa_node *node) spa_log_trace(this->log, NAME " %p: n_src:%d n_dst:%d size:%d maxsize:%d n_samples:%d", this, n_src_datas, n_dst_datas, size, maxsize, n_samples); - this->convert(this, n_dst_datas, dst_datas, n_src_datas, src_datas, n_samples); + this->convert(this, dst_datas, src_datas, SPA_MAX(n_src_datas, n_dst_datas), n_samples); inio->status = SPA_STATUS_NEED_BUFFER; res |= SPA_STATUS_NEED_BUFFER; diff --git a/spa/plugins/audioconvert/merger.c b/spa/plugins/audioconvert/merger.c index 4dd17a663..cbcd8f357 100644 --- a/spa/plugins/audioconvert/merger.c +++ b/spa/plugins/audioconvert/merger.c @@ -42,7 +42,7 @@ #define DEFAULT_RATE 48000 #define DEFAULT_CHANNELS 2 -#define MAX_SAMPLES 1024 +#define MAX_SAMPLES 2048 #define MAX_BUFFERS 64 #define MAX_PORTS 128 @@ -100,7 +100,7 @@ struct impl { bool monitor; bool have_profile; - float empty[MAX_SAMPLES]; + float empty[MAX_SAMPLES + 15]; }; #define CHECK_IN_PORT(this,d,p) ((d) == SPA_DIRECTION_INPUT && (p) < this->port_count) @@ -750,7 +750,7 @@ impl_node_port_use_buffers(struct spa_node *node, { struct impl *this; struct port *port; - uint32_t i; + uint32_t i, j; spa_return_val_if_fail(node != NULL, -EINVAL); @@ -769,6 +769,7 @@ impl_node_port_use_buffers(struct spa_node *node, for (i = 0; i < n_buffers; i++) { struct buffer *b; + uint32_t n_datas = buffers[i]->n_datas; struct spa_data *d = buffers[i]->datas; b = &port->buffers[i]; @@ -776,13 +777,25 @@ impl_node_port_use_buffers(struct spa_node *node, b->flags = 0; b->buf = buffers[i]; - if (!((d[0].type == SPA_DATA_MemPtr || - d[0].type == SPA_DATA_MemFd || - d[0].type == SPA_DATA_DmaBuf) && d[0].data != NULL)) { - spa_log_error(this->log, NAME " %p: invalid memory on buffer %p %d %p", this, - buffers[i], d[0].type, d[0].data); + if (n_datas != port->blocks) { + spa_log_error(this->log, NAME " %p: invalid blocks %d on buffer %d", + this, n_datas, i); return -EINVAL; } + + for (j = 0; j < n_datas; j++) { + if (!((d[j].type == SPA_DATA_MemPtr || + d[j].type == SPA_DATA_MemFd || + d[j].type == SPA_DATA_DmaBuf) && d[j].data != NULL)) { + spa_log_error(this->log, NAME " %p: invalid memory %d on buffer %d %d %p", + this, j, i, d[j].type, d[j].data); + return -EINVAL; + } + if (!SPA_IS_ALIGNED(d[j].data, 16)) + spa_log_warn(this->log, NAME " %p: memory %d on buffer %d not aligned", + this, j, i); + } + if (direction == SPA_DIRECTION_OUTPUT) queue_buffer(this, port, i); } @@ -960,7 +973,7 @@ static int impl_node_process(struct spa_node *node) struct port *inport = GET_IN_PORT(this, i); if (get_in_buffer(this, inport, &sbuf) < 0) { - src_datas[n_src_datas++] = this->empty; + src_datas[n_src_datas++] = SPA_PTR_ALIGN(this->empty, 16, void); continue; } @@ -987,7 +1000,7 @@ static int impl_node_process(struct spa_node *node) n_samples * outport->stride); } - this->convert(this, n_dst_datas, dst_datas, n_src_datas, src_datas, n_samples); + this->convert(this, dst_datas, src_datas, SPA_MAX(n_dst_datas, n_src_datas), n_samples); return res | SPA_STATUS_HAVE_BUFFER; } diff --git a/spa/plugins/audioconvert/splitter.c b/spa/plugins/audioconvert/splitter.c index 06e82da16..2d2035f82 100644 --- a/spa/plugins/audioconvert/splitter.c +++ b/spa/plugins/audioconvert/splitter.c @@ -44,7 +44,7 @@ #define DEFAULT_CHANNELS 2 #define DEFAULT_MASK (1LL << SPA_AUDIO_CHANNEL_FL) | (1LL << SPA_AUDIO_CHANNEL_FR) -#define MAX_SAMPLES 1024 +#define MAX_SAMPLES 2048 #define MAX_BUFFERS 64 #define MAX_PORTS 128 @@ -100,7 +100,7 @@ struct impl { bool have_profile; - float empty[MAX_SAMPLES]; + float empty[MAX_SAMPLES + 15]; }; #define CHECK_OUT_PORT(this,d,p) ((d) == SPA_DIRECTION_OUTPUT && (p) < this->port_count) @@ -754,10 +754,13 @@ impl_node_port_use_buffers(struct spa_node *node, if (!((d[0].type == SPA_DATA_MemPtr || d[0].type == SPA_DATA_MemFd || d[0].type == SPA_DATA_DmaBuf) && d[0].data != NULL)) { - spa_log_error(this->log, NAME " %p: invalid memory on buffer %p %d %p", this, - buffers[i], d[0].type, d[0].data); + spa_log_error(this->log, NAME " %p: invalid memory on buffer %d %d %p", this, + i, d[0].type, d[0].data); return -EINVAL; } + if (!SPA_IS_ALIGNED(d[0].data, 16)) + spa_log_warn(this->log, NAME " %p: memory on buffer %d not aligned", this, i); + if (direction == SPA_DIRECTION_OUTPUT) queue_buffer(this, port, i); } @@ -903,7 +906,7 @@ static int impl_node_process(struct spa_node *node) if ((dbuf = dequeue_buffer(this, outport)) == NULL) { outio->status = -EPIPE; empty: - dst_datas[n_dst_datas++] = this->empty; + dst_datas[n_dst_datas++] = SPA_PTR_ALIGN(this->empty, 16, void); continue; } @@ -927,7 +930,7 @@ static int impl_node_process(struct spa_node *node) spa_log_trace(this->log, NAME " %p: %d %d %d %d %d", this, n_src_datas, n_dst_datas, n_samples, maxsize, inport->stride); - this->convert(this, n_dst_datas, dst_datas, n_src_datas, src_datas, n_samples); + this->convert(this, dst_datas, src_datas, SPA_MAX(n_dst_datas, n_src_datas), n_samples); inio->status = SPA_STATUS_NEED_BUFFER; res |= SPA_STATUS_NEED_BUFFER; diff --git a/spa/plugins/audioconvert/test-fmt-ops.c b/spa/plugins/audioconvert/test-fmt-ops.c index f52dca11f..797c074df 100644 --- a/spa/plugins/audioconvert/test-fmt-ops.c +++ b/spa/plugins/audioconvert/test-fmt-ops.c @@ -33,7 +33,7 @@ #include "fmt-ops.c" -#define N_SAMPLES 29 +#define N_SAMPLES 253 #define N_CHANNELS 11 static uint8_t samp_in[N_SAMPLES * 4]; @@ -47,7 +47,7 @@ static void run_test(const char *name, { const void *ip[N_CHANNELS]; void *tp[N_CHANNELS]; - int i, j, ic, oc, ns; + int i, j; const uint8_t *in8 = in, *out8 = out; for (j = 0; j < N_SAMPLES; j++) { @@ -62,16 +62,16 @@ static void run_test(const char *name, tp[0] = temp_in; switch(in_size) { case 1: - interleave_8(NULL, 1, tp, N_CHANNELS, ip, N_SAMPLES); + interleave_8(NULL, tp, ip, N_CHANNELS, N_SAMPLES); break; case 2: - interleave_16(NULL, 1, tp, N_CHANNELS, ip, N_SAMPLES); + interleave_16(NULL, tp, ip, N_CHANNELS, N_SAMPLES); break; case 3: - interleave_24(NULL, 1, tp, N_CHANNELS, ip, N_SAMPLES); + interleave_24(NULL, tp, ip, N_CHANNELS, N_SAMPLES); break; case 4: - interleave_32(NULL, 1, tp, N_CHANNELS, ip, N_SAMPLES); + interleave_32(NULL, tp, ip, N_CHANNELS, N_SAMPLES); break; default: fprintf(stderr, "unknown size %zd\n", in_size); @@ -84,16 +84,11 @@ static void run_test(const char *name, for (j = 0; j < N_CHANNELS; j++) tp[j] = &temp_out[j * N_SAMPLES * out_size]; - ic = in_packed ? 1 : N_CHANNELS; - oc = out_packed ? 1 : N_CHANNELS; - ns = (in_packed && out_packed) ? N_SAMPLES * N_CHANNELS : N_SAMPLES; - - func(NULL, oc, tp, ic, ip, ns); + func(NULL, tp, ip, N_CHANNELS, N_SAMPLES); fprintf(stderr, "test %s:\n", name); if (out_packed) { const uint8_t *d = tp[0], *s = samp_out; - spa_debug_mem(0, d, N_SAMPLES * N_CHANNELS * out_size); for (i = 0; i < N_SAMPLES; i++) { for (j = 0; j < N_CHANNELS; j++) { spa_assert(memcmp(d, s, out_size) == 0); @@ -119,6 +114,8 @@ static void test_f32_u8(void) false, true, conv_f32d_to_u8); run_test("test_f32_u8d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, false, conv_f32_to_u8d); + run_test("test_f32d_u8d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_f32d_to_u8d); } static void test_u8_f32(void) @@ -132,6 +129,8 @@ static void test_u8_f32(void) false, true, conv_u8d_to_f32); run_test("test_u8_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, false, conv_u8_to_f32d); + run_test("test_u8d_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_u8d_to_f32d); } static void test_f32_s16(void) @@ -145,6 +144,8 @@ static void test_f32_s16(void) false, true, conv_f32d_to_s16); run_test("test_f32_s16d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, false, conv_f32_to_s16d); + run_test("test_f32d_s16d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_f32d_to_s16d); } static void test_s16_f32(void) @@ -158,6 +159,8 @@ static void test_s16_f32(void) false, true, conv_s16d_to_f32); run_test("test_s16_f32", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, true, conv_s16_to_f32); + run_test("test_s16d_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_s16d_to_f32d); } static void test_f32_s32(void) @@ -172,6 +175,8 @@ static void test_f32_s32(void) false, true, conv_f32d_to_s32); run_test("test_f32_s32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, false, conv_f32_to_s32d); + run_test("test_f32d_s32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_f32d_to_s32d); } static void test_s32_f32(void) @@ -185,6 +190,8 @@ static void test_s32_f32(void) false, true, conv_s32d_to_f32); run_test("test_s32_f32", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, true, conv_s32_to_f32); + run_test("test_s32d_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_s32d_to_f32d); } static void test_f32_s24(void) @@ -193,9 +200,14 @@ static void test_f32_s24(void) const uint8_t out[] = { 0x00, 0x00, 0x00, 0xff, 0xff, 0x7f, 0x01, 0x00, 0x80, 0xff, 0xff, 0x3f, 0x01, 0x00, 0xc0, 0xff, 0xff, 0x7f, 0x01, 0x00, 0x80 }; - run_test("test_f32_s24", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), true, true, conv_f32_to_s24); - run_test("test_f32d_s24", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), false, true, conv_f32d_to_s24); - run_test("test_f32_s24d", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), true, false, conv_f32_to_s24d); + run_test("test_f32_s24", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), + true, true, conv_f32_to_s24); + run_test("test_f32d_s24", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), + false, true, conv_f32d_to_s24); + run_test("test_f32_s24d", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), + true, false, conv_f32_to_s24d); + run_test("test_f32d_s24d", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), + false, false, conv_f32d_to_s24d); } static void test_s24_f32(void) @@ -204,9 +216,14 @@ static void test_s24_f32(void) 0xff, 0xff, 0x3f, 0x01, 0x00, 0xc0, }; const float out[] = { 0.0f, 1.0f, -1.0f, 0.4999999404f, -0.4999999404f, }; - run_test("test_s24_f32d", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, false, conv_s24_to_f32d); - run_test("test_s24d_f32", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), false, true, conv_s24d_to_f32); - run_test("test_s24_f32", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, true, conv_s24_to_f32); + run_test("test_s24_f32d", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), + true, false, conv_s24_to_f32d); + run_test("test_s24d_f32", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, true, conv_s24d_to_f32); + run_test("test_s24_f32", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), + true, true, conv_s24_to_f32); + run_test("test_s24d_f32d", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_s24d_to_f32d); } static void test_f32_s24_32(void) @@ -221,6 +238,8 @@ static void test_f32_s24_32(void) false, true, conv_f32d_to_s24_32); run_test("test_f32_s24_32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, false, conv_f32_to_s24_32d); + run_test("test_f32d_s24_32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_f32d_to_s24_32d); } static void test_s24_32_f32(void) @@ -234,6 +253,8 @@ static void test_s24_32_f32(void) false, true, conv_s24_32d_to_f32); run_test("test_s24_32_f32", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, true, conv_s24_32_to_f32); + run_test("test_s24_32d_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), + false, false, conv_s24_32d_to_f32d); } int main(int argc, char *argv[]) diff --git a/src/modules/module-audio-dsp/audio-dsp.c b/src/modules/module-audio-dsp/audio-dsp.c index 074d5c2eb..350cecd9c 100644 --- a/src/modules/module-audio-dsp/audio-dsp.c +++ b/src/modules/module-audio-dsp/audio-dsp.c @@ -69,7 +69,7 @@ struct port { struct spa_handle *spa_handle; struct spa_node *spa_node; - float empty[MAX_BUFFER_SIZE]; + float empty[MAX_BUFFER_SIZE + 15]; }; struct node { @@ -101,14 +101,15 @@ static void init_buffer(struct port *port, uint32_t id) b->datas[0].flags = 0; b->datas[0].fd = -1; b->datas[0].mapoffset = 0; - b->datas[0].maxsize = sizeof(port->empty); - b->datas[0].data = port->empty; + b->datas[0].maxsize = SPA_ROUND_DOWN_N(sizeof(port->empty), 16); + b->datas[0].data = SPA_PTR_ALIGN(port->empty, 16, void); b->datas[0].chunk = b->chunk; b->datas[0].chunk->offset = 0; b->datas[0].chunk->size = 0; b->datas[0].chunk->stride = 0; port->bufs[id] = &b->buf; memset(port->empty, 0, sizeof(port->empty)); + pw_log_debug("%p %d", b->datas[0].data, b->datas[0].maxsize); } static void init_port(struct port *p, enum spa_direction direction) diff --git a/src/modules/module-audio-dsp/floatmix.c b/src/modules/module-audio-dsp/floatmix.c index d87091f8f..bf9c6508f 100644 --- a/src/modules/module-audio-dsp/floatmix.c +++ b/src/modules/module-audio-dsp/floatmix.c @@ -109,7 +109,7 @@ struct impl { uint32_t stride; bool started; - float empty[MAX_SAMPLES]; + float empty[MAX_SAMPLES + 15]; }; #define CHECK_FREE_IN_PORT(this,d,p) ((d) == SPA_DIRECTION_INPUT && (p) < MAX_PORTS && !this->in_ports[(p)].valid) @@ -632,10 +632,12 @@ impl_node_port_use_buffers(struct spa_node *node, if (!((d[0].type == SPA_DATA_MemPtr || d[0].type == SPA_DATA_MemFd || d[0].type == SPA_DATA_DmaBuf) && d[0].data != NULL)) { - spa_log_error(this->log, NAME " %p: invalid memory on buffer %p", this, - buffers[i]); + spa_log_error(this->log, NAME " %p: invalid memory on buffer %d", this, i); return -EINVAL; } + if (!SPA_IS_ALIGNED(d[0].data, 16)) { + spa_log_warn(this->log, NAME " %p: memory on buffer %d not aligned", this, i); + } if (direction == SPA_DIRECTION_OUTPUT) queue_buffer(this, port, b); } @@ -717,23 +719,27 @@ impl_node_port_send_command(struct spa_node *node, #include static void mix_2(float *dst, float *src1, float *src2, int n_samples) { - int i, unrolled; + int n, unrolled; __m128 in[2]; - unrolled = n_samples / 4; - n_samples &= 3; + if (SPA_IS_ALIGNED(src1, 16) && + SPA_IS_ALIGNED(src2, 16) && + SPA_IS_ALIGNED(dst, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; - for (i = 0; unrolled--; i += 4) { - in[0] = _mm_loadu_ps(&src1[i]), - in[1] = _mm_loadu_ps(&src2[i]), + for (n = 0; unrolled--; n += 4) { + in[0] = _mm_load_ps(&src1[n]), + in[1] = _mm_load_ps(&src2[n]), in[0] = _mm_add_ps(in[0], in[1]); - _mm_storeu_ps(&dst[i], in[0]); + _mm_store_ps(&dst[n], in[0]); } - for (; n_samples--; i++) { - in[0] = _mm_load_ss(&src1[i]), - in[1] = _mm_load_ss(&src2[i]), + for (; n < n_samples; n++) { + in[0] = _mm_load_ss(&src1[n]), + in[1] = _mm_load_ss(&src2[n]), in[0] = _mm_add_ss(in[0], in[1]); - _mm_store_ss(&dst[i], in[0]); + _mm_store_ss(&dst[n], in[0]); } } #else @@ -825,13 +831,13 @@ static int impl_node_process(struct spa_node *node) outb->buffer->n_datas = 1; outb->buffer->datas = outb->datas; - outb->datas[0].data = this->empty; + outb->datas[0].data = SPA_PTR_ALIGN(this->empty, 16, void); outb->datas[0].chunk = outb->chunk; outb->datas[0].chunk->offset = 0; outb->datas[0].chunk->size = n_samples * sizeof(float); outb->datas[0].chunk->stride = sizeof(float); - dst = this->empty; + dst = outb->datas[0].data; if (n_buffers == 0) { memset(dst, 0, n_samples * sizeof(float)); } diff --git a/src/modules/module-client-node/client-node.c b/src/modules/module-client-node/client-node.c index 02b796485..594dac6b6 100644 --- a/src/modules/module-client-node/client-node.c +++ b/src/modules/module-client-node/client-node.c @@ -837,7 +837,7 @@ do_port_use_buffers(struct impl *impl, data_size = 0; for (j = 0; j < buffers[i]->n_metas; j++) { - data_size += buffers[i]->metas[j].size; + data_size += SPA_ROUND_UP_N(buffers[i]->metas[j].size, 8); } for (j = 0; j < buffers[i]->n_datas; j++) { struct spa_data *d = buffers[i]->datas; diff --git a/src/pipewire/link.c b/src/pipewire/link.c index 2d7abdabd..4aa369455 100644 --- a/src/pipewire/link.c +++ b/src/pipewire/link.c @@ -419,6 +419,7 @@ static int alloc_buffers(struct pw_link *this, uint32_t n_datas, size_t *data_sizes, ssize_t *data_strides, + size_t *data_aligns, struct allocation *allocation) { int res; @@ -452,12 +453,13 @@ static int alloc_buffers(struct pw_link *this, metas[n_metas].type = type; metas[n_metas].size = size; - meta_size += metas[n_metas].size; + meta_size += SPA_ROUND_UP_N(metas[n_metas].size, 8); n_metas++; skel_size += sizeof(struct spa_meta); } } data_size += meta_size; + data_size = SPA_ROUND_UP_N(data_size, data_aligns[0]); /* data */ for (i = 0; i < n_datas; i++) { @@ -492,7 +494,7 @@ static int alloc_buffers(struct pw_link *this, m->type = metas[j].type; m->size = metas[j].size; m->data = p; - p = SPA_MEMBER(p, m->size, void); + p = SPA_MEMBER(p, SPA_ROUND_UP_N(m->size, 8), void); } /* pointer to data structure */ b->n_datas = n_datas; @@ -509,7 +511,7 @@ static int alloc_buffers(struct pw_link *this, d->type = SPA_DATA_MemFd; d->flags = 0; d->fd = m->fd; - d->mapoffset = SPA_PTRDIFF(ddp, m->ptr); + d->mapoffset = SPA_ROUND_UP_N(SPA_PTRDIFF(ddp, m->ptr), data_aligns[i]); d->maxsize = data_sizes[j]; d->data = SPA_MEMBER(m->ptr, d->mapoffset, void); d->chunk->offset = 0; @@ -701,9 +703,10 @@ static int do_allocation(struct pw_link *this, uint32_t in_state, uint32_t out_s struct spa_pod_builder b = SPA_POD_BUILDER_INIT(buffer, sizeof(buffer)); uint32_t i, offset, n_params; uint32_t max_buffers; - size_t minsize = 8192, stride = 0; + size_t minsize = 8192, stride = 0, align; size_t data_sizes[1]; ssize_t data_strides[1]; + size_t data_aligns[1]; n_params = param_filter(this, input, output, SPA_PARAM_Buffers, &b); n_params += param_filter(this, input, output, SPA_PARAM_Meta, &b); @@ -720,25 +723,29 @@ static int do_allocation(struct pw_link *this, uint32_t in_state, uint32_t out_s max_buffers = MAX_BUFFERS; minsize = stride = 0; + align = 8; param = find_param(params, n_params, SPA_TYPE_OBJECT_ParamBuffers); if (param) { uint32_t qmax_buffers = max_buffers, - qminsize = minsize, qstride = stride; + qminsize = minsize, qstride = stride, qalign = align; spa_pod_parse_object(param, SPA_TYPE_OBJECT_ParamBuffers, NULL, SPA_PARAM_BUFFERS_buffers, SPA_POD_Int(&qmax_buffers), SPA_PARAM_BUFFERS_size, SPA_POD_Int(&qminsize), - SPA_PARAM_BUFFERS_stride, SPA_POD_Int(&qstride)); + SPA_PARAM_BUFFERS_stride, SPA_POD_Int(&qstride), + SPA_PARAM_BUFFERS_align, SPA_POD_Int(&qalign)); max_buffers = qmax_buffers == 0 ? max_buffers : SPA_MIN(qmax_buffers, max_buffers); minsize = SPA_MAX(minsize, qminsize); stride = SPA_MAX(stride, qstride); + align = SPA_MAX(align, qalign); - pw_log_debug("%d %d %d -> %zd %zd %d", qminsize, qstride, qmax_buffers, - minsize, stride, max_buffers); + pw_log_debug("%d %d %d %d -> %zd %zd %d %zd", + qminsize, qstride, qmax_buffers, qalign, + minsize, stride, max_buffers, align); } else { pw_log_warn("no buffers param"); minsize = 8192; @@ -754,6 +761,7 @@ static int do_allocation(struct pw_link *this, uint32_t in_state, uint32_t out_s data_sizes[0] = minsize; data_strides[0] = stride; + data_aligns[0] = align; if ((res = alloc_buffers(this, max_buffers, @@ -761,6 +769,7 @@ static int do_allocation(struct pw_link *this, uint32_t in_state, uint32_t out_s params, 1, data_sizes, data_strides, + data_aligns, &allocation)) < 0) { asprintf(&error, "error alloc buffers: %d", res); goto error; diff --git a/src/pipewire/remote.c b/src/pipewire/remote.c index b04c9a5c0..5ed844984 100644 --- a/src/pipewire/remote.c +++ b/src/pipewire/remote.c @@ -1075,7 +1075,7 @@ client_node_port_use_buffers(void *object, struct spa_meta *m = &b->metas[j]; memcpy(m, &buffers[i].buffer->metas[j], sizeof(struct spa_meta)); m->data = SPA_MEMBER(bmem.map.ptr, offset, void); - offset += m->size; + offset += SPA_ROUND_UP_N(m->size, 8); } for (j = 0; j < b->n_datas; j++) {