diff --git a/pipewire-jack b/pipewire-jack
index c404942e9..1cf3e0121 160000
--- a/pipewire-jack
+++ b/pipewire-jack
@@ -1 +1 @@
-Subproject commit c404942e9d15bd3340c57121753fed8d38b247c6
+Subproject commit 1cf3e01219d66f92ea655ddf5c2f4caa9b96bcf7
diff --git a/spa/include/spa/buffer/alloc.h b/spa/include/spa/buffer/alloc.h
index 30a4f57f3..2ffd2d282 100644
--- a/spa/include/spa/buffer/alloc.h
+++ b/spa/include/spa/buffer/alloc.h
@@ -66,7 +66,7 @@ static inline int spa_buffer_alloc_fill_info(struct spa_buffer_alloc_info *info,
         info->skel_size += n_datas * sizeof(struct spa_data);
 
 	for (i = 0, size = 0; i < n_metas; i++)
-		size += metas[i].size;
+		size += SPA_ROUND_UP_N(metas[i].size, 8);	/* count each meta in 8-byte multiples; spa_buffer_alloc_layout steps by the same rounded size */
 	info->meta_size = size;
 
 	if (SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_INLINE_META))
@@ -76,13 +76,18 @@ static inline int spa_buffer_alloc_fill_info(struct spa_buffer_alloc_info *info,
 	if (SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_INLINE_CHUNK))
 	        info->skel_size += info->chunk_size;
 
-	for (i = 0, size = 0; i < n_datas; i++)
+	for (i = 0, size = 0; i < n_datas; i++) {
+		size = SPA_ROUND_UP_N(size, data_aligns[i]);	/* pad the running offset up to this data's alignment */
 		size += datas[i].maxsize;
+	}
 	info->data_size = size;
 
 	if (!SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_NO_DATA) &&
-	    SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_INLINE_DATA))
-		info->skel_size += size;
+	    SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_INLINE_DATA)) {
+		info->skel_size += n_datas ? data_aligns[0] - 1 : 0;	/* worst-case slack for aligning the first data pointer; NOTE(review): later entries rely on the padding folded into data_size */
+		info->skel_size += info->data_size;
+	}
+	info->skel_size = SPA_ROUND_UP_N(info->skel_size, 8);	/* round the total skeleton size up to a multiple of 8 */
 
 	return 0;
 }
@@ -114,7 +119,7 @@ spa_buffer_alloc_layout(struct spa_buffer_alloc_info *info,
 		struct spa_meta *m = &b->metas[i];
 		*m = info->metas[i];
 		m->data = *dp;
-		*dp = SPA_MEMBER(*dp, m->size, void);
+		*dp = SPA_MEMBER(*dp, SPA_ROUND_UP_N(m->size, 8), void);	/* advance by the 8-byte rounded meta size, as counted in spa_buffer_alloc_fill_info */
 	}
 
 	size = info->n_datas * sizeof(struct spa_chunk);
@@ -138,6 +143,7 @@ spa_buffer_alloc_layout(struct spa_buffer_alloc_info *info,
 		*d = info->datas[i];
 		d->chunk = &cp[i];
 		if (!SPA_FLAG_CHECK(info->flags, SPA_BUFFER_ALLOC_FLAG_NO_DATA)) {
+			*dp = SPA_PTR_ALIGN(*dp, info->data_aligns[i], void);	/* round the cursor up to this data's alignment before handing it out */
 			d->data = *dp;
 			*dp = SPA_MEMBER(*dp, d->maxsize, void);
 		}
@@ -173,8 +179,6 @@ spa_buffer_alloc_array(uint32_t n_buffers, uint32_t flags,
 
         spa_buffer_alloc_fill_info(&info, n_metas, metas, n_datas, datas, data_aligns);
 
-	info.skel_size = SPA_ROUND_UP_N(info.skel_size, 16);
-
         buffers = (struct spa_buffer **)calloc(n_buffers, sizeof(struct spa_buffer *) + info.skel_size);
 
         skel = SPA_MEMBER(buffers, sizeof(struct spa_buffer *) * n_buffers, void);
diff --git a/spa/include/spa/utils/defs.h b/spa/include/spa/utils/defs.h
index 401d8fca8..912d06201 100644
--- a/spa/include/spa/utils/defs.h
+++ b/spa/include/spa/utils/defs.h
@@ -147,6 +147,9 @@ struct spa_fraction {
 #define SPA_ROUND_DOWN_N(num,align)	((num) & ~((align) - 1))
 #define SPA_ROUND_UP_N(num,align)	SPA_ROUND_DOWN_N((num) + ((align) - 1),align)
 
+#define SPA_IS_ALIGNED(p,align)		(((intptr_t)(p) & ((align)-1)) == 0)	/* true when p is a multiple of align; align must be a power of two */
+#define SPA_PTR_ALIGN(p,align,type)	((type*)SPA_ROUND_UP_N((intptr_t)(p), (intptr_t)(align)))	/* p rounded up to align, as type*; fully parenthesized so postfix operators apply to the cast result */
+
 #ifndef SPA_LIKELY
 #ifdef __GNUC__
 #define SPA_LIKELY(x) (__builtin_expect(!!(x),1))
diff --git a/spa/plugins/alsa/alsa-sink.c b/spa/plugins/alsa/alsa-sink.c
index 7d2da205e..02fa3da4a 100644
--- a/spa/plugins/alsa/alsa-sink.c
+++ b/spa/plugins/alsa/alsa-sink.c
@@ -487,8 +487,6 @@ static int port_set_format(struct spa_node *node,
 		    info.media_subtype != SPA_MEDIA_SUBTYPE_raw)
 			return -EINVAL;
 
-		spa_debug_pod(0, NULL, format);
-
 		if (spa_format_audio_raw_parse(format, &info.info.raw) < 0)
 			return -EINVAL;
 
diff --git a/spa/plugins/audioconvert/benchmark-fmt-ops.c b/spa/plugins/audioconvert/benchmark-fmt-ops.c
index 1c1cc9ae9..823030328 100644
--- a/spa/plugins/audioconvert/benchmark-fmt-ops.c
+++ b/spa/plugins/audioconvert/benchmark-fmt-ops.c
@@ -31,52 +31,65 @@
 
 #include "fmt-ops.c"
 
-#define N_SAMPLES	4096
-#define N_CHANNELS	5
+#define MAX_SAMPLES	4096
+#define MAX_CHANNELS	11
 
 #define MAX_COUNT 1000
 
-static uint8_t samp_in[N_SAMPLES * N_CHANNELS * 4];
-static uint8_t samp_out[N_SAMPLES * N_CHANNELS * 4];
+static uint8_t samp_in[MAX_SAMPLES * MAX_CHANNELS * 4];
+static uint8_t samp_out[MAX_SAMPLES * MAX_CHANNELS * 4];
 
-static void run_test(const char *name, bool in_packed, bool out_packed, convert_func_t func)
+static const int sample_sizes[] = { 0, 1, 128, 513, 4096 };	/* total sample budgets; run_test() divides each by the channel count */
+static const int channel_counts[] = { 1, 2, 4, 6, 8, 11 };	/* channel counts exercised; the largest equals MAX_CHANNELS */
+
+static void run_test1(const char *name, bool in_packed, bool out_packed, convert_func_t func,
+		int n_channels, int n_samples)	/* times MAX_COUNT calls of func; in_packed/out_packed are not read in this function */
 {
-	const void *ip[N_CHANNELS];
-	void *op[N_CHANNELS];
-	int i, j, ic, oc, ns;
+	int i, j;
+	const void *ip[n_channels];
+	void *op[n_channels];
 	struct timespec ts;
-	uint64_t t1, t2;
-	uint64_t count = 0;
+	uint64_t count, t1, t2;
 
-	for (j = 0; j < N_CHANNELS; j++) {
-		ip[j] = &samp_in[j * N_SAMPLES * 4];
-		op[j] = &samp_out[j * N_SAMPLES * 4];
+	for (j = 0; j < n_channels; j++) {
+		ip[j] = &samp_in[j * n_samples * 4];
+		op[j] = &samp_out[j * n_samples * 4];
 	}
 
-	ic = in_packed ? 1 : N_CHANNELS;
-	oc = out_packed ? 1 : N_CHANNELS;
-	ns = (in_packed && out_packed) ? N_SAMPLES * N_CHANNELS : N_SAMPLES;
-
 	clock_gettime(CLOCK_MONOTONIC, &ts);
 	t1 = SPA_TIMESPEC_TO_NSEC(&ts);
 
+	count = 0;
 	for (i = 0; i < MAX_COUNT; i++) {
-		func(NULL, oc, op, ic, ip, ns);
+		func(NULL, op, ip, n_channels, n_samples);
 		count++;
 	}
-	count *= N_SAMPLES;
 	clock_gettime(CLOCK_MONOTONIC, &ts);
 	t2 = SPA_TIMESPEC_TO_NSEC(&ts);
 
-	fprintf(stderr, "%s: elapsed %"PRIu64" count %"PRIu64" = %"PRIu64"/sec\n", name,
+	fprintf(stderr, "%s: samples %d, channels %d: elapsed %"PRIu64" count %"
+			PRIu64" = %"PRIu64"/sec\n", name, n_samples, n_channels,
 			t2 - t1, count, count * (uint64_t)SPA_NSEC_PER_SEC / (t2 - t1));
 }
 
+static void run_test(const char *name, bool in_packed, bool out_packed, convert_func_t func)	/* runs run_test1() for every sample_sizes x channel_counts pair */
+{
+	size_t i, j;
+
+	for (i = 0; i < SPA_N_ELEMENTS(sample_sizes); i++) {
+		for (j = 0; j < SPA_N_ELEMENTS(channel_counts); j++) {
+			run_test1(name, in_packed, out_packed, func, channel_counts[j],
+				(sample_sizes[i] + (channel_counts[j] -1)) / channel_counts[j]);	/* samples per channel: budget / channels, rounded up */
+		}
+	}
+}
+
 static void test_f32_u8(void)
 {
 	run_test("test_f32_u8", true, true, conv_f32_to_u8);
 	run_test("test_f32d_u8", false, true, conv_f32d_to_u8);
 	run_test("test_f32_u8d", true, false, conv_f32_to_u8d);
+	run_test("test_f32d_u8d", false, false, conv_f32d_to_u8d);
 }
 
 static void test_u8_f32(void)
diff --git a/spa/plugins/audioconvert/channelmix-ops-sse.c b/spa/plugins/audioconvert/channelmix-ops-sse.c
index 749c8ca11..6cb6881c6 100644
--- a/spa/plugins/audioconvert/channelmix-ops-sse.c
+++ b/spa/plugins/audioconvert/channelmix-ops-sse.c
@@ -26,31 +26,34 @@
 
 static void
 channelmix_copy_sse(void *data, int n_dst, void *dst[n_dst],
-	   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+	   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples = n_bytes / sizeof(float), unrolled, remain;
+	int i, n, unrolled;
 	float **d = (float **)dst;
 	float **s = (float **)src;
         __m128 vol = _mm_set1_ps(v);
 
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (i = 0; i < n_dst; i++)
-			memcpy(d[i], s[i], n_bytes);
+			memcpy(d[i], s[i], n_samples * sizeof(float));
 	}
 	else {
 		for (i = 0; i < n_dst; i++) {
 			float *di = d[i], *si = s[i];
 
-			unrolled = n_samples / 4;
-			remain = n_samples & 3;
+			if (SPA_IS_ALIGNED(di, 16) &&
+			    SPA_IS_ALIGNED(si, 16))
+				unrolled = n_samples / 4;	/* both pointers 16-byte aligned: the aligned 4-float load/store loop can run */
+			else
+				unrolled = 0;	/* unaligned: the scalar loop below handles every sample */
 
 			for(n = 0; unrolled--; n += 4)
-				_mm_storeu_ps(&di[n], _mm_mul_ps(_mm_loadu_ps(&si[n]), vol));
-			for(; remain--; n++)
+				_mm_store_ps(&di[n], _mm_mul_ps(_mm_load_ps(&si[n]), vol));
+			for(; n < n_samples; n++)
 				_mm_store_ss(&di[n], _mm_mul_ss(_mm_load_ss(&si[n]), vol));
 		}
 	}
@@ -58,33 +61,40 @@ channelmix_copy_sse(void *data, int n_dst, void *dst[n_dst],
 
 static void
 channelmix_f32_2_4_sse(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples = n_bytes / sizeof(float), unrolled, remain;
+	int i, n, unrolled;
 	float **d = (float **)dst;
 	float **s = (float **)src;
         __m128 vol = _mm_set1_ps(v);
 	__m128 in;
-	float *dFL = d[0], *dFR = d[1], *dRL = d[2], *dRR = d[3];
 	float *sFL = s[0], *sFR = s[1];
+	float *dFL = d[0], *dFR = d[1], *dRL = d[2], *dRR = d[3];
+
+	if (SPA_IS_ALIGNED(sFL, 16) &&
+	    SPA_IS_ALIGNED(sFR, 16) &&
+	    SPA_IS_ALIGNED(dFL, 16) &&
+	    SPA_IS_ALIGNED(dFR, 16) &&
+	    SPA_IS_ALIGNED(dRL, 16) &&
+	    SPA_IS_ALIGNED(dRR, 16))
+		unrolled = n_samples / 4;	/* every buffer 16-byte aligned: run the aligned 4-float loops */
+	else
+		unrolled = 0;	/* any buffer unaligned: the scalar loops process all samples */
 
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
-		unrolled = n_samples / 4;
-		remain = n_samples & 3;
-
 		for(n = 0; unrolled--; n += 4) {
-			in = _mm_loadu_ps(&sFL[n]);
-			_mm_storeu_ps(&dFL[n], in);
-			_mm_storeu_ps(&dRL[n], in);
-			in = _mm_loadu_ps(&sFR[n]);
-			_mm_storeu_ps(&dFR[n], in);
-			_mm_storeu_ps(&dRR[n], in);
+			in = _mm_load_ps(&sFL[n]);
+			_mm_store_ps(&dFL[n], in);
+			_mm_store_ps(&dRL[n], in);
+			in = _mm_load_ps(&sFR[n]);
+			_mm_store_ps(&dFR[n], in);
+			_mm_store_ps(&dRR[n], in);
 		}
-		for(; remain--; n++) {
+		for(; n < n_samples; n++) {
 			in = _mm_load_ss(&sFL[n]);
 			_mm_store_ss(&dFL[n], in);
 			_mm_store_ss(&dRL[n], in);
@@ -94,18 +104,15 @@ channelmix_f32_2_4_sse(void *data, int n_dst, void *dst[n_dst],
 		}
 	}
 	else {
-		unrolled = n_samples / 4;
-		remain = n_samples & 3;
-
 		for(n = 0; unrolled--; n += 4) {
-			in = _mm_mul_ps(_mm_loadu_ps(&sFL[n]), vol);
-			_mm_storeu_ps(&dFL[n], in);
-			_mm_storeu_ps(&dRL[n], in);
-			in = _mm_mul_ps(_mm_loadu_ps(&sFR[n]), vol);
-			_mm_storeu_ps(&dFR[n], in);
-			_mm_storeu_ps(&dRR[n], in);
+			in = _mm_mul_ps(_mm_load_ps(&sFL[n]), vol);
+			_mm_store_ps(&dFL[n], in);
+			_mm_store_ps(&dRL[n], in);
+			in = _mm_mul_ps(_mm_load_ps(&sFR[n]), vol);
+			_mm_store_ps(&dFR[n], in);
+			_mm_store_ps(&dRR[n], in);
 		}
-		for(; remain--; n++) {
+		for(; n < n_samples; n++) {
 			in = _mm_mul_ss(_mm_load_ss(&sFL[n]), vol);
 			_mm_store_ss(&dFL[n], in);
 			_mm_store_ss(&dRL[n], in);
@@ -119,9 +126,9 @@ channelmix_f32_2_4_sse(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR -> FL+FR */
 static void
 channelmix_f32_5p1_2_sse(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int n, n_samples = n_bytes / sizeof(float), unrolled, remain;
+	int n, unrolled;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 	float *m = matrix;
@@ -130,33 +137,41 @@ channelmix_f32_5p1_2_sse(void *data, int n_dst, void *dst[n_dst],
         __m128 slev = _mm_set1_ps(m[4]);
         __m128 vol = _mm_set1_ps(v);
 	__m128 in, ctr;
-	float *dFL = d[0], *dFR = d[1];
 	float *sFL = s[0], *sFR = s[1], *sFC = s[2], *sLFE = s[3], *sSL = s[4], *sSR = s[5];
+	float *dFL = d[0], *dFR = d[1];
+
+	if (SPA_IS_ALIGNED(sFL, 16) &&
+	    SPA_IS_ALIGNED(sFR, 16) &&
+	    SPA_IS_ALIGNED(sFC, 16) &&
+	    SPA_IS_ALIGNED(sLFE, 16) &&
+	    SPA_IS_ALIGNED(sSL, 16) &&
+	    SPA_IS_ALIGNED(sSR, 16) &&
+	    SPA_IS_ALIGNED(dFL, 16) &&
+	    SPA_IS_ALIGNED(dFR, 16))
+		unrolled = n_samples / 4;	/* every buffer 16-byte aligned: run the aligned 4-float loops */
+	else
+		unrolled = 0;	/* any buffer unaligned: the scalar loops process all samples */
 
 	if (v <= VOLUME_MIN) {
-		memset(dFL, 0, n_bytes);
-		memset(dFR, 0, n_bytes);
+		memset(dFL, 0, n_samples * sizeof(float));
+		memset(dFR, 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
-
-		unrolled = n_samples / 4;
-		remain = n_samples & 3;
-
 		for(n = 0; unrolled--; n += 4) {
-			ctr = _mm_mul_ps(_mm_loadu_ps(&sFC[n]), clev);
-			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev));
-			in = _mm_mul_ps(_mm_loadu_ps(&sSL[n]), slev);
+			ctr = _mm_mul_ps(_mm_load_ps(&sFC[n]), clev);
+			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_load_ps(&sLFE[n]), llev));
+			in = _mm_mul_ps(_mm_load_ps(&sSL[n]), slev);
 			in = _mm_add_ps(in, ctr);
-			in = _mm_add_ps(in, _mm_loadu_ps(&sFL[n]));
-			_mm_storeu_ps(&dFL[n], in);
-			in = _mm_mul_ps(_mm_loadu_ps(&sSR[n]), slev);
+			in = _mm_add_ps(in, _mm_load_ps(&sFL[n]));
+			_mm_store_ps(&dFL[n], in);
+			in = _mm_mul_ps(_mm_load_ps(&sSR[n]), slev);
 			in = _mm_add_ps(in, ctr);
-			in = _mm_add_ps(in, _mm_loadu_ps(&sFR[n]));
-			_mm_storeu_ps(&dFR[n], in);
+			in = _mm_add_ps(in, _mm_load_ps(&sFR[n]));
+			_mm_store_ps(&dFR[n], in);
 		}
-		for(; remain--; n++) {
+		for(; n < n_samples; n++) {
 			ctr = _mm_mul_ss(_mm_load_ss(&sFC[n]), clev);
-			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev));
+			ctr = _mm_add_ss(ctr, _mm_mul_ss(_mm_load_ss(&sLFE[n]), llev));	/* scalar ops: read only sample n (the replaced line loaded four floats) */
 			in = _mm_mul_ss(_mm_load_ss(&sSL[n]), slev);
 			in = _mm_add_ss(in, ctr);
 			in = _mm_add_ss(in, _mm_load_ss(&sFL[n]));
@@ -168,26 +183,23 @@ channelmix_f32_5p1_2_sse(void *data, int n_dst, void *dst[n_dst],
 		}
 	}
 	else {
-		unrolled = n_samples / 4;
-		remain = n_samples & 3;
-
 		for(n = 0; unrolled--; n += 4) {
-			ctr = _mm_mul_ps(_mm_loadu_ps(&sFC[n]), clev);
-			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev));
-			in = _mm_mul_ps(_mm_loadu_ps(&sSL[n]), slev);
+			ctr = _mm_mul_ps(_mm_load_ps(&sFC[n]), clev);
+			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_load_ps(&sLFE[n]), llev));
+			in = _mm_mul_ps(_mm_load_ps(&sSL[n]), slev);
 			in = _mm_add_ps(in, ctr);
-			in = _mm_add_ps(in, _mm_loadu_ps(&sFL[n]));
+			in = _mm_add_ps(in, _mm_load_ps(&sFL[n]));
 			in = _mm_mul_ps(in, vol);
-			_mm_storeu_ps(&dFL[n], in);
-			in = _mm_mul_ps(_mm_loadu_ps(&sSR[n]), slev);
+			_mm_store_ps(&dFL[n], in);
+			in = _mm_mul_ps(_mm_load_ps(&sSR[n]), slev);
 			in = _mm_add_ps(in, ctr);
-			in = _mm_add_ps(in, _mm_loadu_ps(&sFR[n]));
+			in = _mm_add_ps(in, _mm_load_ps(&sFR[n]));
 			in = _mm_mul_ps(in, vol);
-			_mm_storeu_ps(&dFR[n], in);
+			_mm_store_ps(&dFR[n], in);
 		}
-		for(; remain--; n++) {
+		for(; n < n_samples; n++) {
 			ctr = _mm_mul_ss(_mm_load_ss(&sFC[n]), clev);
-			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev));
+			ctr = _mm_add_ss(ctr, _mm_mul_ss(_mm_load_ss(&sLFE[n]), llev));	/* scalar ops: read only sample n (the replaced line loaded four floats) */
 			in = _mm_mul_ss(_mm_load_ss(&sSL[n]), slev);
 			in = _mm_add_ss(in, ctr);
 			in = _mm_add_ss(in, _mm_load_ss(&sFL[n]));
@@ -205,58 +217,66 @@ channelmix_f32_5p1_2_sse(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR -> FL+FR+FC+LFE*/
 static void
 channelmix_f32_5p1_3p1_sse(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples = n_bytes / sizeof(float), unrolled, remain;
+	int i, n, unrolled;
 	float **d = (float **) dst;
 	float **s = (float **) src;
         __m128 mix = _mm_set1_ps(v * 0.5f);
         __m128 vol = _mm_set1_ps(v);
 	__m128 avg;
-	float *dFL = d[0], *dFR = d[1], *dFC = d[2], *dLFE = d[3];
 	float *sFL = s[0], *sFR = s[1], *sFC = s[2], *sLFE = s[3], *sSL = s[4], *sSR = s[5];
+	float *dFL = d[0], *dFR = d[1], *dFC = d[2], *dLFE = d[3];
+
+	if (SPA_IS_ALIGNED(sFL, 16) &&
+	    SPA_IS_ALIGNED(sFR, 16) &&
+	    SPA_IS_ALIGNED(sFC, 16) &&
+	    SPA_IS_ALIGNED(sLFE, 16) &&
+	    SPA_IS_ALIGNED(sSL, 16) &&
+	    SPA_IS_ALIGNED(sSR, 16) &&
+	    SPA_IS_ALIGNED(dFL, 16) &&
+	    SPA_IS_ALIGNED(dFR, 16) &&
+	    SPA_IS_ALIGNED(dFC, 16) &&
+	    SPA_IS_ALIGNED(dLFE, 16))
+		unrolled = n_samples / 4;	/* every buffer 16-byte aligned: run the aligned 4-float loops */
+	else
+		unrolled = 0;	/* any buffer unaligned: the scalar loops process all samples */
 
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
-		unrolled = n_samples / 4;
-		remain = n_samples & 3;
-
 		for(n = 0; unrolled--; n += 4) {
-			avg = _mm_add_ps(_mm_loadu_ps(&sFL[n]), _mm_loadu_ps(&sSL[n]));
-			_mm_storeu_ps(&dFL[n], _mm_mul_ps(avg, mix));
-			avg = _mm_add_ps(_mm_loadu_ps(&sFR[n]), _mm_loadu_ps(&sSR[n]));
-			_mm_storeu_ps(&dFR[n], _mm_mul_ps(avg, mix));
-			_mm_storeu_ps(&dFC[n], _mm_loadu_ps(&sFC[n]));
-			_mm_storeu_ps(&dLFE[n], _mm_loadu_ps(&sLFE[n]));
+			avg = _mm_add_ps(_mm_load_ps(&sFL[n]), _mm_load_ps(&sSL[n]));
+			_mm_store_ps(&dFL[n], _mm_mul_ps(avg, mix));
+			avg = _mm_add_ps(_mm_load_ps(&sFR[n]), _mm_load_ps(&sSR[n]));
+			_mm_store_ps(&dFR[n], _mm_mul_ps(avg, mix));
+			_mm_store_ps(&dFC[n], _mm_load_ps(&sFC[n]));
+			_mm_store_ps(&dLFE[n], _mm_load_ps(&sLFE[n]));
 		}
-		for(; remain--; n++) {
+		for(; n < n_samples; n++) {
 			avg = _mm_add_ss(_mm_load_ss(&sFL[n]), _mm_load_ss(&sSL[n]));
 			_mm_store_ss(&dFL[n], _mm_mul_ss(avg, mix));
-			avg = _mm_add_ps(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n]));
+			avg = _mm_add_ss(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n]));	/* scalar add, consistent with the FL line; only the low lane is stored */
 			_mm_store_ss(&dFR[n], _mm_mul_ss(avg, mix));
 			_mm_store_ss(&dFC[n], _mm_load_ss(&sFC[n]));
 			_mm_store_ss(&dLFE[n], _mm_load_ss(&sLFE[n]));
 		}
 	}
 	else {
-		unrolled = n_samples / 4;
-		remain = n_samples & 3;
-
 		for(n = 0; unrolled--; n += 4) {
-			avg = _mm_add_ps(_mm_loadu_ps(&sFL[n]), _mm_loadu_ps(&sSL[n]));
-			_mm_storeu_ps(&dFL[n], _mm_mul_ps(avg, mix));
-			avg = _mm_add_ps(_mm_loadu_ps(&sFR[n]), _mm_loadu_ps(&sSR[n]));
-			_mm_storeu_ps(&dFR[n], _mm_mul_ps(avg, mix));
-			_mm_storeu_ps(&dFC[n], _mm_mul_ps(_mm_loadu_ps(&sFC[n]), vol));
-			_mm_storeu_ps(&dLFE[n], _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), vol));
+			avg = _mm_add_ps(_mm_load_ps(&sFL[n]), _mm_load_ps(&sSL[n]));
+			_mm_store_ps(&dFL[n], _mm_mul_ps(avg, mix));
+			avg = _mm_add_ps(_mm_load_ps(&sFR[n]), _mm_load_ps(&sSR[n]));
+			_mm_store_ps(&dFR[n], _mm_mul_ps(avg, mix));
+			_mm_store_ps(&dFC[n], _mm_mul_ps(_mm_load_ps(&sFC[n]), vol));
+			_mm_store_ps(&dLFE[n], _mm_mul_ps(_mm_load_ps(&sLFE[n]), vol));
 		}
-		for(; remain--; n++) {
+		for(; n < n_samples; n++) {
 			avg = _mm_add_ss(_mm_load_ss(&sFL[n]), _mm_load_ss(&sSL[n]));
 			_mm_store_ss(&dFL[n], _mm_mul_ss(avg, mix));
-			avg = _mm_add_ps(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n]));
+			avg = _mm_add_ss(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n]));	/* scalar add, consistent with the FL line; only the low lane is stored */
 			_mm_store_ss(&dFR[n], _mm_mul_ss(avg, mix));
 			_mm_store_ss(&dFC[n], _mm_mul_ss(_mm_load_ss(&sFC[n]), vol));
 			_mm_store_ss(&dLFE[n], _mm_mul_ss(_mm_load_ss(&sLFE[n]), vol));
@@ -267,9 +287,9 @@ channelmix_f32_5p1_3p1_sse(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR -> FL+FR+RL+RR*/
 static void
 channelmix_f32_5p1_4_sse(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples = n_bytes / sizeof(float), unrolled, remain;
+	int i, n, unrolled;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 	float *m = matrix;
@@ -277,28 +297,39 @@ channelmix_f32_5p1_4_sse(void *data, int n_dst, void *dst[n_dst],
         __m128 llev = _mm_set1_ps(m[3]);
         __m128 vol = _mm_set1_ps(v);
 	__m128 ctr;
-	float *dFL = d[0], *dFR = d[1], *dRL = d[2], *dRR = d[3];
 	float *sFL = s[0], *sFR = s[1], *sFC = s[2], *sLFE = s[3], *sSL = s[4], *sSR = s[5];
+	float *dFL = d[0], *dFR = d[1], *dRL = d[2], *dRR = d[3];
+
+	if (SPA_IS_ALIGNED(sFL, 16) &&
+	    SPA_IS_ALIGNED(sFR, 16) &&
+	    SPA_IS_ALIGNED(sFC, 16) &&
+	    SPA_IS_ALIGNED(sLFE, 16) &&
+	    SPA_IS_ALIGNED(sSL, 16) &&
+	    SPA_IS_ALIGNED(sSR, 16) &&
+	    SPA_IS_ALIGNED(dFL, 16) &&
+	    SPA_IS_ALIGNED(dFR, 16) &&
+	    SPA_IS_ALIGNED(dRL, 16) &&
+	    SPA_IS_ALIGNED(dRR, 16))
+		unrolled = n_samples / 4;	/* every buffer 16-byte aligned: run the aligned 4-float loops */
+	else
+		unrolled = 0;	/* any buffer unaligned: the scalar loops process all samples */
 
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
-		unrolled = n_samples / 4;
-		remain = n_samples & 3;
-
 		for(n = 0; unrolled--; n += 4) {
-			ctr = _mm_mul_ps(_mm_loadu_ps(&sFC[n]), clev);
-			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev));
-			_mm_storeu_ps(&dFL[n], _mm_add_ps(_mm_loadu_ps(&sFL[n]), ctr));
-			_mm_storeu_ps(&dFR[n], _mm_add_ps(_mm_loadu_ps(&sFR[n]), ctr));
-			_mm_storeu_ps(&dRL[n], _mm_loadu_ps(&sSL[n]));
-			_mm_storeu_ps(&dRR[n], _mm_loadu_ps(&sSR[n]));
+			ctr = _mm_mul_ps(_mm_load_ps(&sFC[n]), clev);
+			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_load_ps(&sLFE[n]), llev));
+			_mm_store_ps(&dFL[n], _mm_add_ps(_mm_load_ps(&sFL[n]), ctr));
+			_mm_store_ps(&dFR[n], _mm_add_ps(_mm_load_ps(&sFR[n]), ctr));
+			_mm_store_ps(&dRL[n], _mm_load_ps(&sSL[n]));
+			_mm_store_ps(&dRR[n], _mm_load_ps(&sSR[n]));
 		}
-		for(; remain--; n++) {
+		for(; n < n_samples; n++) {
 			ctr = _mm_mul_ss(_mm_load_ss(&sFC[n]), clev);
-			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev));
+			ctr = _mm_add_ss(ctr, _mm_mul_ss(_mm_load_ss(&sLFE[n]), llev));	/* scalar ops: read only sample n (the replaced line loaded four floats) */
 			_mm_store_ss(&dFL[n], _mm_add_ss(_mm_load_ss(&sFL[n]), ctr));
 			_mm_store_ss(&dFR[n], _mm_add_ss(_mm_load_ss(&sFR[n]), ctr));
 			_mm_store_ss(&dRL[n], _mm_load_ss(&sSL[n]));
@@ -306,20 +337,17 @@ channelmix_f32_5p1_4_sse(void *data, int n_dst, void *dst[n_dst],
 		}
 	}
 	else {
-		unrolled = n_samples / 4;
-		remain = n_samples & 3;
-
 		for(n = 0; unrolled--; n += 4) {
-			ctr = _mm_mul_ps(_mm_loadu_ps(&sFC[n]), clev);
-			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev));
-			_mm_storeu_ps(&dFL[n], _mm_mul_ps(_mm_add_ps(_mm_loadu_ps(&sFL[n]), ctr), vol));
-			_mm_storeu_ps(&dFR[n], _mm_mul_ps(_mm_add_ps(_mm_loadu_ps(&sFR[n]), ctr), vol));
-			_mm_storeu_ps(&dRL[n], _mm_mul_ps(_mm_loadu_ps(&sSL[n]), vol));
-			_mm_storeu_ps(&dRR[n], _mm_mul_ps(_mm_loadu_ps(&sSR[n]), vol));
+			ctr = _mm_mul_ps(_mm_load_ps(&sFC[n]), clev);
+			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_load_ps(&sLFE[n]), llev));
+			_mm_store_ps(&dFL[n], _mm_mul_ps(_mm_add_ps(_mm_load_ps(&sFL[n]), ctr), vol));
+			_mm_store_ps(&dFR[n], _mm_mul_ps(_mm_add_ps(_mm_load_ps(&sFR[n]), ctr), vol));
+			_mm_store_ps(&dRL[n], _mm_mul_ps(_mm_load_ps(&sSL[n]), vol));
+			_mm_store_ps(&dRR[n], _mm_mul_ps(_mm_load_ps(&sSR[n]), vol));
 		}
-		for(; remain--; n++) {
+		for(; n < n_samples; n++) {
 			ctr = _mm_mul_ss(_mm_load_ss(&sFC[n]), clev);
-			ctr = _mm_add_ps(ctr, _mm_mul_ps(_mm_loadu_ps(&sLFE[n]), llev));
+			ctr = _mm_add_ss(ctr, _mm_mul_ss(_mm_load_ss(&sLFE[n]), llev));	/* scalar ops: read only sample n (the replaced line loaded four floats) */
 			_mm_store_ss(&dFL[n], _mm_mul_ss(_mm_add_ss(_mm_load_ss(&sFL[n]), ctr), vol));
 			_mm_store_ss(&dFR[n], _mm_mul_ss(_mm_add_ss(_mm_load_ss(&sFR[n]), ctr), vol));
 			_mm_store_ss(&dRL[n], _mm_mul_ss(_mm_load_ss(&sSL[n]), vol));
diff --git a/spa/plugins/audioconvert/channelmix-ops.c b/spa/plugins/audioconvert/channelmix-ops.c
index 23d69c6d4..c326a4814 100644
--- a/spa/plugins/audioconvert/channelmix-ops.c
+++ b/spa/plugins/audioconvert/channelmix-ops.c
@@ -37,19 +37,19 @@
 
 static void
 channelmix_copy(void *data, int n_dst, void *dst[n_dst],
-	   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+	   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples = n_bytes / sizeof(float);
+	int i, n;
 	float **d = (float **)dst;
 	float **s = (float **)src;
 
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (i = 0; i < n_dst; i++)
-			memcpy(d[i], s[i], n_bytes);
+			memcpy(d[i], s[i], n_samples * sizeof(float));
 	}
 	else {
 		for (i = 0; i < n_dst; i++)
@@ -62,9 +62,9 @@ channelmix_copy(void *data, int n_dst, void *dst[n_dst],
 
 static void
 channelmix_f32_n_m(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, j, n, n_samples = n_bytes / sizeof(float);
+	int i, j, n;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 	float *m = matrix;
@@ -84,15 +84,15 @@ channelmix_f32_n_m(void *data, int n_dst, void *dst[n_dst],
 
 static void
 channelmix_f32_1_2(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int n, n_samples = n_bytes / sizeof(float);
+	int n;
 	float **d = (float **)dst;
 	float **s = (float **)src;
 
 	if (v <= VOLUME_MIN) {
-		memset(d[0], 0, n_bytes);
-		memset(d[1], 0, n_bytes);
+		memset(d[0], 0, n_samples * sizeof(float));
+		memset(d[1], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (n = 0; n < n_samples; n++)
@@ -106,14 +106,14 @@ channelmix_f32_1_2(void *data, int n_dst, void *dst[n_dst],
 
 static void
 channelmix_f32_2_1(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int n, n_samples = n_bytes / sizeof(float);
+	int n;
 	float **d = (float **)dst;
 	float **s = (float **)src;
 
 	if (v <= VOLUME_MIN) {
-		memset(d[0], 0, n_bytes);
+		memset(d[0], 0, n_samples * sizeof(float));
 	}
 	else {
 		const float f = v * 0.5f;
@@ -124,14 +124,14 @@ channelmix_f32_2_1(void *data, int n_dst, void *dst[n_dst],
 
 static void
 channelmix_f32_4_1(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int n, n_samples = n_bytes / sizeof(float);
+	int n;
 	float **d = (float **)dst;
 	float **s = (float **)src;
 
 	if (v <= VOLUME_MIN) {
-		memset(d[0], 0, n_bytes);
+		memset(d[0], 0, n_samples * sizeof(float));
 	}
 	else {
 		const float f = v * 0.25f;
@@ -142,14 +142,14 @@ channelmix_f32_4_1(void *data, int n_dst, void *dst[n_dst],
 
 static void
 channelmix_f32_3p1_1(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int n, n_samples = n_bytes / sizeof(float);
+	int n;
 	float **d = (float **)dst;
 	float **s = (float **)src;
 
 	if (v <= VOLUME_MIN) {
-		memset(d[0], 0, n_bytes);
+		memset(d[0], 0, n_samples * sizeof(float));
 	}
 	else {
 		const float f = v * 0.5f;
@@ -163,15 +163,15 @@ channelmix_f32_3p1_1(void *data, int n_dst, void *dst[n_dst],
 
 static void
 channelmix_f32_2_4(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples = n_bytes / sizeof(float);
+	int i, n;
 	float **d = (float **)dst;
 	float **s = (float **)src;
 
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (n = 0; n < n_samples; n++) {
@@ -190,15 +190,15 @@ channelmix_f32_2_4(void *data, int n_dst, void *dst[n_dst],
 #define MASK_3_1	_M(FL)|_M(FR)|_M(FC)|_M(LFE)
 static void
 channelmix_f32_2_3p1(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples = n_bytes / sizeof(float);
+	int i, n;
 	float **d = (float **)dst;
 	float **s = (float **)src;
 
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (n = 0; n < n_samples; n++) {
@@ -222,15 +222,15 @@ channelmix_f32_2_3p1(void *data, int n_dst, void *dst[n_dst],
 #define MASK_5_1	_M(FL)|_M(FR)|_M(FC)|_M(LFE)|_M(SL)|_M(SR)|_M(RL)|_M(RR)
 static void
 channelmix_f32_2_5p1(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples = n_bytes / sizeof(float);
+	int i, n;
 	float **d = (float **)dst;
 	float **s = (float **)src;
 
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (n = 0; n < n_samples; n++) {
@@ -254,9 +254,9 @@ channelmix_f32_2_5p1(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR -> FL+FR */
 static void
 channelmix_f32_5p1_2(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int n, n_samples = n_bytes / sizeof(float);
+	int n;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 	float *m = matrix;
@@ -265,8 +265,8 @@ channelmix_f32_5p1_2(void *data, int n_dst, void *dst[n_dst],
 	const float slev = m[4];
 
 	if (v <= VOLUME_MIN) {
-		memset(d[0], 0, n_bytes);
-		memset(d[1], 0, n_bytes);
+		memset(d[0], 0, n_samples * sizeof(float));
+		memset(d[1], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (n = 0; n < n_samples; n++) {
@@ -287,16 +287,15 @@ channelmix_f32_5p1_2(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR -> FL+FR+FC+LFE*/
 static void
 channelmix_f32_5p1_3p1(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples;
+	int i, n;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 
-	n_samples = n_bytes / sizeof(float);
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else {
 		const float f1 = 0.5f * v;
@@ -312,19 +311,18 @@ channelmix_f32_5p1_3p1(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR -> FL+FR+RL+RR*/
 static void
 channelmix_f32_5p1_4(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples;
+	int i, n;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 	float *m = matrix;
 	const float clev = m[2];
 	const float llev = m[3];
 
-	n_samples = n_bytes / sizeof(float);
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (n = 0; n < n_samples; n++) {
@@ -351,9 +349,9 @@ channelmix_f32_5p1_4(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR+RL+RR -> FL+FR */
 static void
 channelmix_f32_7p1_2(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int n, n_samples = n_bytes / sizeof(float);
+	int n;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 	float *m = matrix;
@@ -362,8 +360,8 @@ channelmix_f32_7p1_2(void *data, int n_dst, void *dst[n_dst],
 	const float slev = m[4];
 
 	if (v <= VOLUME_MIN) {
-		memset(d[0], 0, n_bytes);
-		memset(d[1], 0, n_bytes);
+		memset(d[0], 0, n_samples * sizeof(float));
+		memset(d[1], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (n = 0; n < n_samples; n++) {
@@ -384,16 +382,15 @@ channelmix_f32_7p1_2(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR+RL+RR -> FL+FR+FC+LFE*/
 static void
 channelmix_f32_7p1_3p1(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples;
+	int i, n;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 
-	n_samples = n_bytes / sizeof(float);
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else {
 		const float f1 = 0.5 * v;
@@ -409,9 +406,9 @@ channelmix_f32_7p1_3p1(void *data, int n_dst, void *dst[n_dst],
 /* FL+FR+FC+LFE+SL+SR+RL+RR -> FL+FR+RL+RR*/
 static void
 channelmix_f32_7p1_4(void *data, int n_dst, void *dst[n_dst],
-		   int n_src, const void *src[n_src], void *matrix, float v, int n_bytes)
+		   int n_src, const void *src[n_src], void *matrix, float v, int n_samples)
 {
-	int i, n, n_samples;
+	int i, n;
 	float **d = (float **) dst;
 	float **s = (float **) src;
 	float *m = matrix;
@@ -419,10 +416,9 @@ channelmix_f32_7p1_4(void *data, int n_dst, void *dst[n_dst],
 	const float llev = m[3];
 	const float slev = m[4];
 
-	n_samples = n_bytes / sizeof(float);
 	if (v <= VOLUME_MIN) {
 		for (i = 0; i < n_dst; i++)
-			memset(d[i], 0, n_bytes);
+			memset(d[i], 0, n_samples * sizeof(float));
 	}
 	else if (v == VOLUME_NORM) {
 		for (n = 0; n < n_samples; n++) {
@@ -450,7 +446,7 @@ channelmix_f32_7p1_4(void *data, int n_dst, void *dst[n_dst],
 
 typedef void (*channelmix_func_t) (void *data, int n_dst, void *dst[n_dst],
 				   int n_src, const void *src[n_src],
-				   void *matrix, float v, int n_bytes);
+				   void *matrix, float v, int n_samples);
 
 
 #define ANY	((uint32_t)-1)
diff --git a/spa/plugins/audioconvert/channelmix.c b/spa/plugins/audioconvert/channelmix.c
index 6b885c27f..c01db3c8a 100644
--- a/spa/plugins/audioconvert/channelmix.c
+++ b/spa/plugins/audioconvert/channelmix.c
@@ -1132,27 +1132,26 @@ static int impl_node_process(struct spa_node *node)
 	sbuf = &inport->buffers[inio->buffer_id];
 
 	{
-		uint32_t i, n_bytes;
+		uint32_t i, n_samples;
 		struct spa_buffer *sb = sbuf->outbuf, *db = dbuf->outbuf;
 		uint32_t n_src_datas = sb->n_datas;
 		uint32_t n_dst_datas = db->n_datas;
 		const void *src_datas[n_src_datas];
 		void *dst_datas[n_dst_datas];
 
-		n_bytes = sb->datas[0].chunk->size;
+		n_samples = sb->datas[0].chunk->size / inport->stride;
 
 		for (i = 0; i < n_src_datas; i++)
 			src_datas[i] = sb->datas[i].data;
 		for (i = 0; i < n_dst_datas; i++) {
 			dst_datas[i] = db->datas[i].data;
-			db->datas[i].chunk->size =
-				(n_bytes / inport->stride) * outport->stride;
+			db->datas[i].chunk->size = n_samples * outport->stride;
 		}
 
 		this->convert(this, n_dst_datas, dst_datas,
 				    n_src_datas, src_datas,
 				    this->matrix, this->props.mute ? 0.0f : this->props.volume,
-				    n_bytes);
+				    n_samples);
 	}
 
 	outio->status = SPA_STATUS_HAVE_BUFFER;
diff --git a/spa/plugins/audioconvert/fmt-ops-sse2.c b/spa/plugins/audioconvert/fmt-ops-sse2.c
index 6428757b7..ebca10467 100644
--- a/spa/plugins/audioconvert/fmt-ops-sse2.c
+++ b/spa/plugins/audioconvert/fmt-ops-sse2.c
@@ -30,142 +30,148 @@
 #include <emmintrin.h>
 
 static void
-conv_s16_to_f32d_1_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples)
+conv_s16_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples)
 {
 	const int16_t *s = src;
 	float **d = (float **) dst;
 	float *d0 = d[0];
-	int n = 0, unrolled;
+	int n, unrolled;
 	__m128i in;
 	__m128 out, factor = _mm_set1_ps(1.0f / S16_SCALE);
 
-	unrolled = n_samples / 4;
-	n_samples = n_samples & 3;
+	if (SPA_IS_ALIGNED(d0, 16))
+		unrolled = n_samples / 4;
+	else
+		unrolled = 0;
 
-	for(; unrolled--; n += 4) {
-		in = _mm_insert_epi16(in, s[0*n_dst], 1);
-		in = _mm_insert_epi16(in, s[1*n_dst], 3);
-		in = _mm_insert_epi16(in, s[2*n_dst], 5);
-		in = _mm_insert_epi16(in, s[3*n_dst], 7);
+	for(n = 0; unrolled--; n += 4) {
+		in = _mm_insert_epi16(in, s[0*n_channels], 1);
+		in = _mm_insert_epi16(in, s[1*n_channels], 3);
+		in = _mm_insert_epi16(in, s[2*n_channels], 5);
+		in = _mm_insert_epi16(in, s[3*n_channels], 7);
 		in = _mm_srai_epi32(in, 16);
 		out = _mm_cvtepi32_ps(in);
 		out = _mm_mul_ps(out, factor);
-		_mm_storeu_ps(&d0[n], out);
-		s += 4*n_dst;
+		_mm_store_ps(&d0[n], out);
+		s += 4*n_channels;
 	}
-	for(; n_samples--; n++) {
+	for(; n < n_samples; n++) {
 		out = _mm_cvtsi32_ss(out, s[0]);
 		out = _mm_mul_ss(out, factor);
 		_mm_store_ss(&d0[n], out);
-		s += n_dst;
+		s += n_channels;
 	}
 }
 
 static void
-conv_s16_to_f32d_2_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples)
+conv_s16_to_f32d_2_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples)
 {
 	const int16_t *s = src;
 	float **d = (float **) dst;
 	float *d0 = d[0], *d1 = d[1];
-	int n = 0, unrolled;
+	int n, unrolled;
 	__m128i in, t[2];
 	__m128 out[2], factor = _mm_set1_ps(1.0f / S16_SCALE);
 
-	if (n_dst == 2) {
+	if (n_channels == 2 &&
+	    SPA_IS_ALIGNED(s, 16) &&
+	    SPA_IS_ALIGNED(d0, 16) &&
+	    SPA_IS_ALIGNED(d1, 16))
 		unrolled = n_samples / 4;
-		n_samples = n_samples & 3;
+	else
+		unrolled = 0;
 
-		for(; unrolled--; n += 4) {
-			in = _mm_loadu_si128((__m128i*)s);
+	for(n = 0; unrolled--; n += 4) {
+		in = _mm_load_si128((__m128i*)s);
 
-			t[0] = _mm_slli_epi32(in, 16);
-			t[0] = _mm_srai_epi32(t[0], 16);
-			t[1] = _mm_srai_epi32(in, 16);
+		t[0] = _mm_slli_epi32(in, 16);
+		t[0] = _mm_srai_epi32(t[0], 16);
+		t[1] = _mm_srai_epi32(in, 16);
 
-			out[0] = _mm_cvtepi32_ps(t[0]);
-			out[0] = _mm_mul_ps(out[0], factor);
-			out[1] = _mm_cvtepi32_ps(t[1]);
-			out[1] = _mm_mul_ps(out[1], factor);
+		out[0] = _mm_cvtepi32_ps(t[0]);
+		out[0] = _mm_mul_ps(out[0], factor);
+		out[1] = _mm_cvtepi32_ps(t[1]);
+		out[1] = _mm_mul_ps(out[1], factor);
 
-			_mm_storeu_ps(&d0[n], out[0]);
-			_mm_storeu_ps(&d1[n], out[1]);
+		_mm_store_ps(&d0[n], out[0]);
+		_mm_store_ps(&d1[n], out[1]);
 
-			s += 4*n_dst;
-		}
+		s += 4*n_channels;
 	}
-	for(; n_samples--; n++) {
+	for(; n < n_samples; n++) {
 		out[0] = _mm_cvtsi32_ss(out[0], s[0]);
 		out[0] = _mm_mul_ss(out[0], factor);
 		out[1] = _mm_cvtsi32_ss(out[1], s[1]);
 		out[1] = _mm_mul_ss(out[1], factor);
 		_mm_store_ss(&d0[n], out[0]);
 		_mm_store_ss(&d1[n], out[1]);
-		s += n_dst;
+		s += n_channels;
 	}
 }
 
 static void
-conv_s16_to_f32d_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s16_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int16_t *s = src[0];
 	int i = 0;
 
-	for(; i + 1 < n_dst; i += 2)
-		conv_s16_to_f32d_2_sse2(data, n_dst, &dst[i], &s[i], n_samples);
-	for(; i < n_dst; i++)
-		conv_s16_to_f32d_1_sse2(data, n_dst, &dst[i], &s[i], n_samples);
+	for(; i + 1 < n_channels; i += 2)
+		conv_s16_to_f32d_2_sse2(data, &dst[i], &s[i], n_channels, n_samples);
+	for(; i < n_channels; i++)
+		conv_s16_to_f32d_1_sse2(data, &dst[i], &s[i], n_channels, n_samples);
 }
 
 static void
-conv_s24_to_f32d_1_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples)
+conv_s24_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples)
 {
 	const uint8_t *s = src;
 	float **d = (float **) dst;
 	float *d0 = d[0];
-	int n = 0, unrolled;
+	int n, unrolled;
 	__m128i in;
 	__m128 out, factor = _mm_set1_ps(1.0f / S24_SCALE);
 
-	unrolled = n_samples / 4;
-	n_samples = n_samples & 3;
-	if (n_samples == 0) {
-		n_samples += 4;
-		unrolled--;
+	if (SPA_IS_ALIGNED(d0, 16) && n_samples > 4) {
+		unrolled = n_samples / 4;
+		if ((n_samples & 3) == 0)
+			unrolled--;
 	}
+	else
+		unrolled = 0;
 
-	for(; unrolled--; n += 4) {
+	for(n = 0; unrolled--; n += 4) {
 		in = _mm_setr_epi32(
-			*((uint32_t*)&s[0 * n_dst]),
-			*((uint32_t*)&s[3 * n_dst]),
-			*((uint32_t*)&s[6 * n_dst]),
-			*((uint32_t*)&s[9 * n_dst]));
+			*((uint32_t*)&s[0 * n_channels]),
+			*((uint32_t*)&s[3 * n_channels]),
+			*((uint32_t*)&s[6 * n_channels]),
+			*((uint32_t*)&s[9 * n_channels]));
 		in = _mm_slli_epi32(in, 8);
 		in = _mm_srai_epi32(in, 8);
 		out = _mm_cvtepi32_ps(in);
 		out = _mm_mul_ps(out, factor);
-		_mm_storeu_ps(&d0[n], out);
-		s += 12 * n_dst;
+		_mm_store_ps(&d0[n], out);
+		s += 12 * n_channels;
 	}
-	for(; n_samples--; n++) {
+	for(; n < n_samples; n++) {
 		out = _mm_cvtsi32_ss(out, read_s24(s));
 		out = _mm_mul_ss(out, factor);
 		_mm_store_ss(&d0[n], out);
-		s += 3 * n_dst;
+		s += 3 * n_channels;
 	}
 }
 
 static void
-conv_s24_to_f32d_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s24_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int8_t *s = src[0];
 	int i = 0;
 
-	for(; i < n_dst; i++)
-		conv_s24_to_f32d_1_sse2(data, n_dst, &dst[i], &s[3*i], n_samples);
+	for(; i < n_channels; i++)
+		conv_s24_to_f32d_1_sse2(data, &dst[i], &s[3*i], n_channels, n_samples);
 }
 
 static void
-conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32_1_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	const float *s0 = s[0];
@@ -176,11 +182,13 @@ conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 	__m128 int_max = _mm_set1_ps(S24_MAX_F);
         __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
 
-	unrolled = n_samples / 4;
-	n_samples = n_samples & 3;
+	if (SPA_IS_ALIGNED(s0, 16))
+		unrolled = n_samples / 4;
+	else
+		unrolled = 0;
 
 	for(n = 0; unrolled--; n += 4) {
-		in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
+		in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
 		in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
 
 		out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8);
@@ -188,23 +196,23 @@ conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 		out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2));
 		out[3] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(2, 1, 0, 3));
 
-		d[0*n_src] = _mm_cvtsi128_si32(out[0]);
-		d[1*n_src] = _mm_cvtsi128_si32(out[1]);
-		d[2*n_src] = _mm_cvtsi128_si32(out[2]);
-		d[3*n_src] = _mm_cvtsi128_si32(out[3]);
-		d += 4*n_src;
+		d[0*n_channels] = _mm_cvtsi128_si32(out[0]);
+		d[1*n_channels] = _mm_cvtsi128_si32(out[1]);
+		d[2*n_channels] = _mm_cvtsi128_si32(out[2]);
+		d[3*n_channels] = _mm_cvtsi128_si32(out[3]);
+		d += 4*n_channels;
 	}
-	for(; n_samples--; n++) {
+	for(; n < n_samples; n++) {
 		in[0] = _mm_load_ss(&s0[n]);
 		in[0] = _mm_mul_ss(in[0], int_max);
 		in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min));
 		*d = _mm_cvtss_si32(in[0]) << 8;
-		d += n_src;
+		d += n_channels;
 	}
 }
 
 static void
-conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32_2_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	const float *s0 = s[0], *s1 = s[1];
@@ -215,12 +223,15 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 	__m128 int_max = _mm_set1_ps(S24_MAX_F);
         __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
 
-	unrolled = n_samples / 4;
-	n_samples = n_samples & 3;
+	if (SPA_IS_ALIGNED(s0, 16) &&
+	    SPA_IS_ALIGNED(s1, 16))
+		unrolled = n_samples / 4;
+	else
+		unrolled = 0;
 
 	for(n = 0; unrolled--; n += 4) {
-		in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
-		in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max);
+		in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
+		in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max);
 
 		in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
 		in[1] = _mm_min_ps(int_max, _mm_max_ps(in[1], int_min));
@@ -233,13 +244,13 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 		t[2] = _mm_unpackhi_epi32(out[0], out[1]);
 		t[3] = _mm_shuffle_epi32(t[2], _MM_SHUFFLE(0, 0, 2, 2));
 
-		_mm_storel_epi64((__m128i*)(d + 0*n_src), t[0]);
-		_mm_storel_epi64((__m128i*)(d + 1*n_src), t[1]);
-		_mm_storel_epi64((__m128i*)(d + 2*n_src), t[2]);
-		_mm_storel_epi64((__m128i*)(d + 3*n_src), t[3]);
-		d += 4*n_src;
+		_mm_storel_epi64((__m128i*)(d + 0*n_channels), t[0]);
+		_mm_storel_epi64((__m128i*)(d + 1*n_channels), t[1]);
+		_mm_storel_epi64((__m128i*)(d + 2*n_channels), t[2]);
+		_mm_storel_epi64((__m128i*)(d + 3*n_channels), t[3]);
+		d += 4*n_channels;
 	}
-	for(; n_samples--; n++) {
+	for(; n < n_samples; n++) {
 		in[0] = _mm_load_ss(&s0[n]);
 		in[1] = _mm_load_ss(&s1[n]);
 
@@ -249,12 +260,12 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 		in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
 		out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8);
 		_mm_storel_epi64((__m128i*)d, out[0]);
-		d += n_src;
+		d += n_channels;
 	}
 }
 
 static void
-conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32_4_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	const float *s0 = s[0], *s1 = s[1], *s2 = s[2], *s3 = s[3];
@@ -265,14 +276,19 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 	__m128 int_max = _mm_set1_ps(S24_MAX_F);
         __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
 
-	unrolled = n_samples / 4;
-	n_samples = n_samples & 3;
+	if (SPA_IS_ALIGNED(s0, 16) &&
+	    SPA_IS_ALIGNED(s1, 16) &&
+	    SPA_IS_ALIGNED(s2, 16) &&
+	    SPA_IS_ALIGNED(s3, 16))
+		unrolled = n_samples / 4;
+	else
+		unrolled = 0;
 
 	for(n = 0; unrolled--; n += 4) {
-		in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
-		in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max);
-		in[2] = _mm_mul_ps(_mm_loadu_ps(&s2[n]), int_max);
-		in[3] = _mm_mul_ps(_mm_loadu_ps(&s3[n]), int_max);
+		in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
+		in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max);
+		in[2] = _mm_mul_ps(_mm_load_ps(&s2[n]), int_max);
+		in[3] = _mm_mul_ps(_mm_load_ps(&s3[n]), int_max);
 
 		in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
 		in[1] = _mm_min_ps(int_max, _mm_max_ps(in[1], int_min));
@@ -294,13 +310,13 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 		out[2] = _mm_unpacklo_epi64(t[2], t[3]);
 		out[3] = _mm_unpackhi_epi64(t[2], t[3]);
 
-		_mm_storeu_si128((__m128i*)(d + 0*n_src), out[0]);
-		_mm_storeu_si128((__m128i*)(d + 1*n_src), out[1]);
-		_mm_storeu_si128((__m128i*)(d + 2*n_src), out[2]);
-		_mm_storeu_si128((__m128i*)(d + 3*n_src), out[3]);
-		d += 4*n_src;
+		_mm_storeu_si128((__m128i*)(d + 0*n_channels), out[0]);
+		_mm_storeu_si128((__m128i*)(d + 1*n_channels), out[1]);
+		_mm_storeu_si128((__m128i*)(d + 2*n_channels), out[2]);
+		_mm_storeu_si128((__m128i*)(d + 3*n_channels), out[3]);
+		d += 4*n_channels;
 	}
-	for(; n_samples--; n++) {
+	for(; n < n_samples; n++) {
 		in[0] = _mm_load_ss(&s0[n]);
 		in[1] = _mm_load_ss(&s1[n]);
 		in[2] = _mm_load_ss(&s2[n]);
@@ -314,26 +330,26 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 		in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
 		out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8);
 		_mm_storeu_si128((__m128i*)d, out[0]);
-		d += n_src;
+		d += n_channels;
 	}
 }
 
 static void
-conv_f32d_to_s32_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int32_t *d = dst[0];
 	int i = 0;
 
-	for(; i + 3 < n_src; i += 4)
-		conv_f32d_to_s32_4_sse2(data, &d[i], n_src, &src[i], n_samples);
-	for(; i + 1 < n_src; i += 2)
-		conv_f32d_to_s32_2_sse2(data, &d[i], n_src, &src[i], n_samples);
-	for(; i < n_src; i++)
-		conv_f32d_to_s32_1_sse2(data, &d[i], n_src, &src[i], n_samples);
+	for(; i + 3 < n_channels; i += 4)
+		conv_f32d_to_s32_4_sse2(data, &d[i], &src[i], n_channels, n_samples);
+	for(; i + 1 < n_channels; i += 2)
+		conv_f32d_to_s32_2_sse2(data, &d[i], &src[i], n_channels, n_samples);
+	for(; i < n_channels; i++)
+		conv_f32d_to_s32_1_sse2(data, &d[i], &src[i], n_channels, n_samples);
 }
 
 static void
-conv_f32d_to_s16_1_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s16_1_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	const float *s0 = s[0];
@@ -344,52 +360,59 @@ conv_f32d_to_s16_1_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 	__m128 int_max = _mm_set1_ps(S16_MAX_F);
         __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
 
-	unrolled = n_samples / 8;
-	n_samples = n_samples & 7;
+	if (SPA_IS_ALIGNED(s0, 16))
+		unrolled = n_samples / 8;
+	else
+		unrolled = 0;
 
 	for(n = 0; unrolled--; n += 8) {
-		in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
-		in[1] = _mm_mul_ps(_mm_loadu_ps(&s0[n+4]), int_max);
+		in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
+		in[1] = _mm_mul_ps(_mm_load_ps(&s0[n+4]), int_max);
 		out[0] = _mm_cvtps_epi32(in[0]);
 		out[1] = _mm_cvtps_epi32(in[1]);
 		out[0] = _mm_packs_epi32(out[0], out[1]);
 
-		d[0*n_src] = _mm_extract_epi16(out[0], 0);
-		d[1*n_src] = _mm_extract_epi16(out[0], 1);
-		d[2*n_src] = _mm_extract_epi16(out[0], 2);
-		d[3*n_src] = _mm_extract_epi16(out[0], 3);
-		d[4*n_src] = _mm_extract_epi16(out[0], 4);
-		d[5*n_src] = _mm_extract_epi16(out[0], 5);
-		d[6*n_src] = _mm_extract_epi16(out[0], 6);
-		d[7*n_src] = _mm_extract_epi16(out[0], 7);
-		d += 8*n_src;
+		d[0*n_channels] = _mm_extract_epi16(out[0], 0);
+		d[1*n_channels] = _mm_extract_epi16(out[0], 1);
+		d[2*n_channels] = _mm_extract_epi16(out[0], 2);
+		d[3*n_channels] = _mm_extract_epi16(out[0], 3);
+		d[4*n_channels] = _mm_extract_epi16(out[0], 4);
+		d[5*n_channels] = _mm_extract_epi16(out[0], 5);
+		d[6*n_channels] = _mm_extract_epi16(out[0], 6);
+		d[7*n_channels] = _mm_extract_epi16(out[0], 7);
+		d += 8*n_channels;
 	}
-	for(; n_samples--; n++) {
+	for(; n < n_samples; n++) {
+		/* scalar tail: leftover samples, or the whole buffer when s0
+		 * is not 16-byte aligned and the vector loop was skipped */
 		in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_max);
 		in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min));
 		*d = _mm_cvtss_si32(in[0]);
-		d += n_src;
+		d += n_channels;
 	}
 }
 
 static void
-conv_f32d_to_s16_2_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s16_2_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	const float *s0 = s[0], *s1 = s[1];
 	int16_t *d = dst;
-	int n = 0, unrolled;
+	int n, unrolled;
 	__m128 in[2];
 	__m128i out[4], t[2];
 	__m128 int_max = _mm_set1_ps(S16_MAX_F);
         __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
 
-	unrolled = n_samples / 4;
-	n_samples = n_samples & 3;
+	if (SPA_IS_ALIGNED(s0, 16) &&
+	    SPA_IS_ALIGNED(s1, 16))
+		unrolled = n_samples / 4;
+	else
+		unrolled = 0;
 
-	for(; unrolled--; n += 4) {
-		in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
-		in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max);
+	for(n = 0; unrolled--; n += 4) {
+		in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
+		in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max);
 
 		t[0] = _mm_cvtps_epi32(in[0]);
 		t[1] = _mm_cvtps_epi32(in[1]);
@@ -402,31 +425,33 @@ conv_f32d_to_s16_2_sse2(void *data, void *dst, int n_src, const void *src[n_src]
 		out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2));
 		out[3] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(2, 1, 0, 3));
 
-		*((uint32_t*)(d + 0*n_src)) = _mm_cvtsi128_si32(out[0]);
-		*((uint32_t*)(d + 1*n_src)) = _mm_cvtsi128_si32(out[1]);
-		*((uint32_t*)(d + 2*n_src)) = _mm_cvtsi128_si32(out[2]);
-		*((uint32_t*)(d + 3*n_src)) = _mm_cvtsi128_si32(out[3]);
-		d += 4*n_src;
+		*((int32_t*)(d + 0*n_channels)) = _mm_cvtsi128_si32(out[0]);
+		*((int32_t*)(d + 1*n_channels)) = _mm_cvtsi128_si32(out[1]);
+		*((int32_t*)(d + 2*n_channels)) = _mm_cvtsi128_si32(out[2]);
+		*((int32_t*)(d + 3*n_channels)) = _mm_cvtsi128_si32(out[3]);
+		d += 4*n_channels;
 	}
-	for(; n_samples--; n++) {
+	for(; n < n_samples; n++) {
+		/* scalar tail: leftover samples, or the whole buffer when s0
+		 * or s1 is not 16-byte aligned and the vector loop was skipped */
 		in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_max);
 		in[1] = _mm_mul_ss(_mm_load_ss(&s1[n]), int_max);
 		in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min));
 		in[1] = _mm_min_ss(int_max, _mm_max_ss(in[1], int_min));
 		d[0] = _mm_cvtss_si32(in[0]);
 		d[1] = _mm_cvtss_si32(in[1]);
-		d += n_src;
+		d += n_channels;
 	}
 }
 
 static void
-conv_f32d_to_s16_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s16_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int16_t *d = dst[0];
 	int i = 0;
 
-	for(; i + 1 < n_src; i += 2)
-		conv_f32d_to_s16_2_sse2(data, &d[i], n_src, &src[i], n_samples);
-	for(; i < n_src; i++)
-		conv_f32d_to_s16_1_sse2(data, &d[i], n_src, &src[i], n_samples);
+	for(; i + 1 < n_channels; i += 2)
+		conv_f32d_to_s16_2_sse2(data, &d[i], &src[i], n_channels, n_samples);
+	for(; i < n_channels; i++)
+		conv_f32d_to_s16_1_sse2(data, &d[i], &src[i], n_channels, n_samples);
 }
diff --git a/spa/plugins/audioconvert/fmt-ops.c b/spa/plugins/audioconvert/fmt-ops.c
index 00256a096..08be6b832 100644
--- a/spa/plugins/audioconvert/fmt-ops.c
+++ b/spa/plugins/audioconvert/fmt-ops.c
@@ -30,6 +30,8 @@
 #include <spa/utils/defs.h>
 #include <spa/param/audio/format-utils.h>
 
+#include <xmmintrin.h>
+
 #define U8_MIN		0
 #define U8_MAX		255
 #define U8_SCALE	127.5f
@@ -85,43 +87,68 @@ static inline void write_s24(void *dst, int32_t val)
 #endif
 
 static void
-conv_copy8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_copy8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i;
-	for (i = 0; i < n_src; i++)
+	for (i = 0; i < n_channels; i++)
 		memcpy(dst[i], src[i], n_samples);
 }
 
 static void
-conv_copy16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_copy8(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	memcpy(dst[0], src[0], n_samples * n_channels);
+}
+
+
+static void
+conv_copy16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i;
-	for (i = 0; i < n_src; i++)
+	for (i = 0; i < n_channels; i++)
 		memcpy(dst[i], src[i], n_samples * sizeof(int16_t));
 }
 
 static void
-conv_copy24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_copy16(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	memcpy(dst[0], src[0], n_samples * sizeof(int16_t) * n_channels);
+}
+
+static void
+conv_copy24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i;
-	for (i = 0; i < n_src; i++)
+	for (i = 0; i < n_channels; i++)
 		memcpy(dst[i], src[i], n_samples * 3);
 }
 
 static void
-conv_copy32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_copy24(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	memcpy(dst[0], src[0], n_samples * 3 * n_channels);
+}
+
+static void
+conv_copy32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i;
-	for (i = 0; i < n_src; i++)
+	for (i = 0; i < n_channels; i++)
 		memcpy(dst[i], src[i], n_samples * sizeof(int32_t));
 }
 
 static void
-conv_u8_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_copy32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	memcpy(dst[0], src[0], n_samples * sizeof(int32_t) * n_channels);
+}
+
+static void
+conv_u8d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const uint8_t *s = src[i];
 		float *d = dst[i];
 
@@ -131,37 +158,43 @@ conv_u8_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *s
 }
 
 static void
-conv_u8_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_u8_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_u8d_to_f32d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_u8_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const uint8_t *s = src[0];
 	float **d = (float **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = U8_TO_F32(*s++);
 	}
 }
 
 static void
-conv_u8d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_u8d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const uint8_t **s = (const uint8_t **) src;
 	float *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = U8_TO_F32(s[i][j]);
 	}
 }
 
 static void
-conv_s16_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s16d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const int16_t *s = src[i];
 		float *d = dst[i];
 		for (j = 0; j < n_samples; j++)
@@ -170,37 +203,43 @@ conv_s16_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *
 }
 
 static void
-conv_s16_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s16_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_s16d_to_f32d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_s16_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int16_t *s = src[0];
 	float **d = (float **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = S16_TO_F32(*s++);
 	}
 }
 
 static void
-conv_s16d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s16d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int16_t **s = (const int16_t **) src;
 	float *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = S16_TO_F32(s[i][j]);
 	}
 }
 
 static void
-conv_s32_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s32d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const int32_t *s = src[i];
 		float *d = dst[i];
 
@@ -210,38 +249,43 @@ conv_s32_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *
 }
 
 static void
-conv_s32_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s32_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_s32d_to_f32d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_s32_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int32_t *s = src[0];
 	float **d = (float **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = S32_TO_F32(*s++);
 	}
 }
 
 static void
-conv_s32d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s32d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int32_t **s = (const int32_t **) src;
 	float *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = S32_TO_F32(s[i][j]);
 	}
 }
 
-
 static void
-conv_s24_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s24d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const int8_t *s = src[i];
 		float *d = dst[i];
 
@@ -253,14 +297,20 @@ conv_s24_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *
 }
 
 static void
-conv_s24_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s24_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_s24d_to_f32d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_s24_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const uint8_t *s = src[0];
 	float **d = (float **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++) {
+		for (i = 0; i < n_channels; i++) {
 			d[i][j] = S24_TO_F32(read_s24(s));
 			s += 3;
 		}
@@ -268,25 +318,25 @@ conv_s24_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void
 }
 
 static void
-conv_s24d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s24d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const uint8_t **s = (const uint8_t **) src;
 	float *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++) {
+		for (i = 0; i < n_channels; i++) {
 			*d++ = S24_TO_F32(read_s24(&s[i][j*3]));
 		}
 	}
 }
 
 static void
-conv_s24_32_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s24_32d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const int32_t *s = src[i];
 		float *d = dst[i];
 
@@ -296,37 +346,43 @@ conv_s24_32_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const voi
 }
 
 static void
-conv_s24_32_to_f32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s24_32_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_s24_32d_to_f32d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_s24_32_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int32_t *s = src[0];
 	float **d = (float **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = S24_TO_F32(*s++);
 	}
 }
 
 static void
-conv_s24_32d_to_f32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s24_32d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int32_t **s = (const int32_t **) src;
 	float *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = S24_TO_F32(s[i][j]);
 	}
 }
 
 static void
-conv_f32_to_u8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_u8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const float *s = src[i];
 		uint8_t *d = dst[i];
 
@@ -336,37 +392,43 @@ conv_f32_to_u8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *s
 }
 
 static void
-conv_f32_to_u8d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32_to_u8(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_f32d_to_u8d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_f32_to_u8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float *s = src[0];
 	uint8_t **d = (uint8_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = F32_TO_U8(*s++);
 	}
 }
 
 static void
-conv_f32d_to_u8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_u8(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	uint8_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = F32_TO_U8(s[i][j]);
 	}
 }
 
 static void
-conv_f32_to_s16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const float *s = src[i];
 		int16_t *d = dst[i];
 
@@ -376,37 +438,43 @@ conv_f32_to_s16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *
 }
 
 static void
-conv_f32_to_s16d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32_to_s16(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_f32d_to_s16d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_f32_to_s16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float *s = src[0];
 	int16_t **d = (int16_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = F32_TO_S16(*s++);
 	}
 }
 
 static void
-conv_f32d_to_s16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s16(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	int16_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = F32_TO_S16(s[i][j]);
 	}
 }
 
 static void
-conv_f32_to_s32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const float *s = src[i];
 		int32_t *d = dst[i];
 
@@ -416,27 +484,33 @@ conv_f32_to_s32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *
 }
 
 static void
-conv_f32_to_s32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32_to_s32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_f32d_to_s32d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_f32_to_s32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float *s = src[0];
 	int32_t **d = (int32_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = F32_TO_S32(*s++);
 	}
 }
 
 static void
-conv_f32d_to_s32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	int32_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = F32_TO_S32(s[i][j]);
 	}
 }
@@ -444,11 +518,11 @@ conv_f32d_to_s32(void *data, int n_dst, void *dst[n_dst], int n_src, const void
 
 
 static void
-conv_f32_to_s24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const float *s = src[i];
 		uint8_t *d = dst[i];
 
@@ -460,28 +534,34 @@ conv_f32_to_s24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *
 }
 
 static void
-conv_f32_to_s24d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32_to_s24(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_f32d_to_s24d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_f32_to_s24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float *s = src[0];
 	uint8_t **d = (uint8_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++) {
+		for (i = 0; i < n_channels; i++) {
 			write_s24(&d[i][j*3], F32_TO_S24(*s++));
 		}
 	}
 }
 
 static void
-conv_f32d_to_s24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s24(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	uint8_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++) {
+		for (i = 0; i < n_channels; i++) {
 			write_s24(d, F32_TO_S24(s[i][j]));
 			d += 3;
 		}
@@ -490,11 +570,11 @@ conv_f32d_to_s24(void *data, int n_dst, void *dst[n_dst], int n_src, const void
 
 
 static void
-conv_f32_to_s24_32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s24_32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	int i, j;
 
-	for (i = 0; i < n_src; i++) {
+	for (i = 0; i < n_channels; i++) {
 		const float *s = src[i];
 		int32_t *d = dst[i];
 
@@ -504,66 +584,72 @@ conv_f32_to_s24_32(void *data, int n_dst, void *dst[n_dst], int n_src, const voi
 }
 
 static void
-conv_f32_to_s24_32d(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32_to_s24_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+{
+	conv_f32d_to_s24_32d(data, dst, src, 1, n_samples * n_channels);
+}
+
+static void
+conv_f32_to_s24_32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float *s = src[0];
 	int32_t **d = (int32_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = F32_TO_S24(*s++);
 	}
 }
 
 static void
-conv_f32d_to_s24_32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s24_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const float **s = (const float **) src;
 	int32_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = F32_TO_S24(s[i][j]);
 	}
 }
 
 static void
-deinterleave_8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+deinterleave_8(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const uint8_t *s = src[0];
 	uint8_t **d = (uint8_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = *s++;
 	}
 }
 
 static void
-deinterleave_16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+deinterleave_16(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const uint16_t *s = src[0];
 	uint16_t **d = (uint16_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = *s++;
 	}
 }
 
 static void
-deinterleave_24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+deinterleave_24(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const uint8_t *s = src[0];
 	uint8_t **d = (uint8_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++) {
+		for (i = 0; i < n_channels; i++) {
 			write_s24(&d[i][j*3], read_s24(s));
 			s += 3;
 		}
@@ -571,53 +657,53 @@ deinterleave_24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *
 }
 
 static void
-deinterleave_32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+deinterleave_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const uint32_t *s = src[0];
 	uint32_t **d = (uint32_t **) dst;
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_dst; i++)
+		for (i = 0; i < n_channels; i++)
 			d[i][j] = *s++;
 	}
 }
 
 static void
-interleave_8(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+interleave_8(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int8_t **s = (const int8_t **) src;
 	uint8_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = s[i][j];
 	}
 }
 
 static void
-interleave_16(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+interleave_16(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int16_t **s = (const int16_t **) src;
 	uint16_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = s[i][j];
 	}
 }
 
 static void
-interleave_24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+interleave_24(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int8_t **s = (const int8_t **) src;
 	uint8_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++) {
+		for (i = 0; i < n_channels; i++) {
 			write_s24(d, read_s24(&s[i][j*3]));
 			d += 3;
 		}
@@ -625,21 +711,21 @@ interleave_24(void *data, int n_dst, void *dst[n_dst], int n_src, const void *sr
 }
 
 static void
-interleave_32(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+interleave_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
 {
 	const int32_t **s = (const int32_t **) src;
 	uint32_t *d = dst[0];
 	int i, j;
 
 	for (j = 0; j < n_samples; j++) {
-		for (i = 0; i < n_src; i++)
+		for (i = 0; i < n_channels; i++)
 			*d++ = s[i][j];
 	}
 }
 
 
-typedef void (*convert_func_t) (void *data, int n_dst, void *dst[n_dst],
-				int n_src, const void *src[n_src], int n_samples);
+typedef void (*convert_func_t) (void *data, void *dst[], const void *src[],
+		int n_channels, int n_samples);
 
 static const struct conv_info {
 	uint32_t src_fmt;
@@ -652,13 +738,13 @@ static const struct conv_info {
 {
 	/* to f32 */
 	{ SPA_AUDIO_FORMAT_U8, SPA_AUDIO_FORMAT_F32, 0, conv_u8_to_f32 },
-	{ SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_F32P, 0, conv_u8_to_f32 },
+	{ SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_F32P, 0, conv_u8d_to_f32d },
 	{ SPA_AUDIO_FORMAT_U8, SPA_AUDIO_FORMAT_F32P, 0, conv_u8_to_f32d },
 	{ SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_F32, 0, conv_u8d_to_f32 },
 
 
 	{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32, 0, conv_s16_to_f32 },
-	{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, conv_s16_to_f32 },
+	{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, conv_s16d_to_f32d },
 #if defined (__SSE2__)
 	{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s16_to_f32d_sse2 },
 #endif
@@ -666,17 +752,17 @@ static const struct conv_info {
 	{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32, 0, conv_s16d_to_f32 },
 
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_F32, 0, conv_copy32 },
-	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_F32P, 0, conv_copy32 },
+	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_F32P, 0, conv_copy32d },
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_F32P, 0, deinterleave_32 },
 	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_F32, 0, interleave_32 },
 
 	{ SPA_AUDIO_FORMAT_S32, SPA_AUDIO_FORMAT_F32, 0, conv_s32_to_f32 },
-	{ SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_F32P, 0, conv_s32_to_f32 },
+	{ SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_F32P, 0, conv_s32d_to_f32d },
 	{ SPA_AUDIO_FORMAT_S32, SPA_AUDIO_FORMAT_F32P, 0, conv_s32_to_f32d },
 	{ SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_F32, 0, conv_s32d_to_f32 },
 
 	{ SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32, 0, conv_s24_to_f32 },
-	{ SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24_to_f32 },
+	{ SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24d_to_f32d },
 #if defined (__SSE2__)
 	{ SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s24_to_f32d_sse2 },
 #endif
@@ -684,18 +770,18 @@ static const struct conv_info {
 	{ SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_F32, 0, conv_s24d_to_f32 },
 
 	{ SPA_AUDIO_FORMAT_S24_32, SPA_AUDIO_FORMAT_F32, 0, conv_s24_32_to_f32 },
-	{ SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24_32_to_f32 },
+	{ SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24_32d_to_f32d },
 	{ SPA_AUDIO_FORMAT_S24_32, SPA_AUDIO_FORMAT_F32P, 0, conv_s24_32_to_f32d },
 	{ SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_F32, 0, conv_s24_32d_to_f32 },
 
 	/* from f32 */
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_U8, 0, conv_f32_to_u8 },
-	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_U8P, 0, conv_f32_to_u8 },
+	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_U8P, 0, conv_f32d_to_u8d },
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_U8P, 0, conv_f32_to_u8d },
 	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_U8, 0, conv_f32d_to_u8 },
 
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S16, 0, conv_f32_to_s16 },
-	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16P, 0, conv_f32_to_s16 },
+	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16P, 0, conv_f32d_to_s16d },
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S16P, 0, conv_f32_to_s16d },
 #if defined (__SSE2__)
 	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16, FEATURE_SSE2, conv_f32d_to_s16_sse2 },
@@ -703,7 +789,7 @@ static const struct conv_info {
 	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16, 0, conv_f32d_to_s16 },
 
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S32, 0, conv_f32_to_s32 },
-	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S32P, 0, conv_f32_to_s32 },
+	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S32P, 0, conv_f32d_to_s32d },
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S32P, 0, conv_f32_to_s32d },
 #if defined (__SSE2__)
 	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S32, FEATURE_SSE2, conv_f32d_to_s32_sse2 },
@@ -711,42 +797,42 @@ static const struct conv_info {
 	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S32, 0, conv_f32d_to_s32 },
 
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S24, 0, conv_f32_to_s24 },
-	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24P, 0, conv_f32_to_s24 },
+	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24P, 0, conv_f32d_to_s24d },
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S24P, 0, conv_f32_to_s24d },
 	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24, 0, conv_f32d_to_s24 },
 
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S24_32, 0, conv_f32_to_s24_32 },
-	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24_32P, 0, conv_f32_to_s24_32 },
+	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24_32P, 0, conv_f32d_to_s24_32d },
 	{ SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S24_32P, 0, conv_f32_to_s24_32d },
 	{ SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S24_32, 0, conv_f32d_to_s24_32 },
 
 	/* u8 */
 	{ SPA_AUDIO_FORMAT_U8, SPA_AUDIO_FORMAT_U8, 0, conv_copy8 },
-	{ SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_U8P, 0, conv_copy8 },
+	{ SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_U8P, 0, conv_copy8d },
 	{ SPA_AUDIO_FORMAT_U8, SPA_AUDIO_FORMAT_U8P, 0, deinterleave_8 },
 	{ SPA_AUDIO_FORMAT_U8P, SPA_AUDIO_FORMAT_U8, 0, interleave_8 },
 
 	/* s16 */
 	{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_S16, 0, conv_copy16 },
-	{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_S16P, 0, conv_copy16 },
+	{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_S16P, 0, conv_copy16d },
 	{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_S16P, 0, deinterleave_16 },
 	{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_S16, 0, interleave_16 },
 
 	/* s32 */
 	{ SPA_AUDIO_FORMAT_S32, SPA_AUDIO_FORMAT_S32, 0, conv_copy32 },
-	{ SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_S32P, 0, conv_copy32 },
+	{ SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_S32P, 0, conv_copy32d },
 	{ SPA_AUDIO_FORMAT_S32, SPA_AUDIO_FORMAT_S32P, 0, deinterleave_32 },
 	{ SPA_AUDIO_FORMAT_S32P, SPA_AUDIO_FORMAT_S32, 0, interleave_32 },
 
 	/* s24 */
 	{ SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_S24, 0, conv_copy24 },
-	{ SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_S24P, 0, conv_copy24 },
+	{ SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_S24P, 0, conv_copy24d },
 	{ SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_S24P, 0, deinterleave_24 },
 	{ SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_S24, 0, interleave_24 },
 
 	/* s24_32 */
 	{ SPA_AUDIO_FORMAT_S24_32, SPA_AUDIO_FORMAT_S24_32, 0, conv_copy32 },
-	{ SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_S24_32P, 0, conv_copy32 },
+	{ SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_S24_32P, 0, conv_copy32d },
 	{ SPA_AUDIO_FORMAT_S24_32, SPA_AUDIO_FORMAT_S24_32P, 0, deinterleave_32 },
 	{ SPA_AUDIO_FORMAT_S24_32P, SPA_AUDIO_FORMAT_S24_32, 0, interleave_32 },
 };
diff --git a/spa/plugins/audioconvert/fmtconvert.c b/spa/plugins/audioconvert/fmtconvert.c
index a99216f5a..c49f6c1d1 100644
--- a/spa/plugins/audioconvert/fmtconvert.c
+++ b/spa/plugins/audioconvert/fmtconvert.c
@@ -115,8 +115,6 @@ struct impl {
 
 	uint32_t cpu_flags;
 	convert_func_t convert;
-
-	float empty[4096];
 };
 
 #define CHECK_PORT(this,d,id)		(id == 0)
@@ -656,7 +654,7 @@ impl_node_port_use_buffers(struct spa_node *node,
 {
 	struct impl *this;
 	struct port *port;
-	uint32_t i, size = SPA_ID_INVALID;
+	uint32_t i, size = SPA_ID_INVALID, j;
 
 	spa_return_val_if_fail(node != NULL, -EINVAL);
 
@@ -674,6 +672,7 @@ impl_node_port_use_buffers(struct spa_node *node,
 
 	for (i = 0; i < n_buffers; i++) {
 		struct buffer *b;
+		uint32_t n_datas = buffers[i]->n_datas;
 		struct spa_data *d = buffers[i]->datas;
 
 		b = &port->buffers[i];
@@ -682,19 +681,35 @@ impl_node_port_use_buffers(struct spa_node *node,
 		b->outbuf = buffers[i];
 		b->h = spa_buffer_find_meta_data(buffers[i], SPA_META_Header, sizeof(*b->h));
 
+		if (n_datas != port->blocks) {
+			spa_log_error(this->log, NAME " %p: expected %d blocks on buffer %d", this,
+				      port->blocks, i);
+			return -EINVAL;
+		}
+
 		if (size == SPA_ID_INVALID)
 			size = d[0].maxsize;
 		else
-			if (size != d[0].maxsize)
+			if (size != d[0].maxsize) {
+				spa_log_error(this->log, NAME " %p: expected size %d on buffer %d", this,
+				      size, i);
 				return -EINVAL;
+			}
 
-		if (!((d[0].type == SPA_DATA_MemPtr ||
-		       d[0].type == SPA_DATA_MemFd ||
-		       d[0].type == SPA_DATA_DmaBuf) && d[0].data != NULL)) {
-			spa_log_error(this->log, NAME " %p: invalid memory on buffer %p", this,
-				      buffers[i]);
-			return -EINVAL;
+		for (j = 0; j < n_datas; j++) {
+			if (!((d[j].type == SPA_DATA_MemPtr ||
+			       d[j].type == SPA_DATA_MemFd ||
+			       d[j].type == SPA_DATA_DmaBuf) && d[j].data != NULL)) {
+				spa_log_error(this->log, NAME " %p: invalid memory %d on buffer %d",
+						this, j, i);
+				return -EINVAL;
+			}
+			if (!SPA_IS_ALIGNED(d[j].data, 16)) {
+				spa_log_warn(this->log, NAME " %p: memory %d on buffer %d not aligned",
+						this, j, i);
+			}
 		}
+
 		if (direction == SPA_DIRECTION_OUTPUT)
 			spa_list_append(&port->queue, &b->link);
 		else
@@ -878,7 +893,7 @@ static int impl_node_process(struct spa_node *node)
 	spa_log_trace(this->log, NAME " %p: n_src:%d n_dst:%d size:%d maxsize:%d n_samples:%d",
 			this, n_src_datas, n_dst_datas, size, maxsize, n_samples);
 
-	this->convert(this, n_dst_datas, dst_datas, n_src_datas, src_datas, n_samples);
+	this->convert(this, dst_datas, src_datas, SPA_MAX(n_src_datas, n_dst_datas), n_samples);
 
 	inio->status = SPA_STATUS_NEED_BUFFER;
 	res |= SPA_STATUS_NEED_BUFFER;
diff --git a/spa/plugins/audioconvert/merger.c b/spa/plugins/audioconvert/merger.c
index 4dd17a663..cbcd8f357 100644
--- a/spa/plugins/audioconvert/merger.c
+++ b/spa/plugins/audioconvert/merger.c
@@ -42,7 +42,7 @@
 #define DEFAULT_RATE		48000
 #define DEFAULT_CHANNELS	2
 
-#define MAX_SAMPLES	1024
+#define MAX_SAMPLES	2048
 #define MAX_BUFFERS	64
 #define MAX_PORTS	128
 
@@ -100,7 +100,7 @@ struct impl {
 	bool monitor;
 	bool have_profile;
 
-	float empty[MAX_SAMPLES];
+	float empty[MAX_SAMPLES + 15];
 };
 
 #define CHECK_IN_PORT(this,d,p)		((d) == SPA_DIRECTION_INPUT && (p) < this->port_count)
@@ -750,7 +750,7 @@ impl_node_port_use_buffers(struct spa_node *node,
 {
 	struct impl *this;
 	struct port *port;
-	uint32_t i;
+	uint32_t i, j;
 
 	spa_return_val_if_fail(node != NULL, -EINVAL);
 
@@ -769,6 +769,7 @@ impl_node_port_use_buffers(struct spa_node *node,
 
 	for (i = 0; i < n_buffers; i++) {
 		struct buffer *b;
+		uint32_t n_datas = buffers[i]->n_datas;
 		struct spa_data *d = buffers[i]->datas;
 
 		b = &port->buffers[i];
@@ -776,13 +777,25 @@ impl_node_port_use_buffers(struct spa_node *node,
 		b->flags = 0;
 		b->buf = buffers[i];
 
-		if (!((d[0].type == SPA_DATA_MemPtr ||
-		       d[0].type == SPA_DATA_MemFd ||
-		       d[0].type == SPA_DATA_DmaBuf) && d[0].data != NULL)) {
-			spa_log_error(this->log, NAME " %p: invalid memory on buffer %p %d %p", this,
-				      buffers[i], d[0].type, d[0].data);
+		if (n_datas != port->blocks) {
+			spa_log_error(this->log, NAME " %p: invalid blocks %d on buffer %d",
+					this, n_datas, i);
 			return -EINVAL;
 		}
+
+		for (j = 0; j < n_datas; j++) {
+			if (!((d[j].type == SPA_DATA_MemPtr ||
+			       d[j].type == SPA_DATA_MemFd ||
+			       d[j].type == SPA_DATA_DmaBuf) && d[j].data != NULL)) {
+				spa_log_error(this->log, NAME " %p: invalid memory %d on buffer %d %d %p",
+						this, j, i, d[j].type, d[j].data);
+				return -EINVAL;
+			}
+			if (!SPA_IS_ALIGNED(d[j].data, 16))
+				spa_log_warn(this->log, NAME " %p: memory %d on buffer %d not aligned",
+						this, j, i);
+		}
+
 		if (direction == SPA_DIRECTION_OUTPUT)
 			queue_buffer(this, port, i);
 	}
@@ -960,7 +973,7 @@ static int impl_node_process(struct spa_node *node)
 		struct port *inport = GET_IN_PORT(this, i);
 
 		if (get_in_buffer(this, inport, &sbuf) < 0) {
-			src_datas[n_src_datas++] = this->empty;
+			src_datas[n_src_datas++] = SPA_PTR_ALIGN(this->empty, 16, void);
 			continue;
 		}
 
@@ -987,7 +1000,7 @@ static int impl_node_process(struct spa_node *node)
 				n_samples * outport->stride);
 	}
 
-	this->convert(this, n_dst_datas, dst_datas, n_src_datas, src_datas, n_samples);
+	this->convert(this, dst_datas, src_datas, SPA_MAX(n_dst_datas, n_src_datas), n_samples);
 
 	return res | SPA_STATUS_HAVE_BUFFER;
 }
diff --git a/spa/plugins/audioconvert/splitter.c b/spa/plugins/audioconvert/splitter.c
index 06e82da16..2d2035f82 100644
--- a/spa/plugins/audioconvert/splitter.c
+++ b/spa/plugins/audioconvert/splitter.c
@@ -44,7 +44,7 @@
 #define DEFAULT_CHANNELS	2
 #define DEFAULT_MASK		(1LL << SPA_AUDIO_CHANNEL_FL) | (1LL << SPA_AUDIO_CHANNEL_FR)
 
-#define MAX_SAMPLES	1024
+#define MAX_SAMPLES	2048
 #define MAX_BUFFERS	64
 #define MAX_PORTS	128
 
@@ -100,7 +100,7 @@ struct impl {
 
 	bool have_profile;
 
-	float empty[MAX_SAMPLES];
+	float empty[MAX_SAMPLES + 15];
 };
 
 #define CHECK_OUT_PORT(this,d,p)	((d) == SPA_DIRECTION_OUTPUT && (p) < this->port_count)
@@ -754,10 +754,13 @@ impl_node_port_use_buffers(struct spa_node *node,
 		if (!((d[0].type == SPA_DATA_MemPtr ||
 		       d[0].type == SPA_DATA_MemFd ||
 		       d[0].type == SPA_DATA_DmaBuf) && d[0].data != NULL)) {
-			spa_log_error(this->log, NAME " %p: invalid memory on buffer %p %d %p", this,
-				      buffers[i], d[0].type, d[0].data);
+			spa_log_error(this->log, NAME " %p: invalid memory on buffer %d %d %p", this,
+				      i, d[0].type, d[0].data);
 			return -EINVAL;
 		}
+		if (!SPA_IS_ALIGNED(d[0].data, 16))
+			spa_log_warn(this->log, NAME " %p: memory on buffer %d not aligned", this, i);
+
 		if (direction == SPA_DIRECTION_OUTPUT)
 			queue_buffer(this, port, i);
 	}
@@ -903,7 +906,7 @@ static int impl_node_process(struct spa_node *node)
 		if ((dbuf = dequeue_buffer(this, outport)) == NULL) {
 			outio->status = -EPIPE;
           empty:
-			dst_datas[n_dst_datas++] = this->empty;
+			dst_datas[n_dst_datas++] = SPA_PTR_ALIGN(this->empty, 16, void);
 			continue;
 		}
 
@@ -927,7 +930,7 @@ static int impl_node_process(struct spa_node *node)
 	spa_log_trace(this->log, NAME " %p: %d %d %d %d %d", this,
 			n_src_datas, n_dst_datas, n_samples, maxsize, inport->stride);
 
-	this->convert(this, n_dst_datas, dst_datas, n_src_datas, src_datas, n_samples);
+	this->convert(this, dst_datas, src_datas, SPA_MAX(n_dst_datas, n_src_datas), n_samples);
 
 	inio->status = SPA_STATUS_NEED_BUFFER;
 	res |= SPA_STATUS_NEED_BUFFER;
diff --git a/spa/plugins/audioconvert/test-fmt-ops.c b/spa/plugins/audioconvert/test-fmt-ops.c
index f52dca11f..797c074df 100644
--- a/spa/plugins/audioconvert/test-fmt-ops.c
+++ b/spa/plugins/audioconvert/test-fmt-ops.c
@@ -33,7 +33,7 @@
 
 #include "fmt-ops.c"
 
-#define N_SAMPLES	29
+#define N_SAMPLES	253
 #define N_CHANNELS	11
 
 static uint8_t samp_in[N_SAMPLES * 4];
@@ -47,7 +47,7 @@ static void run_test(const char *name,
 {
 	const void *ip[N_CHANNELS];
 	void *tp[N_CHANNELS];
-	int i, j, ic, oc, ns;
+	int i, j;
 	const uint8_t *in8 = in, *out8 = out;
 
 	for (j = 0; j < N_SAMPLES; j++) {
@@ -62,16 +62,16 @@ static void run_test(const char *name,
 		tp[0] = temp_in;
 		switch(in_size) {
 		case 1:
-			interleave_8(NULL, 1, tp, N_CHANNELS, ip, N_SAMPLES);
+			interleave_8(NULL, tp, ip, N_CHANNELS, N_SAMPLES);
 			break;
 		case 2:
-			interleave_16(NULL, 1, tp, N_CHANNELS, ip, N_SAMPLES);
+			interleave_16(NULL, tp, ip, N_CHANNELS, N_SAMPLES);
 			break;
 		case 3:
-			interleave_24(NULL, 1, tp, N_CHANNELS, ip, N_SAMPLES);
+			interleave_24(NULL, tp, ip, N_CHANNELS, N_SAMPLES);
 			break;
 		case 4:
-			interleave_32(NULL, 1, tp, N_CHANNELS, ip, N_SAMPLES);
+			interleave_32(NULL, tp, ip, N_CHANNELS, N_SAMPLES);
 			break;
 		default:
 			fprintf(stderr, "unknown size %zd\n", in_size);
@@ -84,16 +84,11 @@ static void run_test(const char *name,
 	for (j = 0; j < N_CHANNELS; j++)
 		tp[j] = &temp_out[j * N_SAMPLES * out_size];
 
-	ic = in_packed ? 1 : N_CHANNELS;
-	oc = out_packed ? 1 : N_CHANNELS;
-	ns = (in_packed && out_packed) ? N_SAMPLES * N_CHANNELS : N_SAMPLES;
-
-	func(NULL, oc, tp, ic, ip, ns);
+	func(NULL, tp, ip, N_CHANNELS, N_SAMPLES);
 
 	fprintf(stderr, "test %s:\n", name);
 	if (out_packed) {
 		const uint8_t *d = tp[0], *s = samp_out;
-		spa_debug_mem(0, d, N_SAMPLES * N_CHANNELS * out_size);
 		for (i = 0; i < N_SAMPLES; i++) {
 			for (j = 0; j < N_CHANNELS; j++) {
 				spa_assert(memcmp(d, s, out_size) == 0);
@@ -119,6 +114,8 @@ static void test_f32_u8(void)
 			false, true, conv_f32d_to_u8);
 	run_test("test_f32_u8d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 			true, false, conv_f32_to_u8d);
+	run_test("test_f32d_u8d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_f32d_to_u8d);
 }
 
 static void test_u8_f32(void)
@@ -132,6 +129,8 @@ static void test_u8_f32(void)
 			false, true, conv_u8d_to_f32);
 	run_test("test_u8_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 			true, false, conv_u8_to_f32d);
+	run_test("test_u8d_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_u8d_to_f32d);
 }
 
 static void test_f32_s16(void)
@@ -145,6 +144,8 @@ static void test_f32_s16(void)
 			false, true, conv_f32d_to_s16);
 	run_test("test_f32_s16d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 			true, false, conv_f32_to_s16d);
+	run_test("test_f32d_s16d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_f32d_to_s16d);
 }
 
 static void test_s16_f32(void)
@@ -158,6 +159,8 @@ static void test_s16_f32(void)
 			false, true, conv_s16d_to_f32);
 	run_test("test_s16_f32", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 			true, true, conv_s16_to_f32);
+	run_test("test_s16d_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_s16d_to_f32d);
 }
 
 static void test_f32_s32(void)
@@ -172,6 +175,8 @@ static void test_f32_s32(void)
 			false, true, conv_f32d_to_s32);
 	run_test("test_f32_s32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 			true, false, conv_f32_to_s32d);
+	run_test("test_f32d_s32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_f32d_to_s32d);
 }
 
 static void test_s32_f32(void)
@@ -185,6 +190,8 @@ static void test_s32_f32(void)
 			false, true, conv_s32d_to_f32);
 	run_test("test_s32_f32", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 			true, true, conv_s32_to_f32);
+	run_test("test_s32d_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_s32d_to_f32d);
 }
 
 static void test_f32_s24(void)
@@ -193,9 +200,14 @@ static void test_f32_s24(void)
 	const uint8_t out[] = { 0x00, 0x00, 0x00, 0xff, 0xff, 0x7f, 0x01, 0x00, 0x80,
 		0xff, 0xff, 0x3f, 0x01, 0x00, 0xc0, 0xff, 0xff, 0x7f, 0x01, 0x00, 0x80 };
 
-	run_test("test_f32_s24", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), true, true, conv_f32_to_s24);
-	run_test("test_f32d_s24", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), false, true, conv_f32d_to_s24);
-	run_test("test_f32_s24d", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in), true, false, conv_f32_to_s24d);
+	run_test("test_f32_s24", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in),
+			true, true, conv_f32_to_s24);
+	run_test("test_f32d_s24", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in),
+			false, true, conv_f32d_to_s24);
+	run_test("test_f32_s24d", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in),
+			true, false, conv_f32_to_s24d);
+	run_test("test_f32d_s24d", in, sizeof(in[0]), out, 3, SPA_N_ELEMENTS(in),
+			false, false, conv_f32d_to_s24d);
 }
 
 static void test_s24_f32(void)
@@ -204,9 +216,14 @@ static void test_s24_f32(void)
 		0xff, 0xff, 0x3f, 0x01, 0x00, 0xc0,  };
 	const float out[] = { 0.0f, 1.0f, -1.0f, 0.4999999404f, -0.4999999404f, };
 
-	run_test("test_s24_f32d", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, false, conv_s24_to_f32d);
-	run_test("test_s24d_f32", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), false, true, conv_s24d_to_f32);
-	run_test("test_s24_f32", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out), true, true, conv_s24_to_f32);
+	run_test("test_s24_f32d", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			true, false, conv_s24_to_f32d);
+	run_test("test_s24d_f32", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, true, conv_s24d_to_f32);
+	run_test("test_s24_f32", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			true, true, conv_s24_to_f32);
+	run_test("test_s24d_f32d", in, 3, out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_s24d_to_f32d);
 }
 
 static void test_f32_s24_32(void)
@@ -221,6 +238,8 @@ static void test_f32_s24_32(void)
 			false, true, conv_f32d_to_s24_32);
 	run_test("test_f32_s24_32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 			true, false, conv_f32_to_s24_32d);
+	run_test("test_f32d_s24_32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_f32d_to_s24_32d);
 }
 
 static void test_s24_32_f32(void)
@@ -234,6 +253,8 @@ static void test_s24_32_f32(void)
 			false, true, conv_s24_32d_to_f32);
 	run_test("test_s24_32_f32", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 			true, true, conv_s24_32_to_f32);
+	run_test("test_s24_32d_f32d", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
+			false, false, conv_s24_32d_to_f32d);
 }
 
 int main(int argc, char *argv[])
diff --git a/src/modules/module-audio-dsp/audio-dsp.c b/src/modules/module-audio-dsp/audio-dsp.c
index 074d5c2eb..350cecd9c 100644
--- a/src/modules/module-audio-dsp/audio-dsp.c
+++ b/src/modules/module-audio-dsp/audio-dsp.c
@@ -69,7 +69,7 @@ struct port {
 	struct spa_handle *spa_handle;
 	struct spa_node *spa_node;
 
-	float empty[MAX_BUFFER_SIZE];
+	float empty[MAX_BUFFER_SIZE + 15];
 };
 
 struct node {
@@ -101,14 +101,15 @@ static void init_buffer(struct port *port, uint32_t id)
 	b->datas[0].flags = 0;
 	b->datas[0].fd = -1;
 	b->datas[0].mapoffset = 0;
-	b->datas[0].maxsize = sizeof(port->empty);
-	b->datas[0].data = port->empty;
+	b->datas[0].maxsize = SPA_ROUND_DOWN_N(sizeof(port->empty), 16);
+	b->datas[0].data = SPA_PTR_ALIGN(port->empty, 16, void);
 	b->datas[0].chunk = b->chunk;
 	b->datas[0].chunk->offset = 0;
 	b->datas[0].chunk->size = 0;
 	b->datas[0].chunk->stride = 0;
 	port->bufs[id] = &b->buf;
 	memset(port->empty, 0, sizeof(port->empty));
+	pw_log_debug("%p %d", b->datas[0].data, b->datas[0].maxsize);
 }
 
 static void init_port(struct port *p, enum spa_direction direction)
diff --git a/src/modules/module-audio-dsp/floatmix.c b/src/modules/module-audio-dsp/floatmix.c
index d87091f8f..bf9c6508f 100644
--- a/src/modules/module-audio-dsp/floatmix.c
+++ b/src/modules/module-audio-dsp/floatmix.c
@@ -109,7 +109,7 @@ struct impl {
 	uint32_t stride;
 
 	bool started;
-        float empty[MAX_SAMPLES];
+        float empty[MAX_SAMPLES + 15];
 };
 
 #define CHECK_FREE_IN_PORT(this,d,p) ((d) == SPA_DIRECTION_INPUT && (p) < MAX_PORTS && !this->in_ports[(p)].valid)
@@ -632,10 +632,12 @@ impl_node_port_use_buffers(struct spa_node *node,
 		if (!((d[0].type == SPA_DATA_MemPtr ||
 		       d[0].type == SPA_DATA_MemFd ||
 		       d[0].type == SPA_DATA_DmaBuf) && d[0].data != NULL)) {
-			spa_log_error(this->log, NAME " %p: invalid memory on buffer %p", this,
-				      buffers[i]);
+			spa_log_error(this->log, NAME " %p: invalid memory on buffer %d", this, i);
 			return -EINVAL;
 		}
+		if (!SPA_IS_ALIGNED(d[0].data, 16)) {
+			spa_log_warn(this->log, NAME " %p: memory on buffer %d not aligned", this, i);
+		}
 		if (direction == SPA_DIRECTION_OUTPUT)
 			queue_buffer(this, port, b);
 	}
@@ -717,23 +719,27 @@ impl_node_port_send_command(struct spa_node *node,
 #include <xmmintrin.h>
 static void mix_2(float *dst, float *src1, float *src2, int n_samples)
 {
-	int i, unrolled;
+	int n, unrolled;	/* n: sample index; unrolled: number of 4-float SSE iterations */
 	__m128 in[2];
 
-	unrolled = n_samples / 4;
-	n_samples &= 3;
+	if (SPA_IS_ALIGNED(src1, 16) &&	/* aligned load/store below need 16-byte aligned pointers */
+	    SPA_IS_ALIGNED(src2, 16) &&
+	    SPA_IS_ALIGNED(dst, 16))
+		unrolled = n_samples / 4;
+	else
+		unrolled = 0;	/* unaligned: every sample goes through the scalar loop */
 
-	for (i = 0; unrolled--; i += 4) {
-		in[0] = _mm_loadu_ps(&src1[i]),
-		in[1] = _mm_loadu_ps(&src2[i]),
+	for (n = 0; unrolled--; n += 4) {
+		in[0] = _mm_load_ps(&src1[n]);
+		in[1] = _mm_load_ps(&src2[n]);
 		in[0] = _mm_add_ps(in[0], in[1]);
-		_mm_storeu_ps(&dst[i], in[0]);
+		_mm_store_ps(&dst[n], in[0]);
 	}
-	for (; n_samples--; i++) {
-		in[0] = _mm_load_ss(&src1[i]),
-		in[1] = _mm_load_ss(&src2[i]),
+	for (; n < n_samples; n++) {	/* remaining samples, one float at a time */
+		in[0] = _mm_load_ss(&src1[n]);
+		in[1] = _mm_load_ss(&src2[n]);
 		in[0] = _mm_add_ss(in[0], in[1]);
-		_mm_store_ss(&dst[i], in[0]);
+		_mm_store_ss(&dst[n], in[0]);
 	}
 }
 #else
@@ -825,13 +831,13 @@ static int impl_node_process(struct spa_node *node)
 
 		outb->buffer->n_datas = 1;
 		outb->buffer->datas = outb->datas;
-		outb->datas[0].data = this->empty;
+		outb->datas[0].data = SPA_PTR_ALIGN(this->empty, 16, void);
 		outb->datas[0].chunk = outb->chunk;
 		outb->datas[0].chunk->offset = 0;
 		outb->datas[0].chunk->size = n_samples * sizeof(float);
 		outb->datas[0].chunk->stride = sizeof(float);
 
-		dst = this->empty;
+		dst = outb->datas[0].data;
 		if (n_buffers == 0) {
 			memset(dst, 0, n_samples * sizeof(float));
 		}
diff --git a/src/modules/module-client-node/client-node.c b/src/modules/module-client-node/client-node.c
index 02b796485..594dac6b6 100644
--- a/src/modules/module-client-node/client-node.c
+++ b/src/modules/module-client-node/client-node.c
@@ -837,7 +837,7 @@ do_port_use_buffers(struct impl *impl,
 
 		data_size = 0;
 		for (j = 0; j < buffers[i]->n_metas; j++) {
-			data_size += buffers[i]->metas[j].size;
+			data_size += SPA_ROUND_UP_N(buffers[i]->metas[j].size, 8);
 		}
 		for (j = 0; j < buffers[i]->n_datas; j++) {
 			struct spa_data *d = buffers[i]->datas;
diff --git a/src/pipewire/link.c b/src/pipewire/link.c
index 2d7abdabd..4aa369455 100644
--- a/src/pipewire/link.c
+++ b/src/pipewire/link.c
@@ -419,6 +419,7 @@ static int alloc_buffers(struct pw_link *this,
 			 uint32_t n_datas,
 			 size_t *data_sizes,
 			 ssize_t *data_strides,
+			 size_t *data_aligns,
 			 struct allocation *allocation)
 {
 	int res;
@@ -452,12 +453,13 @@ static int alloc_buffers(struct pw_link *this,
 
 			metas[n_metas].type = type;
 			metas[n_metas].size = size;
-			meta_size += metas[n_metas].size;
+			meta_size += SPA_ROUND_UP_N(metas[n_metas].size, 8);
 			n_metas++;
 			skel_size += sizeof(struct spa_meta);
 		}
 	}
 	data_size += meta_size;
+	data_size = SPA_ROUND_UP_N(data_size, data_aligns[0]);
 
 	/* data */
 	for (i = 0; i < n_datas; i++) {
@@ -492,7 +494,7 @@ static int alloc_buffers(struct pw_link *this,
 			m->type = metas[j].type;
 			m->size = metas[j].size;
 			m->data = p;
-			p = SPA_MEMBER(p, m->size, void);
+			p = SPA_MEMBER(p, SPA_ROUND_UP_N(m->size, 8), void);
 		}
 		/* pointer to data structure */
 		b->n_datas = n_datas;
@@ -509,7 +511,7 @@ static int alloc_buffers(struct pw_link *this,
 				d->type = SPA_DATA_MemFd;
 				d->flags = 0;
 				d->fd = m->fd;
-				d->mapoffset = SPA_PTRDIFF(ddp, m->ptr);
+				d->mapoffset = SPA_ROUND_UP_N(SPA_PTRDIFF(ddp, m->ptr), data_aligns[j]); /* j indexes datas, like data_sizes[j] */
 				d->maxsize = data_sizes[j];
 				d->data = SPA_MEMBER(m->ptr, d->mapoffset, void);
 				d->chunk->offset = 0;
@@ -701,9 +703,10 @@ static int do_allocation(struct pw_link *this, uint32_t in_state, uint32_t out_s
 		struct spa_pod_builder b = SPA_POD_BUILDER_INIT(buffer, sizeof(buffer));
 		uint32_t i, offset, n_params;
 		uint32_t max_buffers;
-		size_t minsize = 8192, stride = 0;
+		size_t minsize = 8192, stride = 0, align;
 		size_t data_sizes[1];
 		ssize_t data_strides[1];
+		size_t data_aligns[1];
 
 		n_params = param_filter(this, input, output, SPA_PARAM_Buffers, &b);
 		n_params += param_filter(this, input, output, SPA_PARAM_Meta, &b);
@@ -720,25 +723,29 @@ static int do_allocation(struct pw_link *this, uint32_t in_state, uint32_t out_s
 
 		max_buffers = MAX_BUFFERS;
 		minsize = stride = 0;
+		align = 8;
 		param = find_param(params, n_params, SPA_TYPE_OBJECT_ParamBuffers);
 		if (param) {
 			uint32_t qmax_buffers = max_buffers,
-			    qminsize = minsize, qstride = stride;
+			    qminsize = minsize, qstride = stride, qalign = align;
 
 			spa_pod_parse_object(param,
 				SPA_TYPE_OBJECT_ParamBuffers, NULL,
 				SPA_PARAM_BUFFERS_buffers, SPA_POD_Int(&qmax_buffers),
 				SPA_PARAM_BUFFERS_size,    SPA_POD_Int(&qminsize),
-				SPA_PARAM_BUFFERS_stride,  SPA_POD_Int(&qstride));
+				SPA_PARAM_BUFFERS_stride,  SPA_POD_Int(&qstride),
+				SPA_PARAM_BUFFERS_align,   SPA_POD_Int(&qalign));
 
 			max_buffers =
 			    qmax_buffers == 0 ? max_buffers : SPA_MIN(qmax_buffers,
 							      max_buffers);
 			minsize = SPA_MAX(minsize, qminsize);
 			stride = SPA_MAX(stride, qstride);
+			align = SPA_MAX(align, qalign);
 
-			pw_log_debug("%d %d %d -> %zd %zd %d", qminsize, qstride, qmax_buffers,
-				     minsize, stride, max_buffers);
+			pw_log_debug("%u %u %u %u -> %zu %zu %u %zu", /* parsed q* values -> values used */
+					qminsize, qstride, qmax_buffers, qalign,
+					minsize, stride, max_buffers, align);
 		} else {
 			pw_log_warn("no buffers param");
 			minsize = 8192;
@@ -754,6 +761,7 @@ static int do_allocation(struct pw_link *this, uint32_t in_state, uint32_t out_s
 
 		data_sizes[0] = minsize;
 		data_strides[0] = stride;
+		data_aligns[0] = align;
 
 		if ((res = alloc_buffers(this,
 					 max_buffers,
@@ -761,6 +769,7 @@ static int do_allocation(struct pw_link *this, uint32_t in_state, uint32_t out_s
 					 params,
 					 1,
 					 data_sizes, data_strides,
+					 data_aligns,
 					 &allocation)) < 0) {
 			asprintf(&error, "error alloc buffers: %d", res);
 			goto error;
diff --git a/src/pipewire/remote.c b/src/pipewire/remote.c
index b04c9a5c0..5ed844984 100644
--- a/src/pipewire/remote.c
+++ b/src/pipewire/remote.c
@@ -1075,7 +1075,7 @@ client_node_port_use_buffers(void *object,
 			struct spa_meta *m = &b->metas[j];
 			memcpy(m, &buffers[i].buffer->metas[j], sizeof(struct spa_meta));
 			m->data = SPA_MEMBER(bmem.map.ptr, offset, void);
-			offset += m->size;
+			offset += SPA_ROUND_UP_N(m->size, 8);
 		}
 
 		for (j = 0; j < b->n_datas; j++) {