From 938f2b123e31cd459c54b834b76eb15bfc7a35ba Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Wed, 29 Jun 2022 14:10:15 +0200 Subject: [PATCH] audioconvert: improve format conversion Make dither noise as a value between -0.5 and 0.5 and add this to the scaled samples. For this, we first need to do the scaling and then the CLAMP to the target depth. This optimizes to the same code but allows us to avoid under and overflows when we add the dither noise. Add more dithering methods. Expose a dither.method property on audioconvert. Disable dither when the target depth > 16. --- spa/plugins/audioconvert/audioconvert.c | 27 ++- spa/plugins/audioconvert/fmt-ops-avx2.c | 30 +-- spa/plugins/audioconvert/fmt-ops-c.c | 239 ++++++++++++++++-------- spa/plugins/audioconvert/fmt-ops-sse2.c | 112 +++++++++-- spa/plugins/audioconvert/fmt-ops.c | 41 +++- spa/plugins/audioconvert/fmt-ops.h | 130 +++++++------ 6 files changed, 408 insertions(+), 171 deletions(-) diff --git a/spa/plugins/audioconvert/audioconvert.c b/spa/plugins/audioconvert/audioconvert.c index 113b7a3b1..390f6b6d6 100644 --- a/spa/plugins/audioconvert/audioconvert.c +++ b/spa/plugins/audioconvert/audioconvert.c @@ -634,6 +634,24 @@ static int impl_node_enum_params(void *object, int seq, SPA_PROP_INFO_type, SPA_POD_CHOICE_RANGE_Int(this->dir[1].conv.noise, 0, 16), SPA_PROP_INFO_params, SPA_POD_Bool(true)); break; + case 23: + spa_pod_builder_push_object(&b, &f[0], SPA_TYPE_OBJECT_PropInfo, id); + spa_pod_builder_add(&b, + SPA_PROP_INFO_name, SPA_POD_String("dither.method"), + SPA_PROP_INFO_description, SPA_POD_String("The dithering method"), + SPA_PROP_INFO_type, SPA_POD_String( + dither_method_info[this->dir[1].conv.method].label), + SPA_PROP_INFO_params, SPA_POD_Bool(true), + 0); + spa_pod_builder_prop(&b, SPA_PROP_INFO_labels, 0); + spa_pod_builder_push_struct(&b, &f[1]); + for (i = 0; i < SPA_N_ELEMENTS(dither_method_info); i++) { + spa_pod_builder_string(&b, dither_method_info[i].label); + 
spa_pod_builder_string(&b, dither_method_info[i].description); + } + spa_pod_builder_pop(&b, &f[1]); + param = spa_pod_builder_pop(&b, &f[0]); + break; default: return 0; } @@ -704,6 +722,8 @@ static int impl_node_enum_params(void *object, int seq, spa_pod_builder_bool(&b, p->resample_disabled); spa_pod_builder_string(&b, "dither.noise"); spa_pod_builder_int(&b, this->dir[1].conv.noise); + spa_pod_builder_string(&b, "dither.method"); + spa_pod_builder_string(&b, dither_method_info[this->dir[1].conv.method].label); spa_pod_builder_pop(&b, &f[1]); param = spa_pod_builder_pop(&b, &f[0]); break; @@ -775,6 +795,8 @@ static int audioconvert_set_param(struct impl *this, const char *k, const char * this->props.resample_disabled = spa_atob(s); else if (spa_streq(k, "dither.noise")) spa_atou32(s, &this->dir[1].conv.noise, 0); + else if (spa_streq(k, "dither.method")) + this->dir[1].conv.method = dither_method_from_label(s); else return 0; return 1; @@ -1410,14 +1432,15 @@ static int setup_out_convert(struct impl *this) out->conv.quantize = calc_width(&dst_info) * 8; out->conv.src_fmt = src_info.info.raw.format; out->conv.dst_fmt = dst_info.info.raw.format; + out->conv.rate = dst_info.info.raw.rate; out->conv.n_channels = dst_info.info.raw.channels; out->conv.cpu_flags = this->cpu_flags; if ((res = convert_init(&out->conv)) < 0) return res; - spa_log_debug(this->log, "%p: got converter features %08x:%08x quant:%d:%d passthrough:%d %s", this, - this->cpu_flags, out->conv.cpu_flags, + spa_log_debug(this->log, "%p: got converter features %08x:%08x quant:%d:%d:%d passthrough:%d %s", this, + this->cpu_flags, out->conv.cpu_flags, out->conv.method, out->conv.quantize, out->conv.noise, out->conv.is_passthrough, out->conv.func_name); diff --git a/spa/plugins/audioconvert/fmt-ops-avx2.c b/spa/plugins/audioconvert/fmt-ops-avx2.c index 5c9ea6793..0ced69274 100644 --- a/spa/plugins/audioconvert/fmt-ops-avx2.c +++ b/spa/plugins/audioconvert/fmt-ops-avx2.c @@ -550,7 +550,7 @@ 
conv_f32d_to_s32_1s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R __m128 in[1]; __m128i out[4]; __m128 scale = _mm_set1_ps(S32_SCALE); - __m128 int_min = _mm_set1_ps(S32_MIN); + __m128 int_max = _mm_set1_ps(S32_MAX); if (SPA_IS_ALIGNED(s0, 16)) unrolled = n_samples & ~3; @@ -559,7 +559,7 @@ conv_f32d_to_s32_1s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R for(n = 0; n < unrolled; n += 4) { in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), scale); - in[0] = _mm_min_ps(in[0], int_min); + in[0] = _mm_min_ps(in[0], int_max); out[0] = _mm_cvtps_epi32(in[0]); out[1] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(0, 3, 2, 1)); out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2)); @@ -574,7 +574,7 @@ conv_f32d_to_s32_1s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R for(; n < n_samples; n++) { in[0] = _mm_load_ss(&s0[n]); in[0] = _mm_mul_ss(in[0], scale); - in[0] = _mm_min_ss(in[0], int_min); + in[0] = _mm_min_ss(in[0], int_max); *d = _mm_cvtss_si32(in[0]); d += n_channels; } @@ -590,7 +590,7 @@ conv_f32d_to_s32_2s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R __m256 in[2]; __m256i out[2], t[2]; __m256 scale = _mm256_set1_ps(S32_SCALE); - __m256 int_min = _mm256_set1_ps(S32_MIN); + __m256 int_max = _mm256_set1_ps(S32_MAX); if (SPA_IS_ALIGNED(s0, 32) && SPA_IS_ALIGNED(s1, 32)) @@ -602,8 +602,8 @@ conv_f32d_to_s32_2s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R in[0] = _mm256_mul_ps(_mm256_load_ps(&s0[n]), scale); in[1] = _mm256_mul_ps(_mm256_load_ps(&s1[n]), scale); - in[0] = _mm256_min_ps(in[0], int_min); - in[1] = _mm256_min_ps(in[1], int_min); + in[0] = _mm256_min_ps(in[0], int_max); + in[1] = _mm256_min_ps(in[1], int_max); out[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */ out[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */ @@ -636,7 +636,7 @@ conv_f32d_to_s32_2s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R __m128 in[2]; __m128i out[2]; __m128 scale = 
_mm_set1_ps(S32_SCALE); - __m128 int_min = _mm_set1_ps(S32_MIN); + __m128 int_max = _mm_set1_ps(S32_MAX); in[0] = _mm_load_ss(&s0[n]); in[1] = _mm_load_ss(&s1[n]); @@ -644,7 +644,7 @@ conv_f32d_to_s32_2s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R in[0] = _mm_unpacklo_ps(in[0], in[1]); in[0] = _mm_mul_ps(in[0], scale); - in[0] = _mm_min_ps(in[0], int_min); + in[0] = _mm_min_ps(in[0], int_max); out[0] = _mm_cvtps_epi32(in[0]); _mm_storel_epi64((__m128i*)d, out[0]); d += n_channels; @@ -661,7 +661,7 @@ conv_f32d_to_s32_4s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R __m256 in[4]; __m256i out[4], t[4]; __m256 scale = _mm256_set1_ps(S32_SCALE); - __m256 int_min = _mm256_set1_ps(S32_MIN); + __m256 int_max = _mm256_set1_ps(S32_MAX); if (SPA_IS_ALIGNED(s0, 32) && SPA_IS_ALIGNED(s1, 32) && @@ -677,10 +677,10 @@ conv_f32d_to_s32_4s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R in[2] = _mm256_mul_ps(_mm256_load_ps(&s2[n]), scale); in[3] = _mm256_mul_ps(_mm256_load_ps(&s3[n]), scale); - in[0] = _mm256_min_ps(in[0], int_min); - in[1] = _mm256_min_ps(in[1], int_min); - in[2] = _mm256_min_ps(in[2], int_min); - in[3] = _mm256_min_ps(in[3], int_min); + in[0] = _mm256_min_ps(in[0], int_max); + in[1] = _mm256_min_ps(in[1], int_max); + in[2] = _mm256_min_ps(in[2], int_max); + in[3] = _mm256_min_ps(in[3], int_max); out[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */ out[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */ @@ -711,7 +711,7 @@ conv_f32d_to_s32_4s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R __m128 in[4]; __m128i out[4]; __m128 scale = _mm_set1_ps(S32_SCALE); - __m128 int_min = _mm_set1_ps(S32_MIN); + __m128 int_max = _mm_set1_ps(S32_MAX); in[0] = _mm_load_ss(&s0[n]); in[1] = _mm_load_ss(&s1[n]); @@ -723,7 +723,7 @@ conv_f32d_to_s32_4s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_R in[0] = _mm_unpacklo_ps(in[0], in[1]); in[0] = _mm_mul_ps(in[0], scale); - in[0] = 
_mm_min_ps(in[0], int_min); + in[0] = _mm_min_ps(in[0], int_max); out[0] = _mm_cvtps_epi32(in[0]); _mm_storeu_si128((__m128i*)d, out[0]); d += n_channels; diff --git a/spa/plugins/audioconvert/fmt-ops-c.c b/spa/plugins/audioconvert/fmt-ops-c.c index f4d762c57..44ffb30c4 100644 --- a/spa/plugins/audioconvert/fmt-ops-c.c +++ b/spa/plugins/audioconvert/fmt-ops-c.c @@ -737,23 +737,24 @@ conv_f64d_to_f32_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * } /* 32 bit xorshift PRNG, see https://en.wikipedia.org/wiki/Xorshift */ -static inline uint32_t +static inline int32_t xorshift(uint32_t *state) { uint32_t x = *state; x ^= x << 13; x ^= x >> 17; x ^= x << 5; - return (*state = x); + return (int32_t)(*state = x); } static inline void update_dither_c(struct convert *conv, uint32_t n_samples) { - uint32_t n, mask = conv->mask; - int32_t offset = conv->offset + conv->bias; + uint32_t n; + float *dither = conv->dither, scale = conv->scale; + uint32_t *state = &conv->random[0]; for (n = 0; n < n_samples; n++) - conv->dither[n] = offset + (int32_t)(xorshift(&conv->random[0]) & mask); + dither[n] = xorshift(state) * scale; } void @@ -771,6 +772,27 @@ conv_f32d_to_u8d_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * } } +void +conv_f32d_to_u8d_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], + uint32_t n_samples) +{ + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; + + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); + + for (i = 0; i < n_channels; i++) { + const float *s = src[i]; + uint8_t *d = dst[i]; + + for (j = 0; j < n_samples;) { + chunk = SPA_MIN(n_samples - j, dither_size); + for (k = 0; k < chunk; k++, j++) + d[j] = F32_TO_U8_D(s[j], dither[k]); + } + } +} + void conv_f32_to_u8_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) @@ -813,6 +835,26 @@ 
conv_f32d_to_u8_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * } } +void +conv_f32d_to_u8_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], + uint32_t n_samples) +{ + const float **s = (const float **) src; + uint8_t *d = dst[0]; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; + + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); + + for (j = 0; j < n_samples;) { + chunk = SPA_MIN(n_samples - j, dither_size); + for (k = 0; k < chunk; k++, j++) { + for (i = 0; i < n_channels; i++) + *d++ = F32_TO_U8_D(s[i][j], dither[k]); + } + } +} + void conv_f32d_to_s8d_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) @@ -828,6 +870,27 @@ conv_f32d_to_s8d_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * } } +void +conv_f32d_to_s8d_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], + uint32_t n_samples) +{ + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; + + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); + + for (i = 0; i < n_channels; i++) { + const float *s = src[i]; + int8_t *d = dst[i]; + + for (j = 0; j < n_samples;) { + chunk = SPA_MIN(n_samples - j, dither_size); + for (k = 0; k < chunk; k++, j++) + d[j] = F32_TO_S8_D(s[j], dither[k]); + } + } +} + void conv_f32_to_s8_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) @@ -870,6 +933,26 @@ conv_f32d_to_s8_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * } } +void +conv_f32d_to_s8_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], + uint32_t n_samples) +{ + const float **s = (const float **) src; + int8_t *d = dst[0]; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = 
conv->dither_size; + float *dither = conv->dither; + + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); + + for (j = 0; j < n_samples;) { + chunk = SPA_MIN(n_samples - j, dither_size); + for (k = 0; k < chunk; k++, j++) { + for (i = 0; i < n_channels; i++) + *d++ = F32_TO_S8_D(s[i][j], dither[k]); + } + } +} + void conv_f32d_to_alaw_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) @@ -944,21 +1027,19 @@ void conv_f32d_to_s16d_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) { - uint32_t i, j, k, chunk, n_channels = conv->n_channels; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (i = 0; i < n_channels; i++) { const float *s = src[i]; int16_t *d = dst[i]; - int32_t v; for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); - for (k = 0; k < chunk; k++, j++) { - v = F32_TO_S24(s[j]) + conv->dither[k]; - d[j] = v >> 8; - } + chunk = SPA_MIN(n_samples - j, dither_size); + for (k = 0; k < chunk; k++, j++) + d[j] = F32_TO_S16_D(s[j], dither[k]); } } } @@ -1011,18 +1092,16 @@ conv_f32d_to_s16_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const { const float **s = (const float **) src; int16_t *d = dst[0]; - uint32_t i, j, k, chunk, n_channels = conv->n_channels; - int32_t v; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { - for (i = 0; i < n_channels; i++) { 
- v = F32_TO_S24(s[i][j]) + conv->dither[k]; - *d++ = v >> 8; - } + for (i = 0; i < n_channels; i++) + *d++ = F32_TO_S16_D(s[i][j], dither[k]); } } } @@ -1046,19 +1125,17 @@ conv_f32d_to_s16s_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], cons uint32_t n_samples) { const float **s = (const float **) src; - int16_t *d = dst[0]; - uint32_t i, j, k, chunk, n_channels = conv->n_channels; - int32_t v; + uint16_t *d = dst[0]; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { - for (i = 0; i < n_channels; i++) { - v = F32_TO_S24(s[i][j]) + conv->dither[k]; - *d++ = bswap_16(v >> 8); - } + for (i = 0; i < n_channels; i++) + *d++ = F32_TO_S16S_D(s[i][j], dither[k]); } } } @@ -1110,18 +1187,19 @@ void conv_f32d_to_s32d_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) { - uint32_t i, j, k, chunk, n_channels = conv->n_channels; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (i = 0; i < n_channels; i++) { const float *s = src[i]; int32_t *d = dst[i]; for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) - d[j] = F32_TO_S32(s[j]) + conv->dither[k]; + d[j] = F32_TO_S32_D(s[j], dither[k]); } } } @@ -1174,15 +1252,16 @@ conv_f32d_to_s32_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const { const float **s = (const float **) src; int32_t *d 
= dst[0]; - uint32_t i, j, k, chunk, n_channels = conv->n_channels; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { for (i = 0; i < n_channels; i++) - *d++ = F32_TO_S32(s[i][j]) + conv->dither[k]; + *d++ = F32_TO_S32_D(s[i][j], dither[k]); } } } @@ -1192,7 +1271,7 @@ conv_f32d_to_s32s_c(struct convert *conv, void * SPA_RESTRICT dst[], const void uint32_t n_samples) { const float **s = (const float **) src; - int32_t *d = dst[0]; + uint32_t *d = dst[0]; uint32_t i, j, n_channels = conv->n_channels; for (j = 0; j < n_samples; j++) { @@ -1206,18 +1285,17 @@ conv_f32d_to_s32s_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], cons uint32_t n_samples) { const float **s = (const float **) src; - int32_t *d = dst[0], v; - uint32_t i, j, k, chunk, n_channels = conv->n_channels; + uint32_t *d = dst[0]; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { - for (i = 0; i < n_channels; i++) { - v = F32_TO_S32(s[i][j]) + conv->dither[k]; - *d++ = bswap_32(v); - } + for (i = 0; i < n_channels; i++) + *d++ = F32_TO_S32S_D(s[i][j], dither[k]); } } } @@ -1346,20 +1424,19 @@ void conv_f32d_to_s24d_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) { - uint32_t i, j, k, chunk, n_channels = conv->n_channels; - 
int32_t v; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (i = 0; i < n_channels; i++) { const float *s = src[i]; uint8_t *d = dst[i]; for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { - v = F32_TO_S24(s[j]) + conv->dither[k]; - write_s24(d, v); + write_s24(d, F32_TO_S24_D(s[j], dither[k])); d += 3; } } @@ -1419,24 +1496,22 @@ conv_f32d_to_s24_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const { const float **s = (const float **) src; uint8_t *d = dst[0]; - uint32_t i, j, k, chunk, n_channels = conv->n_channels; - int32_t v; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { for (i = 0; i < n_channels; i++) { - v = F32_TO_S24(s[i][j]) + conv->dither[k]; - write_s24(d, v); + write_s24(d, F32_TO_S24_D(s[i][j], dither[k])); d += 3; } } } } - void conv_f32d_to_s24s_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) @@ -1459,17 +1534,16 @@ conv_f32d_to_s24s_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], cons { const float **s = (const float **) src; uint8_t *d = dst[0]; - uint32_t i, j, k, chunk, n_channels = conv->n_channels; - int32_t v; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + 
update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { for (i = 0; i < n_channels; i++) { - v = F32_TO_S24(s[i][j]) + conv->dither[k]; - write_s24s(d, v); + write_s24s(d, F32_TO_S24_D(s[i][j], dither[k])); d += 3; } } @@ -1495,18 +1569,19 @@ void conv_f32d_to_s24_32d_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_samples) { - uint32_t i, j, k, chunk, n_channels = conv->n_channels; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (i = 0; i < n_channels; i++) { const float *s = src[i]; int32_t *d = dst[i]; for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) - d[j] = F32_TO_S24_32(s[j]) + conv->dither[k]; + d[j] = F32_TO_S24_32_D(s[j], dither[k]); } } } @@ -1587,15 +1662,16 @@ conv_f32d_to_s24_32_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], co { const float **s = (const float **) src; int32_t *d = dst[0]; - uint32_t i, j, k, chunk, n_channels = conv->n_channels; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { for (i = 0; i < n_channels; i++) - *d++ = F32_TO_S24_32(s[i][j]) + conv->dither[k]; + *d++ = F32_TO_S24_32_D(s[i][j], dither[k]); } } } @@ -1619,18 +1695,17 @@ 
conv_f32d_to_s24_32s_dither_c(struct convert *conv, void * SPA_RESTRICT dst[], c uint32_t n_samples) { const float **s = (const float **) src; - int32_t *d = dst[0], v; - uint32_t i, j, k, chunk, n_channels = conv->n_channels; + int32_t *d = dst[0]; + uint32_t i, j, k, chunk, n_channels = conv->n_channels, dither_size = conv->dither_size; + float *dither = conv->dither; - update_dither_c(conv, SPA_MIN(n_samples, conv->dither_size)); + update_dither_c(conv, SPA_MIN(n_samples, dither_size)); for (j = 0; j < n_samples;) { - chunk = SPA_MIN(n_samples - j, conv->dither_size); + chunk = SPA_MIN(n_samples - j, dither_size); for (k = 0; k < chunk; k++, j++) { - for (i = 0; i < n_channels; i++) { - v = F32_TO_S24_32(s[i][j]) + conv->dither[k]; - *d++ = bswap_32(v); - } + for (i = 0; i < n_channels; i++) + *d++ = F32_TO_S24_32S_D(s[i][j], dither[k]); } } } diff --git a/spa/plugins/audioconvert/fmt-ops-sse2.c b/spa/plugins/audioconvert/fmt-ops-sse2.c index 4fd13a1d4..6d811914c 100644 --- a/spa/plugins/audioconvert/fmt-ops-sse2.c +++ b/spa/plugins/audioconvert/fmt-ops-sse2.c @@ -385,7 +385,7 @@ conv_f32d_to_s32_1s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R __m128 in[1]; __m128i out[4]; __m128 scale = _mm_set1_ps(S32_SCALE); - __m128 int_min = _mm_set1_ps(S32_MIN); + __m128 int_max = _mm_set1_ps(S32_MAX); if (SPA_IS_ALIGNED(s0, 16)) unrolled = n_samples & ~3; @@ -394,7 +394,7 @@ conv_f32d_to_s32_1s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R for(n = 0; n < unrolled; n += 4) { in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), scale); - in[0] = _mm_min_ps(in[0], int_min); + in[0] = _mm_min_ps(in[0], int_max); out[0] = _mm_cvtps_epi32(in[0]); out[1] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(0, 3, 2, 1)); out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2)); @@ -409,7 +409,7 @@ conv_f32d_to_s32_1s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R for(; n < n_samples; n++) { in[0] = _mm_load_ss(&s0[n]); in[0] = _mm_mul_ss(in[0], scale); - 
in[0] = _mm_min_ss(in[0], int_min); + in[0] = _mm_min_ss(in[0], int_max); *d = _mm_cvtss_si32(in[0]); d += n_channels; } @@ -425,7 +425,7 @@ conv_f32d_to_s32_2s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R __m128 in[2]; __m128i out[2], t[2]; __m128 scale = _mm_set1_ps(S32_SCALE); - __m128 int_min = _mm_set1_ps(S32_MIN); + __m128 int_max = _mm_set1_ps(S32_MAX); if (SPA_IS_ALIGNED(s0, 16) && SPA_IS_ALIGNED(s1, 16)) @@ -437,8 +437,8 @@ conv_f32d_to_s32_2s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), scale); in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), scale); - in[0] = _mm_min_ps(in[0], int_min); - in[1] = _mm_min_ps(in[1], int_min); + in[0] = _mm_min_ps(in[0], int_max); + in[1] = _mm_min_ps(in[1], int_max); out[0] = _mm_cvtps_epi32(in[0]); out[1] = _mm_cvtps_epi32(in[1]); @@ -459,7 +459,7 @@ conv_f32d_to_s32_2s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R in[0] = _mm_unpacklo_ps(in[0], in[1]); in[0] = _mm_mul_ps(in[0], scale); - in[0] = _mm_min_ps(in[0], int_min); + in[0] = _mm_min_ps(in[0], int_max); out[0] = _mm_cvtps_epi32(in[0]); _mm_storel_epi64((__m128i*)d, out[0]); d += n_channels; @@ -476,7 +476,7 @@ conv_f32d_to_s32_4s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R __m128 in[4]; __m128i out[4]; __m128 scale = _mm_set1_ps(S32_SCALE); - __m128 int_min = _mm_set1_ps(S32_MIN); + __m128 int_max = _mm_set1_ps(S32_MAX); if (SPA_IS_ALIGNED(s0, 16) && SPA_IS_ALIGNED(s1, 16) && @@ -492,10 +492,10 @@ conv_f32d_to_s32_4s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R in[2] = _mm_mul_ps(_mm_load_ps(&s2[n]), scale); in[3] = _mm_mul_ps(_mm_load_ps(&s3[n]), scale); - in[0] = _mm_min_ps(in[0], int_min); - in[1] = _mm_min_ps(in[1], int_min); - in[2] = _mm_min_ps(in[2], int_min); - in[3] = _mm_min_ps(in[3], int_min); + in[0] = _mm_min_ps(in[0], int_max); + in[1] = _mm_min_ps(in[1], int_max); + in[2] = _mm_min_ps(in[2], int_max); + in[3] = _mm_min_ps(in[3], int_max); 
_MM_TRANSPOSE4_PS(in[0], in[1], in[2], in[3]); @@ -521,7 +521,7 @@ conv_f32d_to_s32_4s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_R in[0] = _mm_unpacklo_ps(in[0], in[1]); in[0] = _mm_mul_ps(in[0], scale); - in[0] = _mm_min_ps(in[0], int_min); + in[0] = _mm_min_ps(in[0], int_max); out[0] = _mm_cvtps_epi32(in[0]); _mm_storeu_si128((__m128i*)d, out[0]); d += n_channels; @@ -543,6 +543,92 @@ conv_f32d_to_s32_sse2(struct convert *conv, void * SPA_RESTRICT dst[], const voi conv_f32d_to_s32_1s_sse2(conv, &d[i], &src[i], n_channels, n_samples); } +static inline void update_dither_sse2(struct convert *conv, uint32_t n_samples) +{ + uint32_t n; + const uint32_t *r = SPA_PTR_ALIGN(conv->random, 16, uint32_t); + float *dither = SPA_PTR_ALIGN(conv->dither, 16, float); + __m128 scale = _mm_set1_ps(conv->scale), out[1]; + __m128i in[1], t[1]; + + for (n = 0; n < n_samples; n += 4) { + /* 32 bit xorshift PRNG, see https://en.wikipedia.org/wiki/Xorshift */ + in[0] = _mm_load_si128((__m128i*)r); + t[0] = _mm_slli_epi32(in[0], 13); + in[0] = _mm_xor_si128(in[0], t[0]); + t[0] = _mm_srli_epi32(in[0], 17); + in[0] = _mm_xor_si128(in[0], t[0]); + t[0] = _mm_slli_epi32(in[0], 5); + in[0] = _mm_xor_si128(in[0], t[0]); + _mm_store_si128((__m128i*)r, in[0]); + + out[0] = _mm_cvtepi32_ps(in[0]); + out[0] = _mm_mul_ps(out[0], scale); + _mm_store_ps(&dither[n], out[0]); + } +} + +static void +conv_f32d_to_s32_1s_dither_sse2(struct convert *conv, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src, + uint32_t n_channels, uint32_t n_samples) +{ + const float *s = src; + float *dither = SPA_PTR_ALIGN(conv->dither, 16, float); + int32_t *d = dst; + uint32_t n, unrolled; + __m128 in[1]; + __m128i out[4]; + __m128 scale = _mm_set1_ps(S32_SCALE); + __m128 int_max = _mm_set1_ps(S32_MAX); + + if (SPA_IS_ALIGNED(s, 16)) + unrolled = n_samples & ~3; + else + unrolled = 0; + + for(n = 0; n < unrolled; n += 4) { + in[0] = _mm_mul_ps(_mm_load_ps(&s[n]), scale); + in[0] = _mm_add_ps(in[0], 
_mm_load_ps(&dither[n])); + in[0] = _mm_min_ps(in[0], int_max); + out[0] = _mm_cvtps_epi32(in[0]); + out[1] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(0, 3, 2, 1)); + out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2)); + out[3] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(2, 1, 0, 3)); + + d[0*n_channels] = _mm_cvtsi128_si32(out[0]); + d[1*n_channels] = _mm_cvtsi128_si32(out[1]); + d[2*n_channels] = _mm_cvtsi128_si32(out[2]); + d[3*n_channels] = _mm_cvtsi128_si32(out[3]); + d += 4*n_channels; + } + for(; n < n_samples; n++) { + in[0] = _mm_load_ss(&s[n]); + in[0] = _mm_mul_ss(in[0], scale); + in[0] = _mm_add_ss(in[0], _mm_load_ss(&dither[n])); + in[0] = _mm_min_ss(in[0], int_max); + *d = _mm_cvtss_si32(in[0]); + d += n_channels; + } +} + +void +conv_f32d_to_s32_dither_sse2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], + uint32_t n_samples) +{ + int32_t *d = dst[0]; + uint32_t i, k, chunk, n_channels = conv->n_channels; + + update_dither_sse2(conv, SPA_MIN(n_samples, conv->dither_size)); + + for(i = 0; i < n_channels; i++) { + const float *s = src[i]; + for(k = 0; k < n_samples; k += chunk) { + chunk = SPA_MIN(n_samples - k, conv->dither_size); + conv_f32d_to_s32_1s_dither_sse2(conv, &d[i + k*n_channels], &s[k], n_channels, chunk); + } + } +} + static void conv_interleave_32_1s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) diff --git a/spa/plugins/audioconvert/fmt-ops.c b/spa/plugins/audioconvert/fmt-ops.c index ccd91f2d3..7ad743f57 100644 --- a/spa/plugins/audioconvert/fmt-ops.c +++ b/spa/plugins/audioconvert/fmt-ops.c @@ -170,13 +170,17 @@ static struct conv_info conv_table[] = /* from f32 */ MAKE(F32, U8, 0, conv_f32_to_u8_c), + MAKE(F32P, U8P, 0, conv_f32d_to_u8d_dither_c, 0, CONV_DITHER), MAKE(F32P, U8P, 0, conv_f32d_to_u8d_c), MAKE(F32, U8P, 0, conv_f32_to_u8d_c), + MAKE(F32P, U8, 0, conv_f32d_to_u8_dither_c, 0, CONV_DITHER), MAKE(F32P, U8, 0, 
conv_f32d_to_u8_c), MAKE(F32, S8, 0, conv_f32_to_s8_c), + MAKE(F32P, S8P, 0, conv_f32d_to_s8d_dither_c, 0, CONV_DITHER), MAKE(F32P, S8P, 0, conv_f32d_to_s8d_c), MAKE(F32, S8P, 0, conv_f32_to_s8d_c), + MAKE(F32P, S8, 0, conv_f32d_to_s8_dither_c, 0, CONV_DITHER), MAKE(F32P, S8, 0, conv_f32d_to_s8_c), MAKE(F32P, ALAW, 0, conv_f32d_to_alaw_c), @@ -224,7 +228,11 @@ static struct conv_info conv_table[] = MAKE(F32P, S32P, 0, conv_f32d_to_s32d_c), MAKE(F32, S32P, 0, conv_f32_to_s32d_c), +#if defined (HAVE_SSE2) + MAKE(F32P, S32, 0, conv_f32d_to_s32_dither_sse2, SPA_CPU_FLAG_SSE2, CONV_DITHER), +#endif MAKE(F32P, S32, 0, conv_f32d_to_s32_dither_c, 0, CONV_DITHER), + #if defined (HAVE_AVX2) MAKE(F32P, S32, 0, conv_f32d_to_s32_avx2, SPA_CPU_FLAG_AVX2), #endif @@ -357,17 +365,38 @@ static void impl_convert_free(struct convert *conv) conv->dither = NULL; } +static bool need_dither(uint32_t format) +{ + switch (format) { + case SPA_AUDIO_FORMAT_U8: + case SPA_AUDIO_FORMAT_U8P: + case SPA_AUDIO_FORMAT_S8: + case SPA_AUDIO_FORMAT_S8P: + case SPA_AUDIO_FORMAT_ULAW: + case SPA_AUDIO_FORMAT_ALAW: + case SPA_AUDIO_FORMAT_S16P: + case SPA_AUDIO_FORMAT_S16: + case SPA_AUDIO_FORMAT_S16_OE: + return true; + } + return false; +} + int convert_init(struct convert *conv) { const struct conv_info *info; - uint32_t i, shift, dither_flags; + uint32_t i, dither_flags; - shift = 24u - SPA_MIN(conv->quantize, 24u); - shift += conv->noise; + conv->scale = 1.0f / (float)(INT32_MAX >> conv->noise); - conv->mask = (1ULL << (shift + 1)) - 1; - conv->offset = shift < 32 ? -(1ULL << shift) : 0; - conv->bias = shift > 0 ? 
1 << (shift - 1) : 0; + /* disable dither if not needed */ + if (!need_dither(conv->dst_fmt)) + conv->method = DITHER_METHOD_NONE; + + /* don't use shaped for too low rates, it moves the noise to + * audible ranges */ + if (conv->method == DITHER_METHOD_SHAPED_5 && conv->rate < 32000) + conv->method = DITHER_METHOD_TRIANGULAR; dither_flags = 0; if (conv->method != DITHER_METHOD_NONE || conv->noise) diff --git a/spa/plugins/audioconvert/fmt-ops.h b/spa/plugins/audioconvert/fmt-ops.h index f4373a122..0cb990ded 100644 --- a/spa/plugins/audioconvert/fmt-ops.h +++ b/spa/plugins/audioconvert/fmt-ops.h @@ -37,72 +37,92 @@ #define FMT_OPS_MAX_ALIGN 32 -#define U8_MIN 0 -#define U8_MAX 255 -#define U8_SCALE 127.5f -#define U8_OFFS 128 -#define U8_TO_F32(v) ((((uint8_t)(v)) * (1.0f / U8_OFFS)) - 1.0) -#define F32_TO_U8(v) (uint8_t)((SPA_CLAMP(v, -1.0f, 1.0f) * U8_SCALE) + U8_OFFS) +#define U8_MIN 0u +#define U8_MAX 255u +#define U8_SCALE 127.5f +#define U8_OFFS 128.f +#define U8_TO_F32(v) ((((uint8_t)(v)) * (1.0f / U8_OFFS)) - 1.0) +#define F32_TO_U8(v) (uint8_t)SPA_CLAMP((v) * U8_SCALE + U8_OFFS, U8_MIN, U8_MAX) +#define F32_TO_U8_D(v,d) (uint8_t)SPA_CLAMP((v) * U8_SCALE + U8_OFFS + (d), U8_MIN, U8_MAX) -#define S8_MIN -127 -#define S8_MAX 127 -#define S8_MAX_F 127.0f -#define S8_SCALE 127.0f -#define S8_TO_F32(v) (((int8_t)(v)) * (1.0f / S8_SCALE)) -#define F32_TO_S8(v) (int8_t)(SPA_CLAMP(v, -1.0f, 1.0f) * S8_SCALE) +#define S8_MIN -127 +#define S8_MAX 127 +#define S8_MAX_F 127.0f +#define S8_SCALE 127.0f +#define S8_TO_F32(v) (((int8_t)(v)) * (1.0f / S8_SCALE)) +#define F32_TO_S8(v) (int8_t)SPA_CLAMP((v) * S8_SCALE, S8_MIN, S8_MAX) +#define F32_TO_S8_D(v,d) (int8_t)SPA_CLAMP((v) * S8_SCALE + (d), S8_MIN, S8_MAX) -#define U16_MIN 0 -#define U16_MAX 65535 -#define U16_SCALE 32767.5f -#define U16_OFFS 32768 -#define U16_TO_F32(v) ((((uint16_t)(v)) * (1.0f / U16_OFFS)) - 1.0) -#define U16S_TO_F32(v) (((uint16_t)bswap_16((uint16_t)(v)) * (1.0f / U16_OFFS)) - 1.0) -#define 
F32_TO_U16(v) (uint16_t)((SPA_CLAMP(v, -1.0f, 1.0f) * U16_SCALE) + U16_OFFS) -#define F32_TO_U16S(v) ((uint16_t)bswap_16((uint16_t)((SPA_CLAMP(v, -1.0f, 1.0f) * U16_SCALE) + U16_OFFS))) +#define U16_MIN 0u +#define U16_MAX 65535u +#define U16_SCALE 32767.5f +#define U16_OFFS 32768.f +#define U16_TO_F32(v) ((((uint16_t)(v)) * (1.0f / U16_OFFS)) - 1.0) +#define U16S_TO_F32(v) (((uint16_t)bswap_16((uint16_t)(v)) * (1.0f / U16_OFFS)) - 1.0) +#define F32_TO_U16(v) (uint16_t)SPA_CLAMP((v) * U16_SCALE + U16_OFFS, U16_MIN, U16_MAX) +#define F32_TO_U16_D(v,d) (uint16_t)SPA_CLAMP((v) * U16_SCALE + U16_OFFS + (d), U16_MIN, U16_MAX) +#define F32_TO_U16S(v) bswap_16(F32_TO_U16(v)) +#define F32_TO_U16S_D(v,d) bswap_16(F32_TO_U16_D(v,d)) -#define S16_MIN -32767 -#define S16_MAX 32767 -#define S16_MAX_F 32767.0f -#define S16_SCALE 32767.0f -#define S16_TO_F32(v) (((int16_t)(v)) * (1.0f / S16_SCALE)) -#define S16S_TO_F32(v) (((int16_t)bswap_16((uint16_t)v)) * (1.0f / S16_SCALE)) -#define F32_TO_S16(v) (int16_t)(SPA_CLAMP(v, -1.0f, 1.0f) * S16_SCALE) -#define F32_TO_S16S(v) ((int16_t)bswap_16((uint16_t)(SPA_CLAMP(v, -1.0f, 1.0f) * S16_SCALE))) +#define S16_MIN -32767 +#define S16_MAX 32767 +#define S16_MAX_F 32767.0f +#define S16_SCALE 32767.0f +#define S16_TO_F32(v) (((int16_t)(v)) * (1.0f / S16_SCALE)) +#define S16S_TO_F32(v) (((int16_t)bswap_16(v)) * (1.0f / S16_SCALE)) +#define F32_TO_S16(v) (int16_t)SPA_CLAMP((v) * S16_SCALE, S16_MIN, S16_MAX) +#define F32_TO_S16_D(v,d) (int16_t)SPA_CLAMP((v) * S16_SCALE + (d), S16_MIN, S16_MAX) +#define F32_TO_S16S(v) bswap_16(F32_TO_S16(v)) +#define F32_TO_S16S_D(v,d) bswap_16(F32_TO_S16_D(v,d)) -#define U24_MIN 0 -#define U24_MAX 16777215 -#define U24_SCALE 8388607.5f -#define U24_OFFS 8388608 -#define U24_TO_F32(v) ((((uint32_t)(v)) * (1.0f / U24_OFFS)) - 1.0) -#define F32_TO_U24(v) (uint32_t)((SPA_CLAMP(v, -1.0f, 1.0f) * U24_SCALE) + U24_OFFS) +#define U24_MIN 0u +#define U24_MAX 16777215u +#define U24_SCALE 8388607.5f +#define U24_OFFS 
8388608.f +#define U24_TO_F32(v) ((((uint32_t)(v)) * (1.0f / U24_OFFS)) - 1.0) +#define F32_TO_U24(v) (uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS, U24_MIN, U24_MAX) +#define F32_TO_U24_D(v,d) (uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS + (d), U24_MIN, U24_MAX) -#define S24_MIN -8388607 -#define S24_MAX 8388607 -#define S24_MAX_F 8388607.0f -#define S24_SCALE 8388607.0f -#define S24_TO_F32(v) (((int32_t)(v)) * (1.0f / S24_SCALE)) -#define F32_TO_S24(v) (int32_t)(SPA_CLAMP(v, -1.0f, 1.0f) * S24_SCALE) +#define S24_MIN -8388607 +#define S24_MAX 8388607 +#define S24_MAX_F 8388607.0f +#define S24_SCALE 8388607.0f +#define S24_TO_F32(v) (((int32_t)(v)) * (1.0f / S24_SCALE)) +#define F32_TO_S24(v) (int32_t)SPA_CLAMP((v) * S24_SCALE, S24_MIN, S24_MAX) +#define F32_TO_S24_D(v,d) (int32_t)SPA_CLAMP((v) * S24_SCALE + (d), S24_MIN, S24_MAX) -#define U32_TO_F32(v) U24_TO_F32(((uint32_t)(v)) >> 8) -#define F32_TO_U32(v) (F32_TO_U24(v) << 8) +#define U32_MIN 0u +#define U32_MAX 4294967040u +#define U32_SCALE 2147483520.f +#define U32_OFFS 2147483520.f +#define U32_TO_F32(v) ((((uint32_t)(v)) * (1.0f / U32_OFFS)) - 1.0) +#define F32_TO_U32(v) (uint32_t)SPA_CLAMP((v) * U32_SCALE + U32_OFFS, U32_MIN, U32_MAX) +#define F32_TO_U32_D(v,d) (uint32_t)SPA_CLAMP((v) * U32_SCALE + U32_OFFS + (d), U32_MIN, U32_MAX) -#define S32_SCALE 2147483648.0f -#define S32_MIN 2147483520.0f - -#define S32_TO_F32(v) S24_TO_F32(((int32_t)(v)) >> 8) -#define S32S_TO_F32(v) S24_TO_F32(((int32_t)bswap_32(v)) >> 8) -#define F32_TO_S32(v) (F32_TO_S24(v) << 8) -#define F32_TO_S32S(v) bswap_32((F32_TO_S24(v) << 8)) +#define S32_MIN -2147483520 +#define S32_MAX 2147483520 +#define S32_MAX_F 2147483520.f +#define S32_SCALE 2147483648.f +#define S32_TO_F32(v) (((int32_t)(v)) * (1.0f / S32_SCALE)) +#define S32S_TO_F32(v) (((int32_t)bswap_32(v)) * (1.0f / S32_SCALE)) +#define F32_TO_S32(v) (int32_t)SPA_CLAMP((v) * S32_SCALE, S32_MIN, S32_MAX) +#define F32_TO_S32_D(v,d) (int32_t)SPA_CLAMP((v) * S32_SCALE + (d), 
S32_MIN, S32_MAX) +#define F32_TO_S32S(v) bswap_32(F32_TO_S32(v)) +#define F32_TO_S32S_D(v,d) bswap_32(F32_TO_S32_D(v,d)) #define U24_32_TO_F32(v) U32_TO_F32((v)<<8) #define U24_32S_TO_F32(v) U32_TO_F32(((int32_t)bswap_32(v))<<8) #define F32_TO_U24_32(v) F32_TO_U24(v) #define F32_TO_U24_32S(v) bswap_32(F32_TO_U24(v)) +#define F32_TO_U24_32_D(v,d) F32_TO_U24_D(v,d) +#define F32_TO_U24_32S_D(v,d) bswap_32(F32_TO_U24_D(v,d)) #define S24_32_TO_F32(v) S32_TO_F32((v)<<8) #define S24_32S_TO_F32(v) S32_TO_F32(((int32_t)bswap_32(v))<<8) #define F32_TO_S24_32(v) F32_TO_S24(v) #define F32_TO_S24_32S(v) bswap_32(F32_TO_S24(v)) +#define F32_TO_S24_32_D(v,d) F32_TO_S24_D(v,d) +#define F32_TO_S24_32S_D(v,d) bswap_32(F32_TO_S24_D(v,d)) static inline uint32_t read_u24(const void *src) { @@ -190,16 +210,15 @@ struct convert { uint32_t src_fmt; uint32_t dst_fmt; uint32_t n_channels; + uint32_t rate; uint32_t cpu_flags; const char *func_name; unsigned int is_passthrough:1; - int32_t bias; - int32_t offset; - uint32_t mask; + float scale; uint32_t random[16 + FMT_OPS_MAX_ALIGN/4]; - int32_t *dither; + float *dither; uint32_t dither_size; float ns_data[MAX_NS]; @@ -295,13 +314,17 @@ DEFINE_FUNCTION(f64_to_f32d, c); DEFINE_FUNCTION(f64s_to_f32d, c); DEFINE_FUNCTION(f64d_to_f32, c); DEFINE_FUNCTION(f32d_to_u8d, c); +DEFINE_FUNCTION(f32d_to_u8d_dither, c); DEFINE_FUNCTION(f32_to_u8, c); DEFINE_FUNCTION(f32_to_u8d, c); DEFINE_FUNCTION(f32d_to_u8, c); +DEFINE_FUNCTION(f32d_to_u8_dither, c); DEFINE_FUNCTION(f32d_to_s8d, c); +DEFINE_FUNCTION(f32d_to_s8d_dither, c); DEFINE_FUNCTION(f32_to_s8, c); DEFINE_FUNCTION(f32_to_s8d, c); DEFINE_FUNCTION(f32d_to_s8, c); +DEFINE_FUNCTION(f32d_to_s8_dither, c); DEFINE_FUNCTION(f32d_to_alaw, c); DEFINE_FUNCTION(f32d_to_ulaw, c); DEFINE_FUNCTION(f32_to_u16, c); @@ -375,6 +398,7 @@ DEFINE_FUNCTION(s16_to_f32d, sse2); DEFINE_FUNCTION(s24_to_f32d, sse2); DEFINE_FUNCTION(s32_to_f32d, sse2); DEFINE_FUNCTION(f32d_to_s32, sse2); +DEFINE_FUNCTION(f32d_to_s32_dither, 
sse2); DEFINE_FUNCTION(f32_to_s16, sse2); DEFINE_FUNCTION(f32d_to_s16_2, sse2); DEFINE_FUNCTION(f32d_to_s16, sse2);