color-lcms: optimize build_3d_lut()

Previously, inverse_evaluate_lut1d was called 3 * len^3 times. Now it is called only 3 * len times with the help of pre-computed arrays for red and green channels. Blue channel does not need an array, because there were no redundant computations. This is a significant win. Also, allocate a temporary array rgb_in, so cmsDoTransform() can be called in batches of len triplets. This seemed to be not that big win. I tried running cmsDoTransform() over len^2 triplets, but that did not seem to improve performance. The test I used creates two 3D LUTs, hence two times for a single run. My representative timing test results per one 3D LUT: - before: 16 ms and 19 ms - after: 7 ms and 10 ms The measurements were done with this patch: static bool xform_to_shaper_plus_3dlut(struct weston_color_transform *xform_base, uint32_t len_shaper, float *shaper, uint32_t len_lut3d, float *lut3d) { struct cmlcms_color_transform *xform = to_cmlcms_xform(xform_base); struct weston_compositor *compositor = xform_base->cm->compositor; bool ret; + struct timespec begin, end; + unsigned i; - ret = build_shaper(xform->lcms_ctx, xform->cmap_3dlut, + clock_gettime(CLOCK_MONOTONIC, &begin); + for (i = 0; i < 100; i++) + ret = build_shaper(xform->lcms_ctx, xform->cmap_3dlut, len_shaper, shaper); if (!ret) return false; - ret = build_3d_lut(compositor, xform->cmap_3dlut, + for (i = 0; i < 100; i++) + ret = build_3d_lut(compositor, xform->cmap_3dlut, len_shaper, shaper, len_lut3d, lut3d); if (!ret) return false; + clock_gettime(CLOCK_MONOTONIC, &end); + fprintf(stderr, "%s: %" PRId64 " ms\n", __func__, timespec_sub_to_msec(&end, &begin)); return true; } Using this command: $ ./tests/test-color-icc-output -f 8 opaque_pixel_conversion Signed-off-by: Pekka Paalanen <pekka.paalanen@collabora.com>
2026-02-06 01:10:28 +01:00 · 2025-10-06 15:31:11 +03:00 · 2025-10-06 15:31:11 +03:00 · d4e39210a3
commit d4e39210a3
parent f69bb08738
1 changed files with 75 additions and 30 deletions
--- a/libweston/color-lcms/color-transform.c
+++ b/libweston/color-lcms/color-transform.c
@ -1705,45 +1705,90 @@ build_3d_lut(struct weston_compositor *compositor, cmsHTRANSFORM cmap_3dlut,
 	     unsigned int len_shaper, const float *shaper,
 	     unsigned int len_lut3d, float *lut3d)
 {
-	float divider = len_lut3d - 1;
-	float rgb_in[3], rgb_out[3];
-	uint32_t index, index_r, index_g, index_b;
-	const float *curves[3];
+	const float *const red_curve = &shaper[0];
+	const float *const green_curve = &shaper[len_shaper];
+	const float *const blue_curve = &shaper[2 * len_shaper];
+	uint32_t index_r, index_g, index_b;
+	uint32_t i;
+	float *tmp;
+	float *inverse_r;
+	float *inverse_g;
+	struct weston_vec3f *rgb_in;

-	curves[0] = &shaper[0];
-	curves[1] = &shaper[len_shaper];
-	curves[2] = &shaper[2 * len_shaper];
+	/*
+	 * Ensure the indices and byte counts cannot overflow,
+	 * and memory usage does not get ridiculous. Arbitrary limit.
+	 */
+	weston_assert_u32_lt(compositor, len_lut3d, 100);

+	/*
+	 * A temporary allocation that holds two 1D LUTs of length len_lut3d
+	 * and one scratch array of vec3f of length len_lut3d.
+	 */
+	const uint32_t bytes_per_elem = 2 * sizeof (float) + sizeof *rgb_in;
+	tmp = malloc(len_lut3d * bytes_per_elem);
+
+	inverse_r = &tmp[0];
+	inverse_g = &tmp[len_lut3d];
+	rgb_in = (struct weston_vec3f *)&tmp[2 * len_lut3d];
+
+	/*
+	 * For each channel, use the shaper to compute the value x such that
+	 * y(x) = index / (len - 1). As the shaper is a LUT, we find the closest
+	 * neighbors of such point (x, y) and then use linear interpolation to
+	 * estimate x.
+	 */
+	for (i = 0; i < len_lut3d; i++) {
+		float y = (float)i / (len_lut3d - 1);
+		inverse_r[i] = weston_inverse_evaluate_lut1d(compositor,
+							     len_shaper,
+							     red_curve,
+							     y);
+		inverse_g[i] = weston_inverse_evaluate_lut1d(compositor,
+							     len_shaper,
+							     green_curve,
+							     y);
+	}
+
+	/*
+	 * Fill in the 3D LUT: LUT(Rin, Gin, Bin) = { Rout, Gout, Bout }
+	 * Each of Rin, Gin and Bin varies from 0.0 to 1.0. The range [0.0, 1.0]
+	 * is evenly divided into len_lut3d number of sampling points. The
+	 * indices of the sampling points are index_r, index_g, index_b.
+	 *
+	 * To compute { Rout, Gout, Bout }, first Rin, Gin, Bin must go through
+	 * the shaper 1D LUTs in reverse. This was pre-computed into
+	 * inverse_r and inverse_g above, and inverse_b is computed below.
+	 * This was done one dimension (channel) at a time, because they are
+	 * separable.
+	 *
+	 * The next step is not separable, so we iterate through all points in
+	 * the 3D volume. The points are transformed len_lut3d points at a time
+	 * (rgb_in array) to strike a balance between the number of function
+	 * calls and the memory requirements.
+	 */
 	for (index_b = 0; index_b < len_lut3d; index_b++) {
+		float inverse_b = weston_inverse_evaluate_lut1d(compositor,
+								len_shaper,
+								blue_curve,
+								(float)index_b / (len_lut3d - 1));
+		for (i = 0; i < len_lut3d; i++)
+			rgb_in[i].b = inverse_b;
+
 		for (index_g = 0; index_g < len_lut3d; index_g++) {
 			for (index_r = 0; index_r < len_lut3d; index_r++) {
-				/**
-				 * For each channel, use the shaper to compute
-				 * the value x such that y(x) = index / divider.
-				 * As the shapper is a LUT, we find the closest
-				 * neighbors of such point (x, y) and then use
-				 * linear interpolation to estimate x.
-				 */
-				rgb_in[0] = weston_inverse_evaluate_lut1d(compositor,len_shaper,
-									  curves[0],
-									  (float)index_r / divider);
-				rgb_in[1] = weston_inverse_evaluate_lut1d(compositor, len_shaper,
-									  curves[1],
-									  (float)index_g / divider);
-				rgb_in[2] = weston_inverse_evaluate_lut1d(compositor, len_shaper,
-									  curves[2],
-									  (float)index_b / divider);
-
-				cmsDoTransform(cmap_3dlut, rgb_in, rgb_out, 1);
-
-				index = 3 * (index_r + len_lut3d * (index_g + len_lut3d * index_b));
-				lut3d[index    ] = rgb_out[0];
-				lut3d[index + 1] = rgb_out[1];
-				lut3d[index + 2] = rgb_out[2];
+				rgb_in[index_r].g = inverse_g[index_g];
+				rgb_in[index_r].r = inverse_r[index_r];
 			}
+
+			index_r = 0;
+			i = 3 * (index_r + len_lut3d * (index_g + len_lut3d * index_b));
+			cmsDoTransform(cmap_3dlut, rgb_in, &lut3d[i], len_lut3d);
 		}
 	}

+	free(tmp);
+
 	return true;
 }