From d4e39210a331b4422e33c0d692aa281087f4febd Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 6 Oct 2025 15:31:11 +0300 Subject: [PATCH] color-lcms: optimize build_3d_lut() Previously, inverse_evaluate_lut1d was called 3 * len^3 times. Now it is called only 3 * len times with the help of pre-computed arrays for red and green channels. The blue channel does not need an array, because its value is computed only once per iteration of the outermost loop, so there are no redundant computations to avoid. This is a significant win. Also, allocate a temporary array rgb_in, so cmsDoTransform() can be called in batches of len triplets. This did not seem to be as big a win. I tried running cmsDoTransform() over len^2 triplets, but that did not seem to improve performance. The test I used creates two 3D LUTs, hence two timings for a single run. My representative timing test results per 3D LUT: - before: 16 ms and 19 ms - after: 7 ms and 10 ms The measurements were done with this patch: static bool xform_to_shaper_plus_3dlut(struct weston_color_transform *xform_base, uint32_t len_shaper, float *shaper, uint32_t len_lut3d, float *lut3d) { struct cmlcms_color_transform *xform = to_cmlcms_xform(xform_base); struct weston_compositor *compositor = xform_base->cm->compositor; bool ret; + struct timespec begin, end; + unsigned i; - ret = build_shaper(xform->lcms_ctx, xform->cmap_3dlut, + clock_gettime(CLOCK_MONOTONIC, &begin); + for (i = 0; i < 100; i++) + ret = build_shaper(xform->lcms_ctx, xform->cmap_3dlut, len_shaper, shaper); if (!ret) return false; - ret = build_3d_lut(compositor, xform->cmap_3dlut, + for (i = 0; i < 100; i++) + ret = build_3d_lut(compositor, xform->cmap_3dlut, len_shaper, shaper, len_lut3d, lut3d); if (!ret) return false; + clock_gettime(CLOCK_MONOTONIC, &end); + fprintf(stderr, "%s: %" PRId64 " ms\n", __func__, timespec_sub_to_msec(&end, &begin)); return true; } Using this command: $ ./tests/test-color-icc-output -f 8 opaque_pixel_conversion Signed-off-by: Pekka Paalanen --- libweston/color-lcms/color-transform.c | 105 
++++++++++++++++++------- 1 file changed, 75 insertions(+), 30 deletions(-) diff --git a/libweston/color-lcms/color-transform.c b/libweston/color-lcms/color-transform.c index 15cf3402d..9c7bec59d 100644 --- a/libweston/color-lcms/color-transform.c +++ b/libweston/color-lcms/color-transform.c @@ -1705,45 +1705,90 @@ build_3d_lut(struct weston_compositor *compositor, cmsHTRANSFORM cmap_3dlut, unsigned int len_shaper, const float *shaper, unsigned int len_lut3d, float *lut3d) { - float divider = len_lut3d - 1; - float rgb_in[3], rgb_out[3]; - uint32_t index, index_r, index_g, index_b; - const float *curves[3]; + const float *const red_curve = &shaper[0]; + const float *const green_curve = &shaper[len_shaper]; + const float *const blue_curve = &shaper[2 * len_shaper]; + uint32_t index_r, index_g, index_b; + uint32_t i; + float *tmp; + float *inverse_r; + float *inverse_g; + struct weston_vec3f *rgb_in; - curves[0] = &shaper[0]; - curves[1] = &shaper[len_shaper]; - curves[2] = &shaper[2 * len_shaper]; + /* + * Ensure the indices and byte counts cannot overflow, + * and memory usage does not get ridiculous. Arbitrary limit. + */ + weston_assert_u32_lt(compositor, len_lut3d, 100); + /* + * A temporary allocation that holds two 1D LUTs of length len_lut3d + * and one scratch array of vec3f of length len_lut3d. + */ + const uint32_t bytes_per_elem = 2 * sizeof (float) + sizeof *rgb_in; + tmp = malloc(len_lut3d * bytes_per_elem); + + inverse_r = &tmp[0]; + inverse_g = &tmp[len_lut3d]; + rgb_in = (struct weston_vec3f *)&tmp[2 * len_lut3d]; + + /* + * For each channel, use the shaper to compute the value x such that + * y(x) = index / (len - 1). As the shaper is a LUT, we find the closest + * neighbors of such point (x, y) and then use linear interpolation to + * estimate x. 
+ */ + for (i = 0; i < len_lut3d; i++) { + float y = (float)i / (len_lut3d - 1); + inverse_r[i] = weston_inverse_evaluate_lut1d(compositor, + len_shaper, + red_curve, + y); + inverse_g[i] = weston_inverse_evaluate_lut1d(compositor, + len_shaper, + green_curve, + y); + } + + /* + * Fill in the 3D LUT: LUT(Rin, Gin, Bin) = { Rout, Gout, Bout } + * Each of Rin, Gin and Bin varies from 0.0 to 1.0. The range [0.0, 1.0] + * is evenly divided into len_lut3d number of sampling points. The + * indices of the sampling points are index_r, index_g, index_b. + * + * To compute { Rout, Gout, Bout }, first Rin, Gin, Bin must go through + * the shaper 1D LUTs in reverse. This was pre-computed into + * inverse_r and inverse_g above, and inverse_b is computed below. + * This was done one dimension (channel) at a time, because they are + * separable. + * + * The next step is not separable, so we iterate through all points in + * the 3D volume. The points are transformed len_lut3d points at a time + * (rgb_in array) to strike a balance between the number of function + * calls and the memory requirements. + */ for (index_b = 0; index_b < len_lut3d; index_b++) { + float inverse_b = weston_inverse_evaluate_lut1d(compositor, + len_shaper, + blue_curve, + (float)index_b / (len_lut3d - 1)); + for (i = 0; i < len_lut3d; i++) + rgb_in[i].b = inverse_b; + for (index_g = 0; index_g < len_lut3d; index_g++) { for (index_r = 0; index_r < len_lut3d; index_r++) { - /** - * For each channel, use the shaper to compute - * the value x such that y(x) = index / divider. - * As the shapper is a LUT, we find the closest - * neighbors of such point (x, y) and then use - * linear interpolation to estimate x. 
- */ - rgb_in[0] = weston_inverse_evaluate_lut1d(compositor,len_shaper, - curves[0], - (float)index_r / divider); - rgb_in[1] = weston_inverse_evaluate_lut1d(compositor, len_shaper, - curves[1], - (float)index_g / divider); - rgb_in[2] = weston_inverse_evaluate_lut1d(compositor, len_shaper, - curves[2], - (float)index_b / divider); - - cmsDoTransform(cmap_3dlut, rgb_in, rgb_out, 1); - - index = 3 * (index_r + len_lut3d * (index_g + len_lut3d * index_b)); - lut3d[index ] = rgb_out[0]; - lut3d[index + 1] = rgb_out[1]; - lut3d[index + 2] = rgb_out[2]; + rgb_in[index_r].g = inverse_g[index_g]; + rgb_in[index_r].r = inverse_r[index_r]; } + + index_r = 0; + i = 3 * (index_r + len_lut3d * (index_g + len_lut3d * index_b)); + cmsDoTransform(cmap_3dlut, rgb_in, &lut3d[i], len_lut3d); } } + free(tmp); + return true; }