amd/vpelib: Multiple instance support in caching framework

Generalize the caching to work with multiple instances of objects.
Change some static functions to public functions to maximize function
re-use possibilities.

Reviewed-by: Roy Chan <Roy.Chan@amd.com>
Acked-by: Chih-Wei Chien <Chih-Wei.Chien@amd.com>
Signed-off-by: Brendan <brendanSteve.leder@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31605>
This commit is contained in:
Brendan 2024-06-27 14:56:54 -04:00 committed by Marge Bot
parent 7a293a812a
commit ded1a2b3f0
15 changed files with 222 additions and 158 deletions

View file

@ -887,6 +887,69 @@ void vpe10_dpp_set_segment_scaler(struct dpp *dpp, const struct scaler_data *scl
void vpe10_dpp_set_frame_scaler(struct dpp *dpp, const struct scaler_data *scl_data);
/* Scaler helper functions */
/* Selects which DSCL coefficient set vpe10_dpp_dscl_set_scaler_filter()
 * programs: the plane (luma/chroma/alpha) and direction (vertical/horizontal)
 * of the filter coefficients.
 */
enum vpe10_coef_filter_type_sel {
SCL_COEF_LUMA_VERT_FILTER = 0,
SCL_COEF_LUMA_HORZ_FILTER = 1,
SCL_COEF_CHROMA_VERT_FILTER = 2,
SCL_COEF_CHROMA_HORZ_FILTER = 3,
SCL_COEF_ALPHA_VERT_FILTER = 4,
SCL_COEF_ALPHA_HORZ_FILTER = 5,
};
/* Auto-calibration modes for the DSCL scaler; each mode constrains the value
 * that must be written to DSCL_MODE_SEL (see the per-constant notes below).
 */
enum vpe10_dscl_autocal_mode {
AUTOCAL_MODE_OFF = 0,
/* Autocal calculate the scaling ratio and initial phase and the
 * DSCL_MODE_SEL must be set to 1
 */
AUTOCAL_MODE_AUTOSCALE = 1,
/* Autocal perform auto centering without replication and the
 * DSCL_MODE_SEL must be set to 0
 */
AUTOCAL_MODE_AUTOCENTER = 2,
/* Autocal perform auto centering and auto replication and the
 * DSCL_MODE_SEL must be set to 0
 */
AUTOCAL_MODE_AUTOREPLICATE = 3
};
/* DSCL scaling mode selection. Computed from the scaler data by
 * vpe10_dpp_dscl_get_dscl_mode() and programmed via
 * vpe10_dpp_dscl_set_dscl_mode(). DSCL_MODE_DSCL_BYPASS indicates the scaler
 * can be powered off entirely.
 */
enum vpe10_dscl_mode_sel {
DSCL_MODE_SCALING_444_BYPASS = 0,
DSCL_MODE_SCALING_444_RGB_ENABLE = 1,
DSCL_MODE_SCALING_444_YCBCR_ENABLE = 2,
DSCL_MODE_SCALING_420_YCBCR_ENABLE = 3,
DSCL_MODE_SCALING_420_LUMA_BYPASS = 4,
DSCL_MODE_SCALING_420_CHROMA_BYPASS = 5,
DSCL_MODE_DSCL_BYPASS = 6
};
void vpe10_dpp_dscl_set_h_blank(struct dpp *dpp, uint16_t start, uint16_t end);
void vpe10_dpp_dscl_set_v_blank(struct dpp *dpp, uint16_t start, uint16_t end);
void vpe10_dpp_power_on_dscl(struct dpp *dpp, bool power_on);
void vpe10_dpp_dscl_set_lb(struct dpp *dpp, const struct line_buffer_params *lb_params,
enum lb_memory_config mem_size_config);
void vpe10_dpp_dscl_set_scale_ratio(struct dpp *dpp, const struct scaler_data *data);
void vpe10_dpp_dscl_set_taps(struct dpp *dpp, const struct scaler_data *scl_data);
void vpe10_dpp_dscl_set_scl_filter(struct dpp *dpp, const struct scaler_data *scl_data,
enum vpe10_dscl_mode_sel scl_mode, bool chroma_coef_mode);
void vpe10_dpp_dscl_set_dscl_mode(struct dpp *dpp, enum vpe10_dscl_mode_sel dscl_mode);
enum vpe10_dscl_mode_sel vpe10_dpp_dscl_get_dscl_mode(const struct scaler_data *data);
void vpe10_dpp_dscl_set_scaler_filter(struct dpp *dpp, uint32_t taps,
enum vpe10_coef_filter_type_sel filter_type, const uint16_t *filter);
bool vpe10_dpp_dscl_is_ycbcr(const enum vpe_surface_pixel_format format);
void vpe10_dpp_program_gamcor_lut(struct dpp *dpp, const struct pwl_params *params);
uint32_t vpe10_get_line_buffer_size(void);
bool vpe10_dpp_validate_number_of_taps(struct dpp *dpp, struct scaler_data *scl_data);

View file

@ -233,7 +233,7 @@ bool vpe10_cm_helper_translate_curve_to_hw_format(
uint32_t j, k, seg_distr[MAX_REGIONS_NUMBER], increment, start_index, hw_points;
if (output_tf == NULL || lut_params == NULL || output_tf->type == TF_TYPE_BYPASS ||
!output_tf->dirty)
(!output_tf->dirty && (lut_params->hw_points_num != 0)))
return false;
corner_points = lut_params->corner_points;
@ -419,7 +419,7 @@ bool vpe10_cm_helper_translate_curve_to_degamma_hw_format(
uint32_t k, seg_distr[MAX_REGIONS_NUMBER_DEGAMMA], num_segments, hw_points;
if (output_tf == NULL || lut_params == NULL || output_tf->type == TF_TYPE_BYPASS ||
!output_tf->dirty)
(!output_tf->dirty && (lut_params->hw_points_num != 0)))
return false;
corner_points = lut_params->corner_points;

View file

@ -132,7 +132,7 @@ static void vpe10_dpp_program_gammcor_lut(
}
}
static void vpe10_dpp_program_gamcor_lut(struct dpp *dpp, const struct pwl_params *params)
void vpe10_dpp_program_gamcor_lut(struct dpp *dpp, const struct pwl_params *params)
{
struct vpe10_xfer_func_reg gam_regs = {0};
@ -204,7 +204,7 @@ void vpe10_dpp_program_input_transfer_func(struct dpp *dpp, struct transfer_func
bypass = ((input_tf->type == TF_TYPE_BYPASS) || dpp->vpe_priv->init.debug.bypass_gamcor);
CONFIG_CACHE(input_tf, stream_ctx, vpe_priv->init.debug.disable_lut_caching, bypass,
vpe10_dpp_program_gamcor_lut(dpp, params));
vpe10_dpp_program_gamcor_lut(dpp, params), dpp->inst);
}
void vpe10_dpp_program_gamut_remap(struct dpp *dpp, struct colorspace_transform *gamut_remap)

View file

@ -34,43 +34,7 @@
#define LB_MAX_PARTITION 12
enum vpe10_coef_filter_type_sel {
SCL_COEF_LUMA_VERT_FILTER = 0,
SCL_COEF_LUMA_HORZ_FILTER = 1,
SCL_COEF_CHROMA_VERT_FILTER = 2,
SCL_COEF_CHROMA_HORZ_FILTER = 3,
SCL_COEF_ALPHA_VERT_FILTER = 4,
SCL_COEF_ALPHA_HORZ_FILTER = 5
};
enum dscl_autocal_mode {
AUTOCAL_MODE_OFF = 0,
/* Autocal calculate the scaling ratio and initial phase and the
* DSCL_MODE_SEL must be set to 1
*/
AUTOCAL_MODE_AUTOSCALE = 1,
/* Autocal perform auto centering without replication and the
* DSCL_MODE_SEL must be set to 0
*/
AUTOCAL_MODE_AUTOCENTER = 2,
/* Autocal perform auto centering and auto replication and the
* DSCL_MODE_SEL must be set to 0
*/
AUTOCAL_MODE_AUTOREPLICATE = 3
};
enum dscl_mode_sel {
DSCL_MODE_SCALING_444_BYPASS = 0,
DSCL_MODE_SCALING_444_RGB_ENABLE = 1,
DSCL_MODE_SCALING_444_YCBCR_ENABLE = 2,
DSCL_MODE_SCALING_420_YCBCR_ENABLE = 3,
DSCL_MODE_SCALING_420_LUMA_BYPASS = 4,
DSCL_MODE_SCALING_420_CHROMA_BYPASS = 5,
DSCL_MODE_DSCL_BYPASS = 6
};
static bool dpp1_dscl_is_ycbcr(const enum vpe_surface_pixel_format format)
bool vpe10_dpp_dscl_is_ycbcr(const enum vpe_surface_pixel_format format)
{
return format >= VPE_SURFACE_PIXEL_FORMAT_VIDEO_BEGIN &&
format <= VPE_SURFACE_PIXEL_FORMAT_VIDEO_END;
@ -82,7 +46,7 @@ static bool dpp1_dscl_is_video_subsampled(const enum vpe_surface_pixel_format fo
format <= VPE_SURFACE_PIXEL_FORMAT_SUBSAMPLE_END);
}
static enum dscl_mode_sel dpp1_dscl_get_dscl_mode(const struct scaler_data *data)
enum vpe10_dscl_mode_sel vpe10_dpp_dscl_get_dscl_mode(const struct scaler_data *data)
{
// TODO Check if bypass bit enabled
@ -92,7 +56,7 @@ static enum dscl_mode_sel dpp1_dscl_get_dscl_mode(const struct scaler_data *data
data->ratios.horz_c.value == one && data->ratios.vert_c.value == one)
return DSCL_MODE_DSCL_BYPASS;
if (!dpp1_dscl_is_ycbcr(data->format))
if (!vpe10_dpp_dscl_is_ycbcr(data->format))
return DSCL_MODE_SCALING_444_RGB_ENABLE;
if (!dpp1_dscl_is_video_subsampled(data->format))
@ -104,7 +68,7 @@ static enum dscl_mode_sel dpp1_dscl_get_dscl_mode(const struct scaler_data *data
return DSCL_MODE_SCALING_420_YCBCR_ENABLE;
}
static void dpp1_dscl_set_dscl_mode(struct dpp *dpp, enum dscl_mode_sel dscl_mode)
void vpe10_dpp_dscl_set_dscl_mode(struct dpp *dpp, enum vpe10_dscl_mode_sel dscl_mode)
{
PROGRAM_ENTRY();
@ -130,21 +94,21 @@ static void dpp1_dscl_set_mpc_size(struct dpp *dpp, const struct scaler_data *sc
REG_SET_2(VPMPC_SIZE, 0, VPMPC_WIDTH, scl_data->h_active, VPMPC_HEIGHT, scl_data->v_active);
}
static void dpp1_dscl_set_h_blank(struct dpp *dpp, uint16_t start, uint16_t end)
/* Program the OTG horizontal blank start/end into VPOTG_H_BLANK.
 * Made public (previously static dpp1_dscl_set_h_blank) to allow re-use by
 * other HW generations.
 */
void vpe10_dpp_dscl_set_h_blank(struct dpp *dpp, uint16_t start, uint16_t end)
{
PROGRAM_ENTRY();
REG_SET_2(VPOTG_H_BLANK, 0, OTG_H_BLANK_END, end, OTG_H_BLANK_START, start);
}
static void dpp1_dscl_set_v_blank(struct dpp *dpp, uint16_t start, uint16_t end)
/* Program the OTG vertical blank start/end into VPOTG_V_BLANK.
 * Made public (previously static dpp1_dscl_set_v_blank) to allow re-use by
 * other HW generations.
 */
void vpe10_dpp_dscl_set_v_blank(struct dpp *dpp, uint16_t start, uint16_t end)
{
PROGRAM_ENTRY();
REG_SET_2(VPOTG_V_BLANK, 0, OTG_V_BLANK_END, end, OTG_V_BLANK_START, start);
}
static void dpp1_dscl_set_taps(struct dpp *dpp, const struct scaler_data *scl_data)
void vpe10_dpp_dscl_set_taps(struct dpp *dpp, const struct scaler_data *scl_data)
{
PROGRAM_ENTRY();
@ -172,7 +136,7 @@ static const uint16_t *dpp1_dscl_get_filter_coeffs_64p(int taps, struct fixed31_
}
}
static void dpp1_dscl_set_scaler_filter(struct dpp *dpp, uint32_t taps,
void vpe10_dpp_dscl_set_scaler_filter(struct dpp *dpp, uint32_t taps,
enum vpe10_coef_filter_type_sel filter_type, const uint16_t *filter)
{
const int tap_pairs = (taps + 1) / 2;
@ -206,8 +170,8 @@ static void dpp1_dscl_set_scaler_filter(struct dpp *dpp, uint32_t taps,
}
}
static void dpp1_dscl_set_scl_filter(struct dpp *dpp, const struct scaler_data *scl_data,
enum dscl_mode_sel scl_mode, bool chroma_coef_mode)
void vpe10_dpp_dscl_set_scl_filter(struct dpp *dpp, const struct scaler_data *scl_data,
enum vpe10_dscl_mode_sel scl_mode, bool chroma_coef_mode)
{
const uint16_t *filter_h = NULL;
@ -228,11 +192,11 @@ static void dpp1_dscl_set_scl_filter(struct dpp *dpp, const struct scaler_data *
filter_v = (const uint16_t *)&scl_data->polyphase_filter_coeffs->vert_polyphase_coeffs;
}
if (filter_h != NULL)
dpp1_dscl_set_scaler_filter(
vpe10_dpp_dscl_set_scaler_filter(
dpp, scl_data->taps.h_taps, SCL_COEF_LUMA_HORZ_FILTER, filter_h);
if (filter_v != NULL)
dpp1_dscl_set_scaler_filter(
vpe10_dpp_dscl_set_scaler_filter(
dpp, scl_data->taps.v_taps, SCL_COEF_LUMA_VERT_FILTER, filter_v);
if (chroma_coef_mode) {
@ -243,18 +207,18 @@ static void dpp1_dscl_set_scl_filter(struct dpp *dpp, const struct scaler_data *
dpp1_dscl_get_filter_coeffs_64p((int)scl_data->taps.v_taps_c, scl_data->ratios.vert_c);
if (filter_h_c != NULL)
dpp1_dscl_set_scaler_filter(
vpe10_dpp_dscl_set_scaler_filter(
dpp, scl_data->taps.h_taps_c, SCL_COEF_CHROMA_HORZ_FILTER, filter_h_c);
if (filter_v_c != NULL)
dpp1_dscl_set_scaler_filter(
vpe10_dpp_dscl_set_scaler_filter(
dpp, scl_data->taps.v_taps_c, SCL_COEF_CHROMA_VERT_FILTER, filter_v_c);
}
REG_UPDATE(VPDSCL_MODE, SCL_CHROMA_COEF_MODE, chroma_coef_mode);
}
static void dpp1_dscl_set_lb(struct dpp *dpp, const struct line_buffer_params *lb_params,
void vpe10_dpp_dscl_set_lb(struct dpp *dpp, const struct line_buffer_params *lb_params,
enum lb_memory_config mem_size_config)
{
@ -266,7 +230,7 @@ static void dpp1_dscl_set_lb(struct dpp *dpp, const struct line_buffer_params *l
VPLB_MEMORY_CTRL, 0, MEMORY_CONFIG, mem_size_config, LB_MAX_PARTITIONS, LB_MAX_PARTITION);
}
static void dpp1_dscl_set_scale_ratio(struct dpp *dpp, const struct scaler_data *data)
void vpe10_dpp_dscl_set_scale_ratio(struct dpp *dpp, const struct scaler_data *data)
{
PROGRAM_ENTRY();
@ -313,7 +277,7 @@ static void dpp1_dscl_set_scaler_position(struct dpp *dpp, const struct scaler_d
VPDSCL_VERT_FILTER_INIT_C, 0, SCL_V_INIT_FRAC_C, init_frac, SCL_V_INIT_INT_C, init_int);
}
static void dpp1_power_on_dscl(struct dpp *dpp, bool power_on)
void vpe10_dpp_power_on_dscl(struct dpp *dpp, bool power_on)
{
PROGRAM_ENTRY();
@ -346,7 +310,7 @@ static void dpp1_power_on_dscl(struct dpp *dpp, bool power_on)
void vpe10_dpp_set_segment_scaler(struct dpp *dpp, const struct scaler_data *scl_data)
{
enum dscl_mode_sel dscl_mode = dpp1_dscl_get_dscl_mode(scl_data);
enum vpe10_dscl_mode_sel dscl_mode = vpe10_dpp_dscl_get_dscl_mode(scl_data);
dpp1_dscl_set_recout(dpp, &scl_data->recout);
dpp1_dscl_set_mpc_size(dpp, scl_data);
@ -360,24 +324,24 @@ void vpe10_dpp_set_segment_scaler(struct dpp *dpp, const struct scaler_data *scl
void vpe10_dpp_set_frame_scaler(struct dpp *dpp, const struct scaler_data *scl_data)
{
enum dscl_mode_sel dscl_mode = dpp1_dscl_get_dscl_mode(scl_data);
bool ycbcr = dpp1_dscl_is_ycbcr(scl_data->format);
enum vpe10_dscl_mode_sel dscl_mode = vpe10_dpp_dscl_get_dscl_mode(scl_data);
bool ycbcr = vpe10_dpp_dscl_is_ycbcr(scl_data->format);
dpp1_dscl_set_h_blank(dpp, 1, 0);
dpp1_dscl_set_v_blank(dpp, 1, 0);
vpe10_dpp_dscl_set_h_blank(dpp, 1, 0);
vpe10_dpp_dscl_set_v_blank(dpp, 1, 0);
if (dscl_mode != DSCL_MODE_DSCL_BYPASS)
dpp1_power_on_dscl(dpp, true);
vpe10_dpp_power_on_dscl(dpp, true);
dpp1_dscl_set_dscl_mode(dpp, dscl_mode);
vpe10_dpp_dscl_set_dscl_mode(dpp, dscl_mode);
if (dscl_mode == DSCL_MODE_DSCL_BYPASS) {
dpp1_power_on_dscl(dpp, false);
vpe10_dpp_power_on_dscl(dpp, false);
return;
}
dpp1_dscl_set_lb(dpp, &scl_data->lb_params, LB_MEMORY_CONFIG_0);
dpp1_dscl_set_scale_ratio(dpp, scl_data);
dpp1_dscl_set_taps(dpp, scl_data);
dpp1_dscl_set_scl_filter(dpp, scl_data, dscl_mode, ycbcr);
vpe10_dpp_dscl_set_lb(dpp, &scl_data->lb_params, LB_MEMORY_CONFIG_0);
vpe10_dpp_dscl_set_scale_ratio(dpp, scl_data);
vpe10_dpp_dscl_set_taps(dpp, scl_data);
vpe10_dpp_dscl_set_scl_filter(dpp, scl_data, dscl_mode, ycbcr);
}

View file

@ -1263,12 +1263,12 @@ void vpe10_mpc_set_mpc_shaper_3dlut(
bypass = (!shaper_lut || (func_shaper && func_shaper->type == TF_TYPE_BYPASS));
CONFIG_CACHE(func_shaper, stream_ctx, vpe_priv->init.debug.disable_lut_caching, bypass,
mpc->funcs->program_shaper(mpc, shaper_lut));
mpc->funcs->program_shaper(mpc, shaper_lut), mpc->inst);
bypass = (!lut3d_func || !lut3d_func->state.bits.initialized);
lut3d_params = (bypass) ? (NULL) : (&lut3d_func->lut_3d);
CONFIG_CACHE(lut3d_func, stream_ctx, vpe_priv->init.debug.disable_lut_caching, bypass,
mpc->funcs->program_3dlut(mpc, lut3d_params));
mpc->funcs->program_3dlut(mpc, lut3d_params), mpc->inst);
return;
}
@ -1298,7 +1298,7 @@ void vpe10_mpc_set_output_transfer_func(struct mpc *mpc, struct output_ctx *outp
vpe_priv->init.debug.cm_in_bypass || vpe_priv->init.debug.bypass_ogam);
CONFIG_CACHE(output_ctx->output_tf, output_ctx, vpe_priv->init.debug.disable_lut_caching,
bypass, mpc->funcs->set_output_gamma(mpc, params));
bypass, mpc->funcs->set_output_gamma(mpc, params), mpc->inst);
}
void vpe10_mpc_set_blend_lut(struct mpc *mpc, struct transfer_func *blend_tf)
@ -1328,7 +1328,7 @@ void vpe10_mpc_set_blend_lut(struct mpc *mpc, struct transfer_func *blend_tf)
((!blend_tf) || (blend_tf->type == TF_TYPE_BYPASS) || vpe_priv->init.debug.bypass_blndgam);
CONFIG_CACHE(blend_tf, stream_ctx, vpe_priv->init.debug.disable_lut_caching, bypass,
mpc->funcs->program_1dlut(mpc, blend_lut, gamma_type));
mpc->funcs->program_1dlut(mpc, blend_lut, gamma_type), mpc->inst);
}
bool vpe10_mpc_program_movable_cm(struct mpc *mpc, struct transfer_func *func_shaper,

View file

@ -181,29 +181,33 @@ static bool color_update_regamma_tf(struct vpe_priv *vpe_priv,
break;
}
if (vpe_priv->init.debug.disable_lut_caching ||
(output_tf->cache_info.cm_gamma_type != output_tf->cm_gamma_type) ||
(output_tf->cache_info.tf != output_tf->tf) ||
(output_tf->cache_info.x_scale.value != x_scale.value) ||
(output_tf->cache_info.y_scale.value != y_scale.value) ||
(output_tf->cache_info.y_bias.value != y_bias.value)) {
// if gamma points have been previously generated,
// skip the re-gen no matter it was config cached or not
update = true;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
if (vpe_priv->init.debug.disable_lut_caching ||
(output_tf->cache_info[i].cm_gamma_type != output_tf->cm_gamma_type) ||
(output_tf->cache_info[i].tf != output_tf->tf) ||
(output_tf->cache_info[i].x_scale.value != x_scale.value) ||
(output_tf->cache_info[i].y_scale.value != y_scale.value) ||
(output_tf->cache_info[i].y_bias.value != y_bias.value)) {
// if gamma points have been previously generated,
// skip the re-gen no matter it was config cached or not
update = true;
}
}
if (update) {
ret = vpe_color_calculate_regamma_params(
vpe_priv, x_scale, y_scale, &vpe_priv->cal_buffer, output_tf);
if (ret) {
// reset the cache status and mark as dirty to let hw layer to re-cache
output_tf->dirty = true;
output_tf->config_cache.cached = false;
output_tf->cache_info.cm_gamma_type = output_tf->cm_gamma_type;
output_tf->cache_info.tf = output_tf->tf;
output_tf->cache_info.x_scale = x_scale;
output_tf->cache_info.y_scale = y_scale;
output_tf->cache_info.y_bias = y_bias;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
// reset the cache status and mark as dirty to let hw layer to re-cache
output_tf->dirty[i] = true;
output_tf->config_cache[i].cached = false;
output_tf->cache_info[i].cm_gamma_type = output_tf->cm_gamma_type;
output_tf->cache_info[i].tf = output_tf->tf;
output_tf->cache_info[i].x_scale = x_scale;
output_tf->cache_info[i].y_scale = y_scale;
output_tf->cache_info[i].y_bias = y_bias;
}
}
}
return ret;
@ -240,28 +244,32 @@ static bool color_update_degamma_tf(struct vpe_priv *vpe_priv,
break;
}
if (vpe_priv->init.debug.disable_lut_caching ||
(input_tf->cache_info.cm_gamma_type != input_tf->cm_gamma_type) ||
(input_tf->cache_info.tf != input_tf->tf) ||
(input_tf->cache_info.x_scale.value != x_scale.value) ||
(input_tf->cache_info.y_scale.value != y_scale.value) ||
(input_tf->cache_info.y_bias.value != y_bias.value)) {
// if gamma points have been previously generated,
// skip the re-gen no matter it was config cached or not
update = true;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
if (vpe_priv->init.debug.disable_lut_caching ||
(input_tf->cache_info[i].cm_gamma_type != input_tf->cm_gamma_type) ||
(input_tf->cache_info[i].tf != input_tf->tf) ||
(input_tf->cache_info[i].x_scale.value != x_scale.value) ||
(input_tf->cache_info[i].y_scale.value != y_scale.value) ||
(input_tf->cache_info[i].y_bias.value != y_bias.value)) {
// if gamma points have been previously generated,
// skip the re-gen no matter it was config cached or not
update = true;
}
}
if (update) {
ret = vpe_color_calculate_degamma_params(vpe_priv, x_scale, y_scale, input_tf);
if (ret) {
// reset the cache status and mark as dirty to let hw layer to re-cache
input_tf->dirty = true;
input_tf->config_cache.cached = false;
input_tf->cache_info.cm_gamma_type = input_tf->cm_gamma_type;
input_tf->cache_info.tf = color_input_tf;
input_tf->cache_info.x_scale = x_scale;
input_tf->cache_info.y_scale = y_scale;
input_tf->cache_info.y_bias = y_bias;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
// reset the cache status and mark as dirty to let hw layer to re-cache
input_tf->dirty[i] = true;
input_tf->config_cache[i].cached = false;
input_tf->cache_info[i].cm_gamma_type = input_tf->cm_gamma_type;
input_tf->cache_info[i].tf = color_input_tf;
input_tf->cache_info[i].x_scale = x_scale;
input_tf->cache_info[i].y_scale = y_scale;
input_tf->cache_info[i].y_bias = y_bias;
}
}
}
return ret;
@ -673,13 +681,22 @@ enum vpe_status vpe_color_update_3dlut(
if (!enable_3dlut) {
stream_ctx->lut3d_func->state.bits.initialized = 0;
} else {
if (vpe_priv->init.debug.disable_lut_caching ||
(stream_ctx->lut3d_func->cache_info.uid_3dlut != stream_ctx->stream.tm_params.UID)) {
bool update = false;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++)
if (vpe_priv->init.debug.disable_lut_caching ||
(stream_ctx->lut3d_func->cache_info[i].uid_3dlut !=
stream_ctx->stream.tm_params.UID))
update = true;
if (update) {
vpe_convert_to_tetrahedral(
vpe_priv, stream_ctx->stream.tm_params.lut_data, stream_ctx->lut3d_func);
stream_ctx->lut3d_func->dirty = true;
stream_ctx->lut3d_func->config_cache.cached = false;
stream_ctx->lut3d_func->cache_info.uid_3dlut = stream_ctx->stream.tm_params.UID;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
stream_ctx->lut3d_func->dirty[i] = true;
stream_ctx->lut3d_func->config_cache[i].cached = false;
stream_ctx->lut3d_func->cache_info[i].uid_3dlut = stream_ctx->stream.tm_params.UID;
}
}
stream_ctx->lut3d_func->state.bits.initialized = 1;
}
@ -812,10 +829,12 @@ enum vpe_status vpe_color_update_shaper(const struct vpe_priv *vpe_priv, uint16_
}
// right now shaper is always programmed with linear, once cached, it is always reused.
if (vpe_priv->init.debug.disable_lut_caching ||
(shaper_func && shaper_func->cache_info.tf != tf)) {
// if the caching has the required data cached, skip the update
update = true;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
if (vpe_priv->init.debug.disable_lut_caching ||
(shaper_func && shaper_func->cache_info[i].tf != tf)) {
// if the caching has the required data cached, skip the update
update = true;
}
}
shaper_func->type = TF_TYPE_HWPWL;
@ -829,9 +848,11 @@ enum vpe_status vpe_color_update_shaper(const struct vpe_priv *vpe_priv, uint16_
ret = vpe_build_shaper(&shaper_in, &shaper_func->pwl);
if (ret == VPE_STATUS_OK) {
shaper_func->dirty = true;
shaper_func->config_cache.cached = false;
shaper_func->cache_info.tf = tf;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
shaper_func->dirty[i] = true;
shaper_func->config_cache[i].cached = false;
shaper_func->cache_info[i].tf = tf;
}
}
}
return ret;

View file

@ -61,6 +61,7 @@ struct cdc_funcs {
/* CDC block instance. */
struct cdc {
struct vpe_priv *vpe_priv; /* back-pointer to the owning VPE private context */
struct cdc_funcs *funcs;   /* hw-generation-specific function hooks */
unsigned int inst;         /* instance index, used e.g. to select a per-instance config cache entry */
};
#ifdef __cplusplus

View file

@ -147,6 +147,14 @@ struct transfer_func_distributed_points {
uint16_t x_point_at_y1_blue;
};
/* Snapshot of the parameters a gamma/transfer-function LUT was last generated
 * with. Compared against the current settings to decide whether the LUT
 * points must be regenerated or the cached result can be reused.
 */
struct cache_info {
enum color_transfer_func tf;  /* transfer function last used */
enum cm_type cm_gamma_type;   /* gamma type last used */
struct fixed31_32 x_scale;    /* x-axis scale last used */
struct fixed31_32 y_scale;    /* y-axis scale last used */
struct fixed31_32 y_bias;     /* y-axis bias last used */
};
struct transfer_func {
enum transfer_func_type type;
enum color_transfer_func tf;
@ -161,16 +169,10 @@ struct transfer_func {
};
// the followings are for optimization: skip if no change
bool dirty; /*< indicate this object is updated or not */
struct config_cache config_cache; /*< used by the hw hook layer to do the caching */
bool dirty[MAX_PIPE]; /*< indicate this object is updated or not */
struct config_cache config_cache[MAX_PIPE]; /*< used by the hw hook layer to do the caching */
struct {
enum color_transfer_func tf;
enum cm_type cm_gamma_type;
struct fixed31_32 x_scale;
struct fixed31_32 y_scale;
struct fixed31_32 y_bias;
} cache_info;
struct cache_info cache_info[MAX_PIPE];
};
enum color_white_point_type {
@ -237,12 +239,12 @@ struct vpe_3dlut {
union vpe_3dlut_state state;
// the followings are for optimization: skip if no change
bool dirty; /*< indicate this object is updated or not */
struct config_cache config_cache; /*< used by the hw hook layer to do the caching */
bool dirty[MAX_3DLUT]; /*< indicate this object is updated or not */
struct config_cache config_cache[MAX_3DLUT]; /*< used by the hw hook layer to do the caching */
struct {
uint64_t uid_3dlut; /*< UID for current 3D LUT params */
} cache_info;
} cache_info[MAX_3DLUT];
};
enum vpe_status vpe_color_update_color_space_and_tf(

View file

@ -73,7 +73,6 @@ struct config_cache {
bool cached;
};
/* A macro that helps cache the config packet. It won't cache if it is in bypass mode,
 * as bypass-mode programming is not heavy-lifting work.
*
@ -82,26 +81,28 @@ struct config_cache {
* /param disable_cache a flag that controls whether caching is needed
* /param is_bypass if it is in bypass, it doesn't cache the bypass config
* /param program_func_call the program call that generates the config packet content
* /param inst index to address the config_cache array
*/
#define CONFIG_CACHE(obj_cache, obj_cfg_array, disable_cache, is_bypass, program_func_call) \
#define CONFIG_CACHE(obj_cache, obj_cfg_array, disable_cache, is_bypass, program_func_call, inst) \
{ \
bool use_cache = false; \
\
/* make sure it opens a new config packet */ \
config_writer_force_new_with_type(config_writer, CONFIG_TYPE_DIRECT); \
\
if ((obj_cache) && !disable_cache && (obj_cache)->config_cache.p_buffer && \
(obj_cache)->config_cache.cached && !((obj_cache)->dirty) && !is_bypass) { \
if ((obj_cache) && !disable_cache && (obj_cache)->config_cache[inst].p_buffer && \
(obj_cache)->config_cache[inst].cached && !((obj_cache)->dirty[inst]) && !is_bypass) { \
/* reuse the cache */ \
if (config_writer->buf->size >= (obj_cache)->config_cache.size) { \
if (config_writer->buf->size >= (obj_cache)->config_cache[inst].size) { \
memcpy((void *)(uintptr_t)config_writer->base_cpu_va, \
(obj_cache)->config_cache.p_buffer, \
(size_t)(obj_cache)->config_cache.size); \
(obj_cache)->config_cache[inst].p_buffer, \
(size_t)(obj_cache)->config_cache[inst].size); \
config_writer->buf->cpu_va = \
config_writer->base_cpu_va + (obj_cache)->config_cache.size; \
config_writer->base_cpu_va + (obj_cache)->config_cache[inst].size; \
config_writer->buf->gpu_va = \
config_writer->base_gpu_va + (obj_cache)->config_cache.size; \
config_writer->buf->size -= ((obj_cache)->config_cache.size - sizeof(uint32_t)); \
config_writer->base_gpu_va + (obj_cache)->config_cache[inst].size; \
config_writer->buf->size -= \
((obj_cache)->config_cache[inst].size - sizeof(uint32_t)); \
use_cache = true; \
} \
} \
@ -117,21 +118,21 @@ struct config_cache {
if (!disable_cache && !is_bypass) { \
/* only cache when it is not crossing config packets */ \
if (config_num == (obj_cfg_array)->num_configs) { \
if ((obj_cache)->dirty) { \
if ((obj_cache)->dirty[inst]) { \
uint64_t size = end - start; \
\
if ((obj_cache)->config_cache.size < size) { \
if ((obj_cache)->config_cache.p_buffer) \
vpe_free((obj_cache)->config_cache.p_buffer); \
if ((obj_cache)->config_cache[inst].size < size) { \
if ((obj_cache)->config_cache[inst].p_buffer) \
vpe_free((obj_cache)->config_cache[inst].p_buffer); \
\
(obj_cache)->config_cache.p_buffer = vpe_zalloc((size_t)size); \
if ((obj_cache)->config_cache.p_buffer) { \
memcpy((obj_cache)->config_cache.p_buffer, \
(obj_cache)->config_cache[inst].p_buffer = vpe_zalloc((size_t)size); \
if ((obj_cache)->config_cache[inst].p_buffer) { \
memcpy((obj_cache)->config_cache[inst].p_buffer, \
(void *)(uintptr_t)start, (size_t)size); \
(obj_cache)->config_cache.size = size; \
(obj_cache)->config_cache.cached = true; \
(obj_cache)->config_cache[inst].size = size; \
(obj_cache)->config_cache[inst].cached = true; \
} else { \
(obj_cache)->config_cache.size = 0; \
(obj_cache)->config_cache[inst].size = 0; \
} \
} \
} \
@ -139,7 +140,7 @@ struct config_cache {
} \
} \
if ((obj_cache)) \
(obj_cache)->dirty = false; \
(obj_cache)->dirty[inst] = false; \
}
/* the following macro requires a local variable vpr_priv to be present */

View file

@ -99,6 +99,7 @@ struct dpp_funcs {
/* DPP block instance. */
struct dpp {
struct vpe_priv *vpe_priv; /* back-pointer to the owning VPE private context */
struct dpp_funcs *funcs;   /* hw-generation-specific function hooks */
unsigned int inst;         /* instance index, passed to CONFIG_CACHE to address per-instance cache entries */
struct pwl_params degamma_params; /* scratch PWL parameters for degamma programming */
};

View file

@ -26,6 +26,11 @@
#include "fixed31_32.h"
#define MAX_3DLUT 1
#define MAX_PIPE 2
#define MAX_OUTPUT_PIPE 1
#ifdef __cplusplus
extern "C" {
#endif

View file

@ -171,6 +171,7 @@ struct mpc_funcs {
struct mpc {
struct vpe_priv *vpe_priv;
struct mpc_funcs *funcs;
unsigned int inst;
struct pwl_params regamma_params;
struct pwl_params blender_params;
struct pwl_params shaper_params;

View file

@ -121,6 +121,7 @@ struct opp_funcs {
/* OPP block instance. */
struct opp {
struct vpe_priv *vpe_priv; /* back-pointer to the owning VPE private context */
struct opp_funcs *funcs;   /* hw-generation-specific function hooks */
unsigned int inst;         /* instance index, used e.g. to select a per-instance config cache entry */
};
#ifdef __cplusplus

View file

@ -32,6 +32,7 @@
#include "mpc.h"
#include "opp.h"
#include "vector.h"
#include "hw_shared.h"
#ifdef __cplusplus
extern "C" {
@ -41,8 +42,6 @@ struct vpe_priv;
struct vpe_cmd_info;
struct segment_ctx;
#define MAX_PIPE 2
#define MAX_OUTPUT_PIPE 2
#define MIN_VPE_CMD 1024
enum vpe_cmd_ops;

View file

@ -199,7 +199,8 @@ void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
for (i = 0; i < vpe_priv->num_streams; i++) {
ctx = &vpe_priv->stream_ctx[i];
if (ctx->input_tf) {
CONFIG_CACHE_FREE(ctx->input_tf->config_cache);
for (int j = 0; j < MAX_PIPE; j++)
CONFIG_CACHE_FREE(ctx->input_tf->config_cache[j]);
vpe_free(ctx->input_tf);
ctx->input_tf = NULL;
}
@ -220,19 +221,22 @@ void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
}
if (ctx->in_shaper_func) {
CONFIG_CACHE_FREE(ctx->in_shaper_func->config_cache);
for (int j = 0; j < MAX_PIPE; j++)
CONFIG_CACHE_FREE(ctx->in_shaper_func->config_cache[j]);
vpe_free(ctx->in_shaper_func);
ctx->in_shaper_func = NULL;
}
if (ctx->blend_tf) {
CONFIG_CACHE_FREE(ctx->blend_tf->config_cache);
for (int j = 0; j < MAX_PIPE; j++)
CONFIG_CACHE_FREE(ctx->blend_tf->config_cache[j]);
vpe_free(ctx->blend_tf);
ctx->blend_tf = NULL;
}
if (ctx->lut3d_func) {
CONFIG_CACHE_FREE(ctx->lut3d_func->config_cache);
for (int j = 0; j < MAX_3DLUT; j++)
CONFIG_CACHE_FREE(ctx->lut3d_func->config_cache[j]);
vpe_free(ctx->lut3d_func);
ctx->lut3d_func = NULL;
}
@ -264,6 +268,7 @@ void vpe_pipe_reset(struct vpe_priv *vpe_priv)
for (i = 0; i < vpe_priv->num_pipe; i++) {
pipe_ctx = &vpe_priv->pipe_ctx[i];
pipe_ctx->pipe_idx = i;
pipe_ctx->is_top_pipe = true;
pipe_ctx->owner = PIPE_CTX_NO_OWNER;
pipe_ctx->top_pipe_idx = 0xff;