tu: Implement VK_QCOM_image_processing.

This includes the block matching, box filtering, and weighted sample features. Passes all of the dEQP-VK.image_processing.* CTS tests that were recently landed. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38559>
2026-05-03 07:48:07 +02:00 · 2025-10-09 16:02:56 -07:00 · 2025-10-09 16:02:56 -07:00 · 72c12f62ff
commit 72c12f62ff
parent 431c7a6e36
17 changed files with 329 additions and 23 deletions
--- a/docs/features.txt
+++ b/docs/features.txt
@ -722,6 +722,7 @@ Khronos extensions that are not part of any Vulkan version:
  VK_MESA_image_alignment_control                       DONE (anv, nvk, radv)
  VK_EXT_legacy_dithering                               DONE (anv, tu, vn)
  VK_QCOM_fragment_density_map_offset                   DONE (tu)
+  VK_QCOM_image_processing                              DONE (tu)
  VK_VALVE_video_encode_rgb_conversion                  DONE (radv)

 Rusticl OpenCL 1.0 -- all DONE:
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@ -0,0 +1 @@
+VK_QCOM_image_processing on Turnip
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@ -467,6 +467,9 @@ struct fd_dev_info {
       * expected:
       */
      bool has_salu_int_narrowing_quirk;
+
+      /* Whether the device supports the image processing opcode */
+      bool has_image_processing;
   } props;
 };

--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@ -1016,6 +1016,7 @@ a7xx_gen2 = GPUProps(
        reading_shading_rate_requires_smask_quirk = True,
        has_ray_intersection = True,
        has_hw_bin_scaling = True,
+        has_image_processing = True,
    )

 a7xx_gen3 = GPUProps(
@ -1043,6 +1044,7 @@ a7xx_gen3 = GPUProps(
        has_abs_bin_mask = True,
        new_control_regs = True,
        has_hw_bin_scaling = True,
+        has_image_processing = True,
    )

 a730_magic_regs = dict(
--- a/src/freedreno/fdl/fd6_view.cc
+++ b/src/freedreno/fdl/fd6_view.cc
@ -133,6 +133,10 @@ fdl6_texswiz(const struct fdl_view_args *args, bool has_z24uint_s8uint)
   unsigned char swiz[4];
   util_format_compose_swizzles(format_swiz, args->swiz, swiz);

+   /* Unused for box filter, match the blob behavior. */
+   if (args->filter_width)
+      return 0;
+
   if (CHIP <= A7XX) {
      return A6XX_TEX_CONST_0_SWIZ_X(fdl6_swiz(swiz[0])) |
             A6XX_TEX_CONST_0_SWIZ_Y(fdl6_swiz(swiz[1])) |
@ -258,7 +262,13 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
      view->descriptor[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(layer_size);
      view->descriptor[4] = base_addr;
      view->descriptor[5] = (base_addr >> 32) | A6XX_TEX_CONST_5_DEPTH(depth);
-      view->descriptor[6] = A6XX_TEX_CONST_6_MIN_LOD_CLAMP(args->min_lod_clamp - args->base_miplevel);
+      if (args->filter_width) {
+         view->descriptor[6] = A6XX_TEX_CONST_6_LOG2_PHASES(
+                                  util_logbase2_ceil(args->filter_num_phases) / 2) |
+                               A6XX_TEX_CONST_6_DILATION(1);
+      } else {
+         view->descriptor[6] = A6XX_TEX_CONST_6_MIN_LOD_CLAMP(args->min_lod_clamp - args->base_miplevel);
+      }

      if (layout->tile_all)
         view->descriptor[3] |= A6XX_TEX_CONST_3_TILE_ALL;
@ -300,6 +310,13 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,

         assert(args->type != FDL_VIEW_TYPE_3D);
         return;
+      } else if (args->filter_width) {
+         view->descriptor[8] =
+            (A6XX_TEX_CONST_8_FILTER_SIZE_X(args->filter_width) |
+             A6XX_TEX_CONST_8_FILTER_SIZE_Y(args->filter_height));
+         view->descriptor[10] =
+            (A6XX_TEX_CONST_10_FILTER_OFFSET_X(args->filter_center_x) |
+             A6XX_TEX_CONST_10_FILTER_OFFSET_Y(args->filter_center_y));
      }

      if (ubwc_enabled) {
@ -323,6 +340,8 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
   } else if (CHIP >= A8XX) {
      uint32_t *descriptor = view->descriptor;

+      assert(!args->filter_width); /* Need descriptor fields defined. */
+
      descriptor[0] = A8XX_TEX_MEMOBJ_0_BASE_LO(base_addr);
      descriptor[1] = A8XX_TEX_MEMOBJ_1_BASE_HI(base_addr >> 32) |
                      A8XX_TEX_MEMOBJ_1_TYPE(fdl6_tex_type(args->type, false)) |
@ -374,13 +393,23 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
                          A8XX_TEX_MEMOBJ_9_UV_PITCH(fdl_pitch(layouts[1], args->base_miplevel));

         return;
+      } else if (args->filter_width) {
+         descriptor[5] |= A8XX_TEX_MEMOBJ_5_FILTER_SIZE_X(args->filter_width) |
+                          A8XX_TEX_MEMOBJ_5_FILTER_SIZE_Y(args->filter_height) |
+                          A8XX_TEX_MEMOBJ_5_FILTER_OFFSET_X(args->filter_center_x) |
+                          A8XX_TEX_MEMOBJ_5_FILTER_OFFSET_Y(args->filter_center_y);
      }

      descriptor[7] = A8XX_TEX_MEMOBJ_7_ARRAY_SLICE_OFFSET(layer_size);
      descriptor[9] = A8XX_TEX_MEMOBJ_9_MIN_LOD_CLAMP(args->min_lod_clamp - args->base_miplevel);

-      if (args->type == FDL_VIEW_TYPE_3D)
+      if (args->filter_width) {
+         descriptor[7] |= A8XX_TEX_MEMOBJ_7_LOG2_PHASES(
+                             util_logbase2_ceil(args->filter_num_phases) / 2) |
+                          A8XX_TEX_MEMOBJ_7_DILATION(1);
+      } else if (args->type == FDL_VIEW_TYPE_3D) {
         descriptor[7] |= A8XX_TEX_MEMOBJ_7_MIN_ARRAY_SLIZE_OFFSET(layout->slices[layout->mip_levels - 1].size0);
+      }

      if (ubwc_enabled) {
         uint32_t block_width, block_height;
--- a/src/freedreno/fdl/freedreno_layout.h
+++ b/src/freedreno/fdl/freedreno_layout.h
@ -374,6 +374,12 @@ struct fdl_view_args {
   enum pipe_format format;
   enum fdl_view_type type;
   enum fdl_chroma_location chroma_offsets[2];
+
+   uint32_t filter_width;
+   uint32_t filter_height;
+   uint32_t filter_center_x;
+   uint32_t filter_center_y;
+   uint32_t filter_num_phases;
 };

 #define FDL6_TEX_CONST_DWORDS 16
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@ -1900,7 +1900,7 @@ get_bindless_ref(struct ir3_context *ctx, nir_src *src, bool is_sampler)

 static struct tex_src_info
 get_bindless_samp_src(struct ir3_context *ctx, nir_src *tex,
-                      nir_src *samp)
+                      nir_src *samp, nir_src *tex2, nir_src *samp2)
 {
   struct ir3_builder *b = &ctx->build;
   struct tex_src_info info = {0};
@ -1912,6 +1912,46 @@ get_bindless_samp_src(struct ir3_context *ctx, nir_src *tex,
    */
   struct bindless_ref_info tex_info = get_bindless_ref(ctx, tex, false);
   struct bindless_ref_info samp_info = get_bindless_ref(ctx, samp, true);
+   struct bindless_ref_info tex2_info = get_bindless_ref(ctx, tex2, false);
+   /* NOTE: The QC implementation completely ignores samp2 (reference
+    * sampler), in both the A1 and S2EN cases.
+    */
+
+   if (tex2 || samp2) {
+      struct tex_src_info info = {0};
+      info.flags = IR3_INSTR_B;
+
+      /* NOTE: QC implementation doesn't encode the BASE_HI bits in the right
+       * place (ORing them into src2 instead), but our normal base encoding
+       * appears to work.
+       */
+      info.base = tex_info.desc_set;
+
+      info.a1_val = 0;
+      info.a1_val |= samp_info.desc_set;
+      info.a1_val |= tex2_info.desc_set << 13;
+
+      /* NOTE: QC implementation lets samp index overflow into tex2 index */
+      if (tex_info.is_const && tex_info.const_index < 16 &&
+          samp_info.is_const && samp_info.const_index < 16 &&
+          tex2_info.is_const && tex2_info.const_index < 64) {
+         info.tex_idx = tex_info.const_index;
+         info.a1_val |= (samp_info.const_index << 3);
+         info.a1_val |= (tex2_info.const_index << 7);
+      } else {
+         /* Non-constant case: Collect the combined texture/sampler, and the
+          * secondary texture.
+          */
+         info.samp_tex = ir3_collect(b, tex_info.index, samp_info.index, tex2_info.index);
+
+         info.flags |= IR3_INSTR_S2EN;
+      }
+
+      if (info.a1_val)
+         info.flags |= IR3_INSTR_A1EN;
+
+      return info;
+   }

   info.tex_base = tex_info.desc_set;
   info.tex_idx = tex_info.const_index;
@ -3411,7 +3451,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
   }
   case nir_intrinsic_prefetch_sam_ir3: {
      struct tex_src_info info =
-         get_bindless_samp_src(ctx, &intr->src[0], &intr->src[1]);
+         get_bindless_samp_src(ctx, &intr->src[0], &intr->src[1], NULL, NULL);
      struct ir3_instruction *sam =
         emit_sam(ctx, OPC_SAM, info, TYPE_F32, 0b1111, NULL, NULL);

@ -3581,13 +3621,17 @@ get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
   struct tex_src_info info = {0};
   int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
+   int texture2_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_2_handle);
+   int sampler2_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_2_handle);
   struct ir3_instruction *texture, *sampler;

   if (texture_idx >= 0 || sampler_idx >= 0) {
      /* Bindless case */
      info = get_bindless_samp_src(ctx,
                                   texture_idx >= 0 ? &tex->src[texture_idx].src : NULL,
-                                   sampler_idx >= 0 ? &tex->src[sampler_idx].src : NULL);
+                                   sampler_idx >= 0 ? &tex->src[sampler_idx].src : NULL,
+                                   texture2_idx >= 0 ? &tex->src[texture2_idx].src : NULL,
+                                   sampler2_idx >= 0 ? &tex->src[sampler2_idx].src : NULL);

      if (tex->texture_non_uniform || tex->sampler_non_uniform)
         info.flags |= IR3_INSTR_NONUNIF;
@ -3629,8 +3673,8 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
 {
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction **dst, *sam, *src0[12], *src1[5];
-   struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
-   struct ir3_instruction *lod, *compare, *proj, *sample_index, *min_lod;
+   struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy, *const *box_size;
+   struct ir3_instruction *lod, *compare, *proj, *sample_index, *min_lod, *ref_coord, *block_size;
   struct tex_src_info info = {0};
   bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
   bool lod_zero = false, has_min_lod = false;
@ -3641,8 +3685,8 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)

   ncomp = tex->def.num_components;

-   coord = off = ddx = ddy = NULL;
-   lod = proj = compare = sample_index = min_lod = NULL;
+   coord = off = ddx = ddy = box_size = NULL;
+   lod = proj = compare = sample_index = min_lod = ref_coord = block_size = NULL;

   dst = ir3_get_def(ctx, &tex->def, ncomp);

@ -3691,11 +3735,22 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
         min_lod = ir3_get_src(ctx, &tex->src[i].src)[0];
         has_min_lod = true;
         break;
+      case nir_tex_src_box_size:
+         box_size = ir3_get_src(ctx, &tex->src[i].src);
+         break;
+      case nir_tex_src_block_size:
+         block_size = ir3_get_src(ctx, &tex->src[i].src)[0];
+         break;
+      case nir_tex_src_ref_coord:
+         ref_coord = ir3_get_src(ctx, &tex->src[i].src)[0];
+         break;
      case nir_tex_src_texture_offset:
      case nir_tex_src_sampler_offset:
      case nir_tex_src_texture_handle:
      case nir_tex_src_sampler_handle:
-         /* handled in get_tex_samp_src() */
+      case nir_tex_src_texture_2_handle:
+      case nir_tex_src_sampler_2_handle:
+         /* handled in get_tex_samp_tex_src() */
         break;
      default:
         ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
@ -3767,6 +3822,16 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
   case nir_texop_txf_ms:
      opc = OPC_ISAMM;
      break;
+   case nir_texop_sample_weighted_qcom:
+      opc = OPC_IMG_BINDLESS_HOF;
+      break;
+   case nir_texop_box_filter_qcom:
+      opc = OPC_IMG_BINDLESS_PCMN;
+      break;
+   case nir_texop_block_match_sad_qcom:
+   case nir_texop_block_match_ssd_qcom:
+      opc = OPC_IMG_BINDLESS;
+      break;
   default:
      ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
      return;
@ -3864,7 +3929,7 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
    *  - lod
    *  - bias
    */
-   if (has_off | has_lod | has_bias | has_min_lod) {
+   if (has_off | has_lod | has_bias | has_min_lod | (box_size != NULL)) {
      if (has_off) {
         unsigned off_coords = coords;
         if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
@ -3883,6 +3948,16 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
         src1[nsrc1++] = min_lod;
         flags |= IR3_INSTR_CLP;
      }
+
+      if (box_size) {
+         src1[nsrc1++] = box_size[0];
+         src1[nsrc1++] = box_size[1];
+      }
+   }
+
+   if (opc == OPC_IMG_BINDLESS) {
+      src1[nsrc1++] = ref_coord;
+      src1[nsrc1++] = block_size;
   }

   type = get_tex_dest_type(tex);
@ -3978,6 +4053,9 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
      sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
   }

+   if (tex->op == nir_texop_block_match_ssd_qcom)
+      sam->cat5.match_mode = IR3_MATCH_MODE_SSD;
+
   if (tex->is_sparse) {
      info.flags |= flags;
      struct ir3_instruction *rck =
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@ -523,6 +523,73 @@ ir3_nir_lower_array_sampler(nir_shader *shader)
      nir_metadata_control_flow, NULL);
 }

+/* pack_uvec2_to_uint does clamping that we don't need to do. */
+static nir_def *
+pack_16_16(nir_builder *b, nir_def *x)
+{
+   return nir_ior(b, nir_channel(b, x, 0), nir_ishl_imm(b, nir_channel(b, x, 1), 16));
+}
+
+static bool
+ir3_nir_lower_image_processing_instr(struct nir_builder *b, nir_instr *instr,
+                                     void *_data)
+{
+   if (instr->type != nir_instr_type_tex)
+      return false;
+
+   nir_tex_instr *tex = nir_instr_as_tex(instr);
+   b->cursor = nir_before_instr(&tex->instr);
+
+   if (tex->op == nir_texop_box_filter_qcom) {
+      /* The hardware's box filter arg is preprocessed, but still a vec2.  We do
+       * the preprocessing in NIR so it's more legible, and can be constant
+       * folded.
+       */
+      int box_size_src = nir_tex_instr_src_index(tex, nir_tex_src_box_size);
+      assert(box_size_src >= 0);
+
+      nir_def *box_size = tex->src[box_size_src].src.ssa;
+      nir_def *area =
+         nir_fmul(b, nir_channel(b, box_size, 0), nir_channel(b, box_size, 1));
+      box_size =
+         nir_f2u32(b, nir_fround_even(b, nir_fmul_imm(b, box_size, 64.0)));
+      nir_def *inv_area = nir_u2u32(b, nir_f2f16(b, nir_frcp(b, area)));
+
+      nir_src_rewrite(&tex->src[box_size_src].src, nir_vec2(b, pack_16_16(b, box_size), inv_area));
+
+      return true;
+   } else if (tex->op == nir_texop_block_match_sad_qcom ||
+              tex->op == nir_texop_block_match_ssd_qcom) {
+      /* Convert the src coords from integer to float, and pack the ref coord
+       * and block size into a u32 each.
+       */
+      int coord_src = nir_tex_instr_src_index(tex, nir_tex_src_coord);
+      assert(coord_src >= 0);
+      nir_src_rewrite(&tex->src[coord_src].src, nir_i2f32(b, tex->src[coord_src].src.ssa));
+
+      int ref_coord_src = nir_tex_instr_src_index(tex, nir_tex_src_ref_coord);
+      assert(ref_coord_src >= 0);
+      nir_src_rewrite(&tex->src[ref_coord_src].src,
+                      pack_16_16(b, tex->src[ref_coord_src].src.ssa));
+
+      int block_size_src = nir_tex_instr_src_index(tex, nir_tex_src_block_size);
+      assert(block_size_src >= 0);
+      nir_src_rewrite(&tex->src[block_size_src].src,
+                      pack_16_16(b, tex->src[block_size_src].src.ssa));
+
+      return true;
+   } else {
+      return false;
+   }
+}
+
+static bool
+ir3_nir_lower_image_processing(nir_shader *shader)
+{
+   return nir_shader_instructions_pass(shader, ir3_nir_lower_image_processing_instr,
+                                       nir_metadata_control_flow, NULL);
+}
+
 static bool
 lower_shader_clock(struct nir_builder *b, nir_intrinsic_instr *instr, void *data)
 {
@ -701,6 +768,8 @@ ir3_finalize_nir(struct ir3_compiler *compiler,
   if (compiler->array_index_add_half)
      OPT(s, ir3_nir_lower_array_sampler);

+   OPT(s, ir3_nir_lower_image_processing);
+
   if (compiler->gen >= 6) {
      OPT(s, ir3_nir_lower_shader_clock, compiler->options.uche_trap_base);
   }
--- a/src/freedreno/registers/adreno/a6xx_descriptors.xml
+++ b/src/freedreno/registers/adreno/a6xx_descriptors.xml
@ -120,6 +120,10 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
 		<bitfield name="MIN_LOD_CLAMP" low="0" high="11" type="ufixed" radix="8"/>
 		<!-- pitch for plane 2 / plane 3 -->
 		<bitfield name="PLANE_PITCH" low="8" high="31" type="uint"/>
+
+		<!-- QCOM_image_processing sample weights descriptor fields, overlapping the others. -->
+		<bitfield name="LOG2_PHASES" low="0" high="2" type="uint"/>
+		<bitfield name="DILATION" low="8" high="11" type="uint"/>
 	</reg32>
 	<!-- 7/8 is plane 2 address for planar formats -->
 	<reg32 offset="7" name="7">
@ -127,6 +131,8 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
 	</reg32>
 	<reg32 offset="8" name="8">
 		<bitfield name="FLAG_HI" low="0" high="16"/>
+		<bitfield name="FILTER_SIZE_X" low="17" high="23"/>
+		<bitfield name="FILTER_SIZE_Y" low="24" high="30"/>
 	</reg32>
 	<!-- 9/10 is plane 3 address for planar formats -->
 	<reg32 offset="9" name="9">
@ -137,6 +143,8 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
 		<!-- log2 size of the first level, required for mipmapping -->
 		<bitfield name="FLAG_BUFFER_LOGW" low="8" high="11" type="uint"/>
 		<bitfield name="FLAG_BUFFER_LOGH" low="12" high="15" type="uint"/>
+		<bitfield name="FILTER_OFFSET_X" low="17" high="22"/>
+		<bitfield name="FILTER_OFFSET_Y" low="23" high="28"/>
 	</reg32>
 	<reg32 offset="11" name="11"/>
 	<reg32 offset="12" name="12"/>
--- a/src/freedreno/registers/adreno/a8xx_descriptors.xml
+++ b/src/freedreno/registers/adreno/a8xx_descriptors.xml
@ -83,6 +83,12 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
 		<bitfield name="FLAG_BUFFER_PITCH" low="17" high="24" shr="6" type="uint"/>
 		<bitfield name="ALL_SAMPLES_CENTER" pos="29" type="boolean"/>
 		<bitfield name="MUTABLEEN" pos="31" type="boolean"/>
+
+		<!-- QCOM_image_processing sample weights descriptor fields, overlapping the others. -->
+		<bitfield name="FILTER_SIZE_X" low="0" high="6"/>
+		<bitfield name="FILTER_SIZE_Y" low="7" high="13"/>
+		<bitfield name="FILTER_OFFSET_X" low="19" high="24"/>
+		<bitfield name="FILTER_OFFSET_Y" low="25" high="30"/>
 	</reg32>
 	<reg32 offset="6" name="6">
 		<bitfield name="TEX_LINE_OFFSET" low="0" high="23" type="uint"/> <!-- PITCH -->
@ -99,6 +105,10 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
 		<!-- For multiplanar.  This overlaps other single-planar fields: -->
 		<bitfield name="UV_OFFSET_H" low="24" high="25" type="ufixed" radix="2"/> <!-- CHROMA_MIDPOINT_X -->
 		<bitfield name="UV_OFFSET_V" low="26" high="27" type="ufixed" radix="2"/> <!-- CHROMA_MIDPOINT_Y -->
+
+		<!-- QCOM_image_processing sample weights descriptor fields, overlapping the others. -->
+		<bitfield name="DILATION" low="24" high="27" type="uint"/>
+		<bitfield name="LOG2_PHASES" low="28" high="30" type="uint"/>
 	</reg32>
 	<reg32 offset="8" name="8">
 		<bitfield name="FLAG_ARRAY_PITCH" low="0" high="14" shr="12" type="uint"/> <!-- FLAG_BUFFER_ARRAY_PITCH -->
--- a/src/freedreno/tests/reference/prefetch-test.log
+++ b/src/freedreno/tests/reference/prefetch-test.log
@ -8744,11 +8744,11 @@ got cmdszdw=416
 								{ ARRAY_PITCH = 4096 | MIN_LAYERSZ = 0 }
 								{ BASE_LO = 0x373a000 }
 								{ BASE_HI = 0x1 | DEPTH = 1 }
-								{ MIN_LOD_CLAMP = 0.000000 | PLANE_PITCH = 0 }
+								{ MIN_LOD_CLAMP = 0.000000 | PLANE_PITCH = 0 | LOG2_PHASES = 0 | DILATION = 0 }
 								{ FLAG_LO = 0 }
-								{ FLAG_HI = 0 }
+								{ FLAG_HI = 0 | FILTER_SIZE_X = 0 | FILTER_SIZE_Y = 0 }
 								{ FLAG_BUFFER_ARRAY_PITCH = 0 }
-								{ FLAG_BUFFER_PITCH = 0 | FLAG_BUFFER_LOGW = 0 | FLAG_BUFFER_LOGH = 0 }
+								{ FLAG_BUFFER_PITCH = 0 | FLAG_BUFFER_LOGW = 0 | FLAG_BUFFER_LOGH = 0 | FILTER_OFFSET_X = 0 | FILTER_OFFSET_Y = 0 }
 								{ 11 = 0 }
 								{ 12 = 0 }
 								{ 13 = 0 }
--- a/src/freedreno/vulkan/tu_descriptor_set.cc
+++ b/src/freedreno/vulkan/tu_descriptor_set.cc
@ -1276,6 +1276,8 @@ tu_update_descriptor_sets(const struct tu_device *device,
            break;
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+         case VK_DESCRIPTOR_TYPE_SAMPLE_WEIGHT_IMAGE_QCOM:
+         case VK_DESCRIPTOR_TYPE_BLOCK_MATCH_IMAGE_QCOM:
         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
            write_image_descriptor(ptr, writeset->descriptorType, writeset->pImageInfo + j);
            break;
@ -1621,6 +1623,8 @@ tu_update_descriptor_set_with_template(
            break;
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+         case VK_DESCRIPTOR_TYPE_SAMPLE_WEIGHT_IMAGE_QCOM:
+         case VK_DESCRIPTOR_TYPE_BLOCK_MATCH_IMAGE_QCOM:
         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
            write_image_descriptor(ptr, templ->entry[i].descriptor_type,
                                   (const VkDescriptorImageInfo *) src);
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@ -355,6 +355,7 @@ get_device_extensions(const struct tu_physical_device *device,
      .IMG_filter_cubic = device->info->props.has_tex_filter_cubic,
      .NV_compute_shader_derivatives = device->info->chip >= 7,
      .QCOM_fragment_density_map_offset = true,
+      .QCOM_image_processing = device->info->props.has_image_processing,
      .QCOM_multiview_per_view_render_areas = true,
      .QCOM_multiview_per_view_viewports =
         device->info->props.has_per_view_viewport,
@ -824,6 +825,11 @@ tu_get_features(struct tu_physical_device *pdevice,
   /* VK_EXT_zero_initialize_device_memory */
   features->zeroInitializeDeviceMemory = true;

+   /* VK_QCOM_image_processing */
+   features->textureSampleWeighted = pdevice->vk.supported_extensions.QCOM_image_processing;
+   features->textureBoxFilter = pdevice->vk.supported_extensions.QCOM_image_processing;
+   features->textureBlockMatch = pdevice->vk.supported_extensions.QCOM_image_processing;
+
   /* VK_VALVE_fragment_density_map_layered */
   features->fragmentDensityMapLayered = true;

@ -1520,6 +1526,21 @@ tu_get_properties(struct tu_physical_device *pdevice,

   /* VK_VALVE_fragment_density_map_layered */
   props->maxFragmentDensityMapLayers = MAX_VIEWS;
+
+   /* VK_QCOM_image_processing */
+   props->maxWeightFilterPhases = 1024;
+   props->maxWeightFilterDimension =
+      pdevice->vk.supported_extensions.QCOM_image_processing
+         ? (VkExtent2D) { 64, 64 }
+         : (VkExtent2D) { 0, 0 };
+   props->maxBlockMatchRegion =
+      pdevice->vk.supported_extensions.QCOM_image_processing
+         ? (VkExtent2D) { 64, 64 }
+         : (VkExtent2D) { 0, 0 };
+   props->maxBoxFilterBlockSize =
+      pdevice->vk.supported_extensions.QCOM_image_processing
+         ? (VkExtent2D) { 64, 64 }
+         : (VkExtent2D) { 0, 0 };
 }

 static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {
--- a/src/freedreno/vulkan/tu_formats.cc
+++ b/src/freedreno/vulkan/tu_formats.cc
@ -251,6 +251,50 @@ tu_physical_device_get_format_properties(
                   VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT);
   }

+   /* Set up QCOM_image_processing flags. This matches blob behavior, except
+    * that it advertises box/weighted on NPOT sampleable formats and ASTC_FLOAT
+    * (which we don't advertise yet), and blockmatch/box/weighted on
+    * VK_FORMAT_G8B8G8R8_422_UNORM.
+    */
+   if ((optimal & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT) &&
+       (!ycbcr_info || ycbcr_info->n_planes == 1) &&
+       !vk_format_is_depth_or_stencil(vk_format)) {
+      int c = util_format_get_first_non_void_channel(desc->format);
+      bool is_8bpc = c != -1 && desc->is_array && desc->channel[c].size == 8;
+
+      if ((is_8bpc && vk_format != VK_FORMAT_B8G8R8A8_UNORM &&
+           vk_format != VK_FORMAT_B8G8R8A8_SNORM &&
+           vk_format != VK_FORMAT_B8G8R8A8_SRGB) ||
+          vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32) {
+         if (desc->is_unorm &&
+             desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB)
+            optimal |= VK_FORMAT_FEATURE_2_BLOCK_MATCHING_BIT_QCOM;
+         if ((desc->is_unorm || desc->is_snorm) &&
+             vk_format != VK_FORMAT_R8G8_SNORM) {
+            optimal |= VK_FORMAT_FEATURE_2_BOX_FILTER_SAMPLED_BIT_QCOM;
+            optimal |= VK_FORMAT_FEATURE_2_WEIGHT_SAMPLED_IMAGE_BIT_QCOM;
+         }
+      }
+
+      if (vk_format == VK_FORMAT_B5G6R5_UNORM_PACK16 ||
+          vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32 ||
+          vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 ||
+          util_format_is_float16(format) ||
+          (util_format_is_compressed(format) &&
+           desc->layout != UTIL_FORMAT_LAYOUT_RGTC &&
+           vk_format != VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK &&
+           vk_format != VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK &&
+           vk_format != VK_FORMAT_EAC_R11G11_UNORM_BLOCK &&
+           vk_format != VK_FORMAT_EAC_R11G11_SNORM_BLOCK)) {
+         optimal |= VK_FORMAT_FEATURE_2_BOX_FILTER_SAMPLED_BIT_QCOM;
+         optimal |= VK_FORMAT_FEATURE_2_WEIGHT_SAMPLED_IMAGE_BIT_QCOM;
+      }
+
+      if (vk_format == VK_FORMAT_R8_UNORM ||
+          vk_format == VK_FORMAT_R16_SFLOAT)
+         optimal |= VK_FORMAT_FEATURE_2_WEIGHT_IMAGE_BIT_QCOM;
+   }
+
   /* For the most part, we can do anything with a linear image that we could
    * do with a tiled image. However, we can't support sysmem rendering with a
    * linear depth texture, because we don't know if there's a bit to control
--- a/src/freedreno/vulkan/tu_image.cc
+++ b/src/freedreno/vulkan/tu_image.cc
@ -180,6 +180,8 @@ tu_image_view_init(struct tu_device *device,
      vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO);
   const struct vk_ycbcr_conversion *conversion = ycbcr_conversion ?
      vk_ycbcr_conversion_from_handle(ycbcr_conversion->conversion) : NULL;
+   const VkImageViewSampleWeightCreateInfoQCOM *sample_weights =
+      vk_find_struct_const(pCreateInfo->pNext, IMAGE_VIEW_SAMPLE_WEIGHT_CREATE_INFO_QCOM);

   vk_image_view_init(&device->vk, &iview->vk, pCreateInfo);
   assert(iview->vk.format != VK_FORMAT_UNDEFINED);
@ -268,6 +270,14 @@ tu_image_view_init(struct tu_device *device,
      args.chroma_offsets[1] = (enum fdl_chroma_location) conversion->state.chroma_offsets[1];
   }

+   if (sample_weights) {
+      args.filter_width = sample_weights->filterSize.width;
+      args.filter_height = sample_weights->filterSize.height;
+      args.filter_center_x = sample_weights->filterCenter.x;
+      args.filter_center_y = sample_weights->filterCenter.y;
+      args.filter_num_phases = sample_weights->numPhases;
+   }
+
   TU_CALLX(device, fdl6_view_init)(&iview->view, layouts, &args, device->use_z24uint_s8uint);

   if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@ -90,6 +90,8 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+         case VK_DESCRIPTOR_TYPE_SAMPLE_WEIGHT_IMAGE_QCOM:
+         case VK_DESCRIPTOR_TYPE_BLOCK_MATCH_IMAGE_QCOM:
            /* Textures and UBO's needs a packet for each stage */
            count = stage_count;
            break;
@ -219,7 +221,8 @@ tu6_emit_load_state(struct tu_device *device,
            }
            break;
         }
-         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
+         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+         case VK_DESCRIPTOR_TYPE_BLOCK_MATCH_IMAGE_QCOM: {
            tu_foreach_stage(stage, stages) {
               /* TODO: We could emit less CP_LOAD_STATE6 if we used
                * struct-of-arrays instead of array-of-structs.
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@ -757,30 +757,29 @@ lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
 }

 static bool
-lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
+lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
          struct tu_shader *shader, const struct tu_pipeline_layout *layout,
-          uint32_t read_only_input_attachments, bool dynamic_renderpass)
+          uint32_t read_only_input_attachments, bool dynamic_renderpass,
+          bool ref)
 {
-   lower_tex_ycbcr(layout, b, tex);
-
-   int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
+   int sampler_src_idx = nir_tex_instr_src_index(tex, ref ? nir_tex_src_sampler_2_deref : nir_tex_src_sampler_deref);
   if (sampler_src_idx >= 0) {
      nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
      nir_def *bindless = build_bindless(dev, b, deref, true, shader, layout,
                                         read_only_input_attachments,
                                         dynamic_renderpass);
      nir_src_rewrite(&tex->src[sampler_src_idx].src, bindless);
-      tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
+      tex->src[sampler_src_idx].src_type = ref ? nir_tex_src_sampler_2_handle : nir_tex_src_sampler_handle;
   }

-   int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
+   int tex_src_idx = nir_tex_instr_src_index(tex, ref ? nir_tex_src_texture_2_deref : nir_tex_src_texture_deref);
   if (tex_src_idx >= 0) {
      nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
      nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout,
                                         read_only_input_attachments,
                                         dynamic_renderpass);
      nir_src_rewrite(&tex->src[tex_src_idx].src, bindless);
-      tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
+      tex->src[tex_src_idx].src_type = ref ? nir_tex_src_texture_2_handle : nir_tex_src_texture_handle;

      /* for the input attachment case: */
      if (!nir_def_is_intrinsic(bindless))
@ -790,6 +789,24 @@ lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
   return true;
 }

+static bool
+lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
+          struct tu_shader *shader, const struct tu_pipeline_layout *layout,
+          uint32_t read_only_input_attachments, bool dynamic_renderpass)
+{
+   if (tex->op == nir_texop_block_match_sad_qcom ||
+       tex->op == nir_texop_block_match_ssd_qcom ||
+       tex->op == nir_texop_sample_weighted_qcom) {
+      lower_tex_impl(b, tex, dev, shader, layout, read_only_input_attachments, dynamic_renderpass, false);
+      lower_tex_impl(b, tex, dev, shader, layout, read_only_input_attachments, dynamic_renderpass, true);
+   } else {
+      lower_tex_ycbcr(layout, b, tex);
+      lower_tex_impl(b, tex, dev, shader, layout, read_only_input_attachments, dynamic_renderpass, false);
+   }
+
+   return true;
+}
+
 struct lower_instr_params {
   struct tu_device *dev;
   struct tu_shader *shader;