panvk: Use LD_VAR[_IMM] + ADs for varyings

The current implementation uses LD_VAR_BUF[_IMM] to look up varyings, which limits the number of varying components to 64 because the instruction takes an 8-bit offset. Since this falls short of maxVertexOutputComponents (128), this change replaces LD_VAR_BUF[_IMM] with LD_VAR[_IMM] + Attribute Descriptors, which do not have this limitation. As allocating Attribute Descriptors is potentially expensive, this can be further optimized by falling back to LD_VAR_BUF[_IMM] in cases where we can guarantee that no more than 64 varying components are used. This change does not alter behavior for gallium/panfrost, which keeps using LD_VAR_BUF[_IMM], though it should be switched over as well. Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32969>
2026-01-09 06:10:12 +01:00 · 2025-01-07 16:52:53 +01:00 · 2025-01-07 16:52:53 +01:00 · 6d5ae5b3af
commit 6d5ae5b3af
parent 7881d19d01
7 changed files with 146 additions and 57 deletions
--- a/src/gallium/drivers/panfrost/pan_shader.c
+++ b/src/gallium/drivers/panfrost/pan_shader.c
@ -132,6 +132,10 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
      .gpu_id = panfrost_device_gpu_id(dev),
   };

+   if (dev->arch >= 9)
+      /* Use LD_VAR_BUF for varying lookups. */
+      inputs.valhall.use_ld_var_buf = true;
+
   /* Lower this early so the backends don't have to worry about it */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      inputs.fixed_varying_mask = key->fs.fixed_varying_mask;
--- a/src/panfrost/ci/panfrost-g610-fails.txt
+++ b/src/panfrost/ci/panfrost-g610-fails.txt
@ -273,19 +273,10 @@ dEQP-VK.api.device_init.create_device_global_priority_query_khr.basic,Fail
 dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.63,Fail
 dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.63,Fail

-dEQP-VK.glsl.limits.near_max.fragment_input.components_123,Fail
-dEQP-VK.glsl.limits.near_max.fragment_input.components_124,Fail
-
-dEQP-VK.pipeline.monolithic.max_varyings.test_vertex_io_between_vertex_fragment,Fail
-
-dEQP-VK.pipeline.pipeline_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
-
 dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.63,Fail
 dEQP-VK.renderpass.multiple_subpasses_multiple_command_buffers.test,Fail
 dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.63,Fail

-dEQP-VK.pipeline.fast_linked_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
-
 dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,Crash

 dEQP-VK.rasterization.rasterization_order_attachment_access.depth.samples_1.multi_draw_barriers,Crash
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@ -589,45 +589,30 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
      b->shader->info.bifrost->uses_flat_shading = true;
   }

-   enum bi_source_format source_format =
-      smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
-
   nir_src *offset = nir_get_io_offset_src(instr);
   unsigned imm_index = 0;
   bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
   unsigned base = nir_intrinsic_base(instr);

-   /* On Valhall, ensure the table and index are valid for usage with immediate
-    * form when IDVS isn't used */
-   if (b->shader->arch >= 9 && !b->shader->malloc_idvs)
-      immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
-                   pan_res_handle_get_index(base) < 256;
+   /* LD_VAR_BUF[_IMM] takes an 8-bit offset, limiting its use to 64 or less
+    * varying components, assuming F32.
+    * Therefore, only use LD_VAR_BUF[_IMM] if explicitly told by the driver
+    * through a compiler input value, falling back to LD_VAR[_IMM] +
+    * Attribute Descriptors otherwise. */
+   bool use_ld_var_buf =
+      b->shader->malloc_idvs && b->shader->inputs->valhall.use_ld_var_buf;

-   if (b->shader->malloc_idvs && immediate) {
-      /* Immediate index given in bytes. */
-      bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
-                           update, vecsize,
-                           bi_varying_offset(b->shader, instr));
-   } else if (immediate) {
-      bi_instr *I;
+   if (use_ld_var_buf) {
+      enum bi_source_format source_format =
+         smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;

-      if (smooth) {
-         I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
-                              pan_res_handle_get_index(imm_index));
+      if (immediate) {
+         /* Immediate index given in bytes. */
+         bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
+                              update, vecsize,
+                              bi_varying_offset(b->shader, instr));
      } else {
-         I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
-                                   pan_res_handle_get_index(imm_index));
-      }
-
-      /* Valhall usually uses machine-allocated IDVS. If this is disabled,
-       * use a simple Midgard-style ABI.
-       */
-      if (b->shader->arch >= 9)
-         I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
-   } else {
-      bi_index idx = bi_src_index(offset);
-
-      if (b->shader->malloc_idvs) {
+         bi_index idx = bi_src_index(offset);
         /* Index needs to be in bytes, but NIR gives the index
          * in slots. For now assume 16 bytes per element.
          */
@ -639,7 +624,33 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)

         bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
                          source_format, update, vecsize);
+      }
+   } else {
+      /* On Valhall, ensure the table and index are valid for usage with
+       * immediate form when IDVS isn't used */
+      if (b->shader->arch >= 9)
+         immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
+                      pan_res_handle_get_index(base) < 256;
+
+      if (immediate) {
+         bi_instr *I;
+
+         if (smooth) {
+            I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
+                                 pan_res_handle_get_index(imm_index));
+         } else {
+            I =
+               bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
+                                     pan_res_handle_get_index(imm_index));
+         }
+
+         /* Valhall usually uses LD_VAR_BUF. If this is disabled, use a simple
+          * Midgard-style ABI. */
+         if (b->shader->arch >= 9)
+            I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
      } else {
+         bi_index idx = bi_src_index(offset);
+
         if (base != 0)
            idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@ -121,6 +121,10 @@ struct panfrost_compile_inputs {
      struct {
         uint32_t rt_conv[8];
      } bifrost;
+      struct {
+         /* Use LD_VAR_BUF[_IMM] instead of LD_VAR[_IMM] to load varyings. */
+         bool use_ld_var_buf;
+      } valhall;
   };
 };

--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@ -165,6 +165,73 @@ prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
   return VK_SUCCESS;
 }

+/*
+ * Return the number of varying slots needed by the currently bound
+ * vertex/fragment shader pair.
+ *
+ * The result is the larger of the vertex shader output count and the
+ * fragment shader input count, or 0 when get_fs() reports no fragment
+ * shader.
+ */
+static uint32_t
+get_varying_slots(const struct panvk_cmd_buffer *cmdbuf)
+{
+   const struct panvk_shader *vertex = cmdbuf->state.gfx.vs.shader;
+   const struct panvk_shader *fragment = get_fs(cmdbuf);
+
+   /* No fragment shader: no varying slots are counted. */
+   if (!fragment)
+      return 0;
+
+   unsigned vs_outputs = vertex->info.varyings.output_count;
+   unsigned fs_inputs = fragment->info.varyings.input_count;
+
+   return MAX2(vs_outputs, fs_inputs);
+}
+
+/*
+ * Fill the varying attribute descriptors read by the fragment shader.
+ *
+ * One descriptor is packed at descs[i] for every fragment shader input i
+ * whose location is VARYING_SLOT_VAR0 or above. Entries belonging to special
+ * varyings are left untouched, and nothing is written when no fragment
+ * shader is bound.
+ *
+ * cmdbuf: command buffer whose bound graphics shaders are inspected.
+ * descs:  destination array, indexed by fragment shader input index. The
+ *         caller must provide room for every fragment shader input.
+ */
+static void
+emit_varying_descs(const struct panvk_cmd_buffer *cmdbuf,
+                   struct mali_attribute_packed *descs)
+{
+   const struct panvk_shader *fs = get_fs(cmdbuf);
+
+   /* Without a fragment shader there are no inputs to describe. */
+   if (!fs)
+      return;
+
+   /* Assumes 16 byte slots. We could do better. */
+   uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
+
+   /* Only walk the fragment shader inputs. The slot count above is the
+    * maximum of the VS output and FS input counts, so using it as the loop
+    * bound would index input[] past input_count whenever the vertex shader
+    * writes more varyings than the fragment shader reads. */
+   uint32_t input_count = fs->info.varyings.input_count;
+
+   for (uint32_t i = 0; i < input_count; i++) {
+      const struct pan_shader_varying *var = &fs->info.varyings.input[i];
+      /* Skip special varyings. */
+      if (var->location < VARYING_SLOT_VAR0)
+         continue;
+
+      /* We currently always write out F32 in the vertex shaders, so the format
+       * needs to reflect this. */
+      enum pipe_format f = var->format;
+      switch (f) {
+      case PIPE_FORMAT_R16_FLOAT:
+         f = PIPE_FORMAT_R32_FLOAT;
+         break;
+      case PIPE_FORMAT_R16G16_FLOAT:
+         f = PIPE_FORMAT_R32G32_FLOAT;
+         break;
+      case PIPE_FORMAT_R16G16B16_FLOAT:
+         f = PIPE_FORMAT_R32G32B32_FLOAT;
+         break;
+      case PIPE_FORMAT_R16G16B16A16_FLOAT:
+         f = PIPE_FORMAT_R32G32B32A32_FLOAT;
+         break;
+      default:
+         break;
+      }
+
+      /* Generic varyings are laid out in 16 byte slots, in location order.
+       * NOTE(review): the table index (61) and the 1024 byte base offset are
+       * hardware/ABI constants not explained in this file -- confirm against
+       * the IDVS varying buffer layout before changing them. */
+      uint32_t loc = var->location - VARYING_SLOT_VAR0;
+      pan_pack(&descs[i], ATTRIBUTE, cfg) {
+         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_VERTEX_PACKET;
+         cfg.offset_enable = false;
+         cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
+         cfg.table = 61;
+         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
+         cfg.offset = 1024 + (loc * 16);
+         cfg.buffer_index = 0;
+         /* The strides cover every slot of the VS/FS pair, not just the
+          * inputs walked here. */
+         cfg.attribute_stride = varying_size;
+         cfg.packet_stride = varying_size + 16;
+      }
+   }
+}
+
 static VkResult
 prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
 {
@ -172,7 +239,7 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
   const struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.gfx.desc_state;
-   uint32_t desc_count = fs->desc_info.dyn_bufs.count + 1;
+   uint32_t desc_count = fs->desc_info.dyn_bufs.count + MAX_VARYING + 1;
   struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
      cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
   struct panvk_opaque_desc *descs = driver_set.cpu;
@ -180,13 +247,15 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
   if (desc_count && !driver_set.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

-   /* Dummy sampler always comes first. */
-   pan_cast_and_pack(&descs[0], SAMPLER, cfg) {
+   emit_varying_descs(cmdbuf, (struct mali_attribute_packed *)(&descs[0]));
+
+   /* Dummy sampler always comes right after the varyings. */
+   pan_cast_and_pack(&descs[MAX_VARYING], SAMPLER, cfg) {
      cfg.clamp_integer_array_indices = false;
   }

-   panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, fs,
-                                     (struct mali_buffer_packed *)(&descs[1]));
+   panvk_per_arch(cmd_fill_dyn_bufs)(
+      desc_state, fs, (struct mali_buffer_packed *)(&descs[1 + MAX_VARYING]));

   fs_desc_state->driver_set.dev_addr = driver_set.gpu;
   fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
@ -1650,16 +1719,8 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
   if (result != VK_SUCCESS)
      return result;

-   uint32_t varying_size = 0;
-
-   if (fs) {
-      unsigned vs_vars = vs->info.varyings.output_count;
-      unsigned fs_vars = fs->info.varyings.input_count;
-      unsigned var_slots = MAX2(vs_vars, fs_vars);
-
-      /* Assumes 16 byte slots. We could do better. */
-      varying_size = var_slots * 16;
-   }
+   /* Assumes 16 byte slots. We could do better. */
+   uint32_t varying_size = get_varying_slots(cmdbuf) * 16;

   cs_update_vt_ctx(b) {
      /* We don't use the resource dep system yet. */
--- a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c
+++ b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c
@ -1025,8 +1025,22 @@ create_copy_table(nir_shader *nir, struct lower_desc_ctx *ctx)
   for (uint32_t i = 0; i < PANVK_BIFROST_DESC_TABLE_COUNT; i++)
      copy_count += desc_info->others[i].count;
 #else
-   /* Dummy sampler comes after the vertex attributes. */
-   uint32_t dummy_sampler_idx = nir->info.stage == MESA_SHADER_VERTEX ? 16 : 0;
+   uint32_t dummy_sampler_idx;
+   switch (nir->info.stage) {
+   case MESA_SHADER_VERTEX:
+      /* Dummy sampler comes after the vertex attributes. */
+      dummy_sampler_idx = 16;
+      break;
+   case MESA_SHADER_FRAGMENT:
+      /* Dummy sampler comes after the varyings. */
+      dummy_sampler_idx = MAX_VARYING;
+      break;
+   case MESA_SHADER_COMPUTE:
+      dummy_sampler_idx = 0;
+      break;
+   default:
+      unreachable("unexpected stage");
+   }
   desc_info->dummy_sampler_handle = pan_res_handle(0, dummy_sampler_idx);

   copy_count = desc_info->dyn_bufs.count + desc_info->dyn_bufs.count;
--- a/src/panfrost/vulkan/panvk_vX_shader.c
+++ b/src/panfrost/vulkan/panvk_vX_shader.c
@ -1041,6 +1041,10 @@ panvk_compile_shader(struct panvk_device *dev,
      .gpu_id = phys_dev->kmod.props.gpu_prod_id,
      .no_ubo_to_push = true,
      .view_mask = (state && state->rp) ? state->rp->view_mask : 0,
+#if PAN_ARCH >= 9
+      /* LD_VAR_BUF does not support maxVertexOutputComponents (128) */
+      .valhall.use_ld_var_buf = false,
+#endif
   };

   if (info->stage == MESA_SHADER_FRAGMENT && state != NULL &&