intel/elk: Remove multi-polygon support

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27629>
This commit is contained in:
Caio Oliveira 2024-02-13 13:26:59 -08:00 committed by Marge Bot
parent fd3a815a5b
commit be73fa1434
9 changed files with 30 additions and 144 deletions

View file

@@ -297,8 +297,8 @@ iris_apply_elk_wm_prog_data(struct iris_compiled_shader *shader,
iris->flat_inputs = elk->flat_inputs;
iris->inputs = elk->inputs;
iris->computed_depth_mode = elk->computed_depth_mode;
iris->max_polygons = elk->max_polygons;
iris->dispatch_multi = elk->dispatch_multi;
iris->max_polygons = 1;
iris->dispatch_multi = 0;
iris->computed_stencil = elk->computed_stencil;
iris->early_fragment_tests = elk->early_fragment_tests;
iris->post_depth_coverage = elk->post_depth_coverage;

View file

@@ -89,10 +89,8 @@ intel_set_ps_dispatch_state(struct GENX(3DSTATE_PS) *ps,
}
assert(enable_8 || enable_16 || enable_32);
assert(!prog_data->dispatch_multi);
ps->_8PixelDispatchEnable = enable_8 ||
(GFX_VER == 12 && prog_data->dispatch_multi);
ps->_8PixelDispatchEnable = enable_8;
ps->_16PixelDispatchEnable = enable_16;
ps->_32PixelDispatchEnable = enable_32;
}

View file

@@ -833,18 +833,6 @@ struct elk_wm_prog_data {
uint8_t color_outputs_written;
uint8_t computed_depth_mode;
/**
* Number of polygons handled in parallel by the multi-polygon PS
* kernel.
*/
uint8_t max_polygons;
/**
* Dispatch width of the multi-polygon PS kernel, or 0 if no
* multi-polygon kernel was built.
*/
uint8_t dispatch_multi;
bool computed_stencil;
bool early_fragment_tests;
bool post_depth_coverage;
@@ -1791,7 +1779,7 @@ elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
*/
static inline bool
elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
gl_shader_stage stage, unsigned max_polygons,
gl_shader_stage stage,
const struct elk_stage_prog_data *prog_data)
{
/* The code below makes assumptions about the hardware's thread dispatch
@@ -1814,8 +1802,7 @@ elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
const struct elk_wm_prog_data *wm_prog_data =
(const struct elk_wm_prog_data *)prog_data;
return !wm_prog_data->persample_dispatch &&
wm_prog_data->uses_vmask &&
max_polygons < 2;
wm_prog_data->uses_vmask;
}
case MESA_SHADER_COMPUTE:
/* Compute shaders will be spawned with either a fully enabled dispatch

View file

@@ -1505,48 +1505,14 @@ elk_fs_visitor::assign_urb_setup()
* 3 Attr0.w a1-a0 a2-a0 N/A a0
* 4 Attr1.x a1-a0 a2-a0 N/A a0
* ...
*
* In multipolygon mode that no longer works since
* different channels may be processing polygons with
* different plane parameters, so each parameter above is
* represented as a dispatch_width-wide vector:
*
* elk_fs_reg::nr elk_fs_reg::offset Input Comp0 ... CompN
* 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N]
* 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N]
* 0 8 * dispatch_width Attr0.x N/A ... N/A
* 0 12 * dispatch_width Attr0.x a0[0] ... a0[N]
* 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N]
* ...
*
* Note that many of the components on a single row above
* are likely to be replicated multiple times (if, say, a
* single SIMD thread is only processing 2 different
* polygons), so plane parameters aren't actually stored
* in GRF memory with that layout to avoid wasting space.
* Instead we compose ATTR register regions with a 2D
* region that walks through the parameters of each
* polygon with the correct stride, reading the parameter
* corresponding to each channel directly from the PS
* thread payload.
*
* The latter layout corresponds to a param_width equal to
* dispatch_width, while the former (scalar parameter)
* layout has a param_width of 1.
*
* Gfx20+ represent plane parameters in a format similar
* to the above, except the parameters are packed in 12B
* and ordered like "a0, a1-a0, a2-a0" instead of the
* above vec4 representation with a missing component.
*/
const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1);
const unsigned param_width = 1;
/* Size of a single scalar component of a plane parameter
* in bytes.
*/
const unsigned chan_sz = 4;
struct elk_reg reg;
assert(max_polygons > 0);
/* Calculate the base register on the thread payload of
* either the block of vertex setup data or the block of
@@ -1558,7 +1524,7 @@ elk_fs_visitor::assign_urb_setup()
const unsigned base = urb_start +
(per_prim ? 0 :
ALIGN(prog_data->num_per_primitive_inputs / 2,
reg_unit(devinfo)) * max_polygons);
reg_unit(devinfo)));
const unsigned idx = per_prim ? inst->src[i].nr :
inst->src[i].nr - prog_data->num_per_primitive_inputs;
@@ -1570,7 +1536,7 @@ elk_fs_visitor::assign_urb_setup()
* Earlier platforms and per-primitive block pack 2 logical
* input components per 32B register.
*/
const unsigned grf = base + idx / 2 * max_polygons;
const unsigned grf = base + idx / 2;
assert(inst->src[i].offset / param_width < REG_SIZE / 2);
const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
inst->src[i].offset / (param_width * chan_sz) * chan_sz +
@@ -1594,13 +1560,13 @@ elk_fs_visitor::assign_urb_setup()
* but they may be replicated multiple times for multipolygon
* dispatch.
*/
this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons;
this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
/* Unlike regular attributes, per-primitive attributes have all 4 channels
* in the same slot, so each GRF can store two slots.
*/
assert(prog_data->num_per_primitive_inputs % 2 == 0);
this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons;
this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
}
void
@@ -2931,8 +2897,7 @@ elk_fs_visitor::eliminate_find_live_channel()
bool progress = false;
unsigned depth = 0;
if (!elk_stage_has_packed_dispatch(devinfo, stage, max_polygons,
stage_prog_data)) {
if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
/* The optimization below assumes that channel zero is live on thread
* dispatch, which may not be the case if the fixed function dispatches
* threads sparsely.
@@ -4226,19 +4191,6 @@ get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
/* Maximum execution size representable in the instruction controls. */
unsigned max_width = MIN2(32, inst->exec_size);
/* Number of channels per polygon handled by a multipolygon PS shader. */
const unsigned poly_width = shader->dispatch_width /
MAX2(1, shader->max_polygons);
/* Number of registers that will be read by an ATTR source if
* present for multipolygon PS shaders, since the PS vertex setup
* data for each polygon is stored in different contiguous GRFs.
*/
const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT ||
shader->max_polygons < 2 ? 0 :
DIV_ROUND_UP(inst->exec_size,
poly_width) * reg_unit(devinfo));
/* According to the PRMs:
* "A. In Direct Addressing mode, a source cannot span more than 2
* adjacent GRF registers.
@@ -4251,8 +4203,7 @@ get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
for (unsigned i = 0; i < inst->sources; i++)
reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE),
(inst->src[i].file == ATTR ? attr_reg_count : 0));
reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
/* Calculate the maximum execution size of the instruction based on the
* factor by which it goes over the hardware limit of 2 GRFs.
@@ -5200,8 +5151,7 @@ elk_fs_visitor::lower_find_live_channel()
return false;
bool packed_dispatch =
elk_stage_has_packed_dispatch(devinfo, stage, max_polygons,
stage_prog_data);
elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data);
bool vmask =
stage == MESA_SHADER_FRAGMENT &&
elk_wm_prog_data(stage_prog_data)->uses_vmask;
@@ -6505,7 +6455,6 @@ elk_nir_populate_wm_prog_data(nir_shader *shader,
prog_data->uses_omask = !key->ignore_sample_mask_out &&
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
prog_data->color_outputs_written = key->color_outputs_valid;
prog_data->max_polygons = 1;
prog_data->computed_depth_mode = computed_depth_mode(shader);
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
@@ -6681,7 +6630,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
bool has_spilled = false;
v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
prog_data, nir, 8, 1,
prog_data, nir, 8,
params->base.stats != NULL,
debug_enabled);
if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
@@ -6716,7 +6665,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
(INTEL_SIMD(FS, 16) || params->use_rep_send)) {
/* Try a SIMD16 compile */
v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
prog_data, nir, 16, 1,
prog_data, nir, 16,
params->base.stats != NULL,
debug_enabled);
if (v8)
@@ -6749,7 +6698,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
INTEL_SIMD(FS, 32)) {
/* Try a SIMD32 compile */
v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
prog_data, nir, 32, 1,
prog_data, nir, 32,
params->base.stats != NULL,
debug_enabled);
if (v8)
@@ -6830,7 +6779,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
if (simd8_cfg) {
prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8, v8->shader_stats,
v8->performance_analysis.require(), stats, 1);
v8->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
max_dispatch_width = 8;
}
@@ -6839,7 +6788,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
prog_data->dispatch_16 = true;
prog_data->prog_offset_16 = g.generate_code(
simd16_cfg, 16, v16->shader_stats,
v16->performance_analysis.require(), stats, 1);
v16->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
max_dispatch_width = 16;
}
@@ -6848,7 +6797,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
prog_data->dispatch_32 = true;
prog_data->prog_offset_32 = g.generate_code(
simd32_cfg, 32, v32->shader_stats,
v32->performance_analysis.require(), stats, 1);
v32->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
max_dispatch_width = 32;
}
@@ -7146,7 +7095,6 @@ elk_fs_test_dispatch_packing(const fs_builder &bld)
elk_wm_prog_data(shader->stage_prog_data)->uses_vmask;
if (elk_stage_has_packed_dispatch(shader->devinfo, stage,
shader->max_polygons,
shader->stage_prog_data)) {
const fs_builder ubld = bld.exec_all().group(1, 0);
const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0);

View file

@@ -177,7 +177,6 @@ public:
struct elk_wm_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
unsigned num_polygons,
bool needs_register_pressure,
bool debug_enabled);
elk_fs_visitor(const struct elk_compiler *compiler,
@@ -400,7 +399,6 @@ public:
bool needs_register_pressure;
const unsigned dispatch_width; /**< 8, 16 or 32 */
const unsigned max_polygons;
unsigned max_dispatch_width;
/* The API selected subgroup size */
@@ -451,8 +449,7 @@ public:
int generate_code(const elk_cfg_t *cfg, int dispatch_width,
struct shader_stats shader_stats,
const elk::performance &perf,
struct elk_compile_stats *stats,
unsigned max_polygons = 0);
struct elk_compile_stats *stats);
void add_const_data(void *data, unsigned size);
const unsigned *get_assembly();

View file

@@ -660,8 +660,7 @@ instruction_requires_packed_data(elk_fs_inst *inst)
static bool
try_copy_propagate(const elk_compiler *compiler, elk_fs_inst *inst,
acp_entry *entry, int arg,
const elk::simple_allocator &alloc,
uint8_t max_polygons)
const elk::simple_allocator &alloc)
{
if (inst->src[arg].file != VGRF)
return false;
@@ -799,17 +798,6 @@ try_copy_propagate(const elk_compiler *compiler, elk_fs_inst *inst,
(reg_offset(inst->dst) % REG_SIZE) != (reg_offset(entry->src) % REG_SIZE))
return false;
/* The <8;8,0> regions used for FS attributes in multipolygon
* dispatch mode could violate regioning restrictions, don't copy
* propagate them in such cases.
*/
if (entry->src.file == ATTR && max_polygons > 1 &&
(has_dst_aligned_region_restriction(devinfo, inst, dst_type) ||
instruction_requires_packed_data(inst) ||
(inst->elk_is_3src(compiler) && arg == 2) ||
entry->dst.type != inst->src[arg].type))
return false;
/* Bail if the source FIXED_GRF region of the copy cannot be trivially
* composed with the source region of the instruction -- E.g. because the
* copy uses some extended stride greater than 4 not supported natively by
@@ -1245,8 +1233,7 @@ can_propagate_from(elk_fs_inst *inst)
static bool
opt_copy_propagation_local(const elk_compiler *compiler, linear_ctx *lin_ctx,
elk_bblock_t *block, struct acp &acp,
const elk::simple_allocator &alloc,
uint8_t max_polygons)
const elk::simple_allocator &alloc)
{
bool progress = false;
@@ -1266,8 +1253,7 @@ opt_copy_propagation_local(const elk_compiler *compiler, linear_ctx *lin_ctx,
break;
}
} else {
if (try_copy_propagate(compiler, inst, *iter, i, alloc,
max_polygons)) {
if (try_copy_propagate(compiler, inst, *iter, i, alloc)) {
instruction_progress = true;
break;
}
@@ -1373,8 +1359,7 @@ elk_fs_visitor::opt_copy_propagation()
*/
foreach_block (block, cfg) {
progress = opt_copy_propagation_local(compiler, lin_ctx, block,
out_acp[block->num], alloc,
max_polygons) || progress;
out_acp[block->num], alloc) || progress;
/* If the destination of an ACP entry exists only within this block,
* then there's no need to keep it for dataflow analysis. We can delete
@@ -1414,7 +1399,7 @@ elk_fs_visitor::opt_copy_propagation()
}
progress = opt_copy_propagation_local(compiler, lin_ctx, block,
in_acp, alloc, max_polygons) ||
in_acp, alloc) ||
progress;
}

View file

@@ -1517,8 +1517,7 @@ int
elk_fs_generator::generate_code(const elk_cfg_t *cfg, int dispatch_width,
struct shader_stats shader_stats,
const elk::performance &perf,
struct elk_compile_stats *stats,
unsigned max_polygons)
struct elk_compile_stats *stats)
{
/* align to 64 byte boundary. */
elk_realign(p, 64);
@@ -2273,7 +2272,6 @@ elk_fs_generator::generate_code(const elk_cfg_t *cfg, int dispatch_width,
before_size, after_size);
if (stats) {
stats->dispatch_width = dispatch_width;
stats->max_polygons = max_polygons;
stats->max_dispatch_width = dispatch_width;
stats->instructions = before_size / 16 - nop_count - sync_nop_count;
stats->sends = send_count;

View file

@@ -64,19 +64,7 @@ elk_fs_visitor::interp_reg(const fs_builder &bld, unsigned location,
const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
const unsigned regnr = per_vertex_start + (nr * 4) + channel;
if (max_polygons > 1) {
/* In multipolygon dispatch each plane parameter is a
* dispatch_width-wide SIMD vector (see comment in
* assign_urb_setup()), so we need to use offset() instead of
* component() to select the specified parameter.
*/
const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.MOV(tmp, offset(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_UD),
dispatch_width, comp));
return retype(tmp, ELK_REGISTER_TYPE_F);
} else {
return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp);
}
return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp);
}
/* The register location here is relative to the start of the URB
@@ -99,19 +87,7 @@ elk_fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned
assert(regnr < prog_data->num_per_primitive_inputs);
if (max_polygons > 1) {
/* In multipolygon dispatch each primitive constant is a
* dispatch_width-wide SIMD vector (see comment in
* assign_urb_setup()), so we need to use offset() instead of
* component() to select the specified parameter.
*/
const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.MOV(tmp, offset(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_UD),
dispatch_width, comp % 4));
return retype(tmp, ELK_REGISTER_TYPE_F);
} else {
return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4);
}
return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4);
}
/** Emits the interpolation for the varying inputs. */
@@ -878,7 +854,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
performance_analysis(this),
needs_register_pressure(needs_register_pressure),
dispatch_width(dispatch_width),
max_polygons(0),
api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
init();
@@ -889,7 +864,7 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
const elk_wm_prog_key *key,
struct elk_wm_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width, unsigned max_polygons,
unsigned dispatch_width,
bool needs_register_pressure,
bool debug_enabled)
: elk_backend_shader(compiler, params, shader, &prog_data->base,
@@ -899,7 +874,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
performance_analysis(this),
needs_register_pressure(needs_register_pressure),
dispatch_width(dispatch_width),
max_polygons(max_polygons),
api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
init();
@@ -924,7 +898,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
performance_analysis(this),
needs_register_pressure(needs_register_pressure),
dispatch_width(8),
max_polygons(0),
api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
init();

View file

@@ -1208,7 +1208,7 @@ vec4_visitor::eliminate_find_live_channel()
bool progress = false;
unsigned depth = 0;
if (!elk_stage_has_packed_dispatch(devinfo, stage, 0, stage_prog_data)) {
if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
/* The optimization below assumes that channel zero is live on thread
* dispatch, which may not be the case if the fixed function dispatches
* threads sparsely.