lima: ppir: handle ffma in the backend

ppir doesn't do a good job in fusing ffma, so allow nir to do it and handle
ffma in backend.

shader-db:
total instructions in shared programs: 29485 -> 29066 (-1.42%)
instructions in affected programs: 10362 -> 9943 (-4.04%)
helped: 114
HURT: 5
helped stats (abs) min: 1 max: 30 x̄: 3.72 x̃: 2
helped stats (rel) min: 0.78% max: 20.00% x̄: 5.66% x̃: 4.31%
HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
HURT stats (rel) min: 0.52% max: 1.09% x̄: 0.85% x̃: 0.98%
95% mean confidence interval for instructions value: -4.37 -2.67
95% mean confidence interval for instructions %-change: -6.10% -4.68%
Instructions are helped.

total loops in shared programs: 2 -> 2 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total spills in shared programs: 369 -> 367 (-0.54%)
spills in affected programs: 199 -> 197 (-1.01%)
helped: 8
HURT: 9

total fills in shared programs: 1265 -> 1208 (-4.51%)
fills in affected programs: 758 -> 701 (-7.52%)
helped: 11
HURT: 9

Reviewed-by: Erico Nunes <nunes.erico@gmail.com>
Signed-off-by: Vasily Khoruzhick <anarsoul@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33313>
2026-01-07 15:10:12 +01:00 · 2025-01-30 18:47:59 -08:00 · 2025-01-30 18:47:59 -08:00 · 3983e88c27
commit 3983e88c27
parent a4b1924b22
5 changed files with 74 additions and 6 deletions
--- a/src/gallium/drivers/lima/ci/lima-fails.txt
+++ b/src/gallium/drivers/lima/ci/lima-fails.txt
@@ -402,7 +402,6 @@ spec@glsl-1.20@execution@built-in-functions@fs-op-div-mat2x4-mat2x4,Fail
 spec@glsl-1.20@execution@built-in-functions@fs-op-div-mat3x2-mat3x2,Fail
 spec@glsl-1.20@execution@built-in-functions@fs-op-div-mat3x4-mat3x4,Fail
 spec@glsl-1.20@execution@built-in-functions@fs-op-div-mat4x3-mat4x3,Fail
-spec@glsl-1.20@execution@built-in-functions@fs-op-mult-mat4x3-mat3x4,Fail
 spec@glsl-1.20@execution@clipping@fixed-clip-enables,Fail
 spec@glsl-1.20@execution@clipping@vs-clip-vertex-const-reject,Fail
 spec@glsl-1.20@execution@clipping@vs-clip-vertex-different-from-position,Fail
--- a/src/gallium/drivers/lima/ir/lima_nir_duplicate_consts.c
+++ b/src/gallium/drivers/lima/ir/lima_nir_duplicate_consts.c
@@ -33,10 +33,19 @@ lima_nir_duplicate_load_const(nir_builder *b, nir_load_const_instr *load)

   nir_foreach_use_safe(use_src, &load->def) {
      nir_load_const_instr *dupl;
+      nir_instr *instr = nir_src_parent_instr(use_src);
+      nir_alu_instr *alu = NULL;
+      if (instr->type == nir_instr_type_alu)
+         alu = nir_instr_as_alu(instr);

-      if (last_parent_instr != nir_src_parent_instr(use_src)) {
+      /* Always clone consts for FFMA sources as well, since it will translate
+       * into 2 PPIR ops and each may need its own const. Redundant consts
+       * will be dropped by PPIR later
+       */
+      if (last_parent_instr != instr ||
+          (alu && alu->op == nir_op_ffma)) {
         /* if ssa use, clone for the target block */
-         b->cursor = nir_before_instr(nir_src_parent_instr(use_src));
+         b->cursor = nir_before_instr(instr);

         dupl = nir_load_const_instr_create(b->shader, load->def.num_components,
                                            load->def.bit_size);
--- a/src/gallium/drivers/lima/ir/pp/nir.c
+++ b/src/gallium/drivers/lima/ir/pp/nir.c
@@ -123,6 +123,57 @@ static void ppir_node_add_src(ppir_compiler *comp, ppir_node *node,
   ppir_node_target_assign(ps, child);
 }

+static bool ppir_emit_ffma(ppir_block *block, nir_instr *ni)
+{ /* Emit a NIR ffma as a ppir mul node feeding a ppir add node; returns false if either node cannot be created. */
+   nir_alu_instr *instr = nir_instr_as_alu(ni);
+   nir_def *def = &instr->def;
+   unsigned mask = nir_component_mask(def->num_components); /* mask derived from the def's component count */
+   uint8_t identity[4] = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y,
+                           PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W }; /* X,Y,Z,W in order; used below as the add's swizzle for the mul result */
+
+   ppir_alu_node *add = ppir_node_create_dest(block, ppir_op_add, def, mask); /* the add node is created for the ffma's NIR def */
+   if (!add)
+      return false;
+   ppir_alu_node *mul = ppir_node_create(block, ppir_op_mul, -1, mask); /* NOTE(review): -1 presumably means "no SSA index" - confirm against ppir_node_create */
+   if (!mul)
+      return false;
+
+   ppir_dest *mul_dest = &mul->dest;
+   ppir_dest *add_dest = &add->dest;
+
+   mul_dest->type = ppir_target_pipeline; /* the mul's destination is a pipeline target */
+   if (util_bitcount(add_dest->write_mask) == 1) { /* add writes exactly one component */
+      mul_dest->write_mask = 1;
+      mul_dest->pipeline = ppir_pipeline_reg_fmul;
+   } else { /* otherwise use the vmul pipeline register with write-mask bits 0-3 set */
+      mul_dest->write_mask = u_bit_consecutive(0, 4);
+      mul_dest->pipeline = ppir_pipeline_reg_vmul;
+   }
+
+   add->num_src = 2;
+   mul->num_src = 2;
+
+   for (int i = 0; i < 2; i++) { /* ffma sources 0 and 1 become the mul's two sources, swizzles copied as-is */
+      nir_alu_src *alu_src = instr->src + i;
+      ppir_src *ps = mul->src + i;
+      memcpy(ps->swizzle, alu_src->swizzle, sizeof(ps->swizzle));
+      ppir_node_add_src(block->comp, &mul->node, ps, &alu_src->src, mask);
+   }
+
+   nir_alu_src *alu_src = instr->src + 2; /* ffma source 2 becomes the add's second source (index 1) */
+   ppir_src *ps = add->src;
+   memcpy(ps[1].swizzle, alu_src->swizzle, sizeof(ps[1].swizzle));
+   ppir_node_add_src(block->comp, &add->node, ps + 1, &alu_src->src, mask);
+
+   memcpy(ps[0].swizzle, identity, sizeof(ps[0].swizzle)); /* the add's first source reads the mul result with the identity swizzle */
+   ppir_node_target_assign(&ps[0], &mul->node);
+   ppir_node_add_dep(&add->node, &mul->node, ppir_dep_src); /* record that add depends on mul as a source */
+
+   list_addtail(&add->node.list, &block->node_list); /* add is appended to the block's node list before mul */
+   list_addtail(&mul->node.list, &block->node_list);
+   return true;
+}
+
 static int nir_to_ppir_opcodes[nir_num_opcodes] = {
   [nir_op_mov] = ppir_op_mov,
   [nir_op_fmul] = ppir_op_mul,
@@ -152,6 +203,7 @@ static int nir_to_ppir_opcodes[nir_num_opcodes] = {
   [nir_op_ftrunc] = ppir_op_trunc,
   [nir_op_fsat] = ppir_op_sat,
   [nir_op_fclamp_pos] = ppir_op_clamp_pos,
+   [nir_op_ffma] = ppir_op_ffma,
 };

 static bool ppir_emit_alu(ppir_block *block, nir_instr *ni)
@@ -164,6 +216,11 @@ static bool ppir_emit_alu(ppir_block *block, nir_instr *ni)
      ppir_error("unsupported nir_op: %s\n", nir_op_infos[instr->op].name);
      return false;
   }
+
+   if (op == ppir_op_ffma) {
+      return ppir_emit_ffma(block, ni);
+   }
+
   unsigned mask = nir_component_mask(def->num_components);
   ppir_alu_node *node = ppir_node_create_dest(block, op, def, mask);
   if (!node)
--- a/src/gallium/drivers/lima/ir/pp/ppir.h
+++ b/src/gallium/drivers/lima/ir/pp/ppir.h
@@ -118,6 +118,8 @@ typedef enum {
   ppir_op_undef,
   ppir_op_dummy,

+   ppir_op_ffma,
+
   ppir_op_num,
 } ppir_op;

--- a/src/gallium/drivers/lima/lima_program.c
+++ b/src/gallium/drivers/lima/lima_program.c
@@ -69,9 +69,9 @@ static const nir_shader_compiler_options vs_nir_options = {
 };

 static const nir_shader_compiler_options fs_nir_options = {
-   .lower_ffma16 = true,
-   .lower_ffma32 = true,
-   .lower_ffma64 = true,
+   .fuse_ffma16 = true,
+   .fuse_ffma32 = true,
+   .fuse_ffma64 = true,
   .lower_fpow = true,
   .lower_fdiv = true,
   .lower_fmod = true,
@@ -266,6 +266,7 @@ lima_program_optimize_fs_nir(struct nir_shader *s,

   /* Must be run after optimization loop */
   NIR_PASS_V(s, lima_nir_scale_trig);
+   NIR_PASS_V(s, nir_opt_algebraic_late);
   NIR_PASS_V(s, lima_nir_ppir_algebraic_late);

   NIR_PASS_V(s, nir_copy_prop);