lima: ppir: handle ffma in the backend

ppir doesn't do a good job of fusing ffma, so let NIR do the fusing and
handle ffma in the backend.

shader-db:

total instructions in shared programs: 29485 -> 29066 (-1.42%)
instructions in affected programs: 10362 -> 9943 (-4.04%)
helped: 114
HURT: 5
helped stats (abs) min: 1 max: 30 x̄: 3.72 x̃: 2
helped stats (rel) min: 0.78% max: 20.00% x̄: 5.66% x̃: 4.31%
HURT stats (abs)   min: 1 max: 1 x̄: 1.00 x̃: 1
HURT stats (rel)   min: 0.52% max: 1.09% x̄: 0.85% x̃: 0.98%
95% mean confidence interval for instructions value: -4.37 -2.67
95% mean confidence interval for instructions %-change: -6.10% -4.68%
Instructions are helped.

total loops in shared programs: 2 -> 2 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total spills in shared programs: 369 -> 367 (-0.54%)
spills in affected programs: 199 -> 197 (-1.01%)
helped: 8
HURT: 9

total fills in shared programs: 1265 -> 1208 (-4.51%)
fills in affected programs: 758 -> 701 (-7.52%)
helped: 11
HURT: 9

Reviewed-by: Erico Nunes <nunes.erico@gmail.com>
Signed-off-by: Vasily Khoruzhick <anarsoul@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33313>
This commit is contained in:
Vasily Khoruzhick 2025-01-30 18:47:59 -08:00 committed by Marge Bot
parent a4b1924b22
commit 3983e88c27
5 changed files with 74 additions and 6 deletions

View file

@ -402,7 +402,6 @@ spec@glsl-1.20@execution@built-in-functions@fs-op-div-mat2x4-mat2x4,Fail
spec@glsl-1.20@execution@built-in-functions@fs-op-div-mat3x2-mat3x2,Fail
spec@glsl-1.20@execution@built-in-functions@fs-op-div-mat3x4-mat3x4,Fail
spec@glsl-1.20@execution@built-in-functions@fs-op-div-mat4x3-mat4x3,Fail
spec@glsl-1.20@execution@built-in-functions@fs-op-mult-mat4x3-mat3x4,Fail
spec@glsl-1.20@execution@clipping@fixed-clip-enables,Fail
spec@glsl-1.20@execution@clipping@vs-clip-vertex-const-reject,Fail
spec@glsl-1.20@execution@clipping@vs-clip-vertex-different-from-position,Fail

View file

@ -33,10 +33,19 @@ lima_nir_duplicate_load_const(nir_builder *b, nir_load_const_instr *load)
nir_foreach_use_safe(use_src, &load->def) {
nir_load_const_instr *dupl;
nir_instr *instr = nir_src_parent_instr(use_src);
nir_alu_instr *alu = NULL;
if (instr->type == nir_instr_type_alu)
alu = nir_instr_as_alu(instr);
if (last_parent_instr != nir_src_parent_instr(use_src)) {
/* Always clone consts for FFMA sources as well, since it will translate
* into 2 PPIR ops and each may need its own const. Redundant consts
* will be dropped by PPIR later
*/
if (last_parent_instr != instr ||
(alu && alu->op == nir_op_ffma)) {
/* if ssa use, clone for the target block */
b->cursor = nir_before_instr(nir_src_parent_instr(use_src));
b->cursor = nir_before_instr(instr);
dupl = nir_load_const_instr_create(b->shader, load->def.num_components,
load->def.bit_size);

View file

@ -123,6 +123,57 @@ static void ppir_node_add_src(ppir_compiler *comp, ppir_node *node,
ppir_node_target_assign(ps, child);
}
/*
 * Translate a NIR ffma ALU instruction into two ppir ALU nodes: a mul node
 * that consumes NIR sources 0 and 1, and an add node that consumes the mul
 * result together with NIR source 2.
 *
 * The add node owns the destination created for the NIR def. The mul node's
 * destination is a pipeline target: the fmul pipeline register when the add
 * writes exactly one component, the vmul pipeline register otherwise.
 *
 * Returns false when either node cannot be created, true on success.
 */
static bool ppir_emit_ffma(ppir_block *block, nir_instr *ni)
{
   nir_alu_instr *ffma = nir_instr_as_alu(ni);
   nir_def *ssa = &ffma->def;
   unsigned comp_mask = nir_component_mask(ssa->num_components);
   const uint8_t identity_swz[4] = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y,
                                     PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W };

   ppir_alu_node *add_node =
      ppir_node_create_dest(block, ppir_op_add, ssa, comp_mask);
   if (!add_node)
      return false;

   ppir_alu_node *mul_node =
      ppir_node_create(block, ppir_op_mul, -1, comp_mask);
   if (!mul_node)
      return false;

   /* Route the product through a pipeline register; which one depends on
    * whether the add writes a single component or several.
    */
   bool single_comp = util_bitcount(add_node->dest.write_mask) == 1;
   mul_node->dest.type = ppir_target_pipeline;
   mul_node->dest.write_mask = single_comp ? 1 : u_bit_consecutive(0, 4);
   mul_node->dest.pipeline = single_comp ? ppir_pipeline_reg_fmul
                                         : ppir_pipeline_reg_vmul;

   add_node->num_src = 2;
   mul_node->num_src = 2;

   /* The two multiplicands become the mul node's sources, swizzles included. */
   for (int idx = 0; idx < 2; idx++) {
      nir_alu_src *nir_operand = &ffma->src[idx];
      ppir_src *factor = &mul_node->src[idx];

      memcpy(factor->swizzle, nir_operand->swizzle, sizeof(factor->swizzle));
      ppir_node_add_src(block->comp, &mul_node->node, factor,
                        &nir_operand->src, comp_mask);
   }

   /* The addend (NIR source 2) is the add node's second source. */
   nir_alu_src *nir_addend = &ffma->src[2];
   ppir_src *addend = &add_node->src[1];
   memcpy(addend->swizzle, nir_addend->swizzle, sizeof(addend->swizzle));
   ppir_node_add_src(block->comp, &add_node->node, addend,
                     &nir_addend->src, comp_mask);

   /* The add node's first source is the mul result, read unswizzled. */
   ppir_src *product = &add_node->src[0];
   memcpy(product->swizzle, identity_swz, sizeof(product->swizzle));
   ppir_node_target_assign(product, &mul_node->node);
   ppir_node_add_dep(&add_node->node, &mul_node->node, ppir_dep_src);

   /* Append both nodes to the block: add first, then mul. */
   list_addtail(&add_node->node.list, &block->node_list);
   list_addtail(&mul_node->node.list, &block->node_list);

   return true;
}
static int nir_to_ppir_opcodes[nir_num_opcodes] = {
[nir_op_mov] = ppir_op_mov,
[nir_op_fmul] = ppir_op_mul,
@ -152,6 +203,7 @@ static int nir_to_ppir_opcodes[nir_num_opcodes] = {
[nir_op_ftrunc] = ppir_op_trunc,
[nir_op_fsat] = ppir_op_sat,
[nir_op_fclamp_pos] = ppir_op_clamp_pos,
[nir_op_ffma] = ppir_op_ffma,
};
static bool ppir_emit_alu(ppir_block *block, nir_instr *ni)
@ -164,6 +216,11 @@ static bool ppir_emit_alu(ppir_block *block, nir_instr *ni)
ppir_error("unsupported nir_op: %s\n", nir_op_infos[instr->op].name);
return false;
}
if (op == ppir_op_ffma) {
return ppir_emit_ffma(block, ni);
}
unsigned mask = nir_component_mask(def->num_components);
ppir_alu_node *node = ppir_node_create_dest(block, op, def, mask);
if (!node)

View file

@ -118,6 +118,8 @@ typedef enum {
ppir_op_undef,
ppir_op_dummy,
ppir_op_ffma,
ppir_op_num,
} ppir_op;

View file

@ -69,9 +69,9 @@ static const nir_shader_compiler_options vs_nir_options = {
};
static const nir_shader_compiler_options fs_nir_options = {
.lower_ffma16 = true,
.lower_ffma32 = true,
.lower_ffma64 = true,
.fuse_ffma16 = true,
.fuse_ffma32 = true,
.fuse_ffma64 = true,
.lower_fpow = true,
.lower_fdiv = true,
.lower_fmod = true,
@ -266,6 +266,7 @@ lima_program_optimize_fs_nir(struct nir_shader *s,
/* Must be run after optimization loop */
NIR_PASS_V(s, lima_nir_scale_trig);
NIR_PASS_V(s, nir_opt_algebraic_late);
NIR_PASS_V(s, lima_nir_ppir_algebraic_late);
NIR_PASS_V(s, nir_copy_prop);