r300: use w channel for scalar opcodes if possible

The opcodes write to w by default so using anything else means we can't
schedule anything in the rbg slot anyway becasue we have to replicate the
result from w. We already attempt to do this during the scheduling, but
at that point it is more tricky, so doing it early leads to much better
code. Performance++

RV530 benchmarks:

Lightsmark, 1280x800, fullscreen
before:
    N           Min           Max        Median           Avg        Stddev
x   5         27.32         27.36         27.34         27.34   0.015811388
after:
    N           Min           Max        Median           Avg        Stddev
x   5         27.53         27.61         27.59        27.576   0.034351128

Unigine Sanctuary, 1280x800, fullscreen, medium shaders
before:
    N           Min           Max        Median           Avg        Stddev
x   5       10.1211       10.1238       10.1214      10.12192  0.0011211601
after:
    N           Min           Max        Median           Avg        Stddev
x   5       10.4607       10.4637       10.4619      10.46206  0.0012441865

RV530 shader-db:
total instructions in shared programs: 129643 -> 128038 (-1.24%)
instructions in affected programs: 45415 -> 43810 (-3.53%)
helped: 514
HURT: 43
total presub in shared programs: 4912 -> 5201 (5.88%)
presub in affected programs: 752 -> 1041 (38.43%)
helped: 40
HURT: 30
total omod in shared programs: 381 -> 383 (0.52%)
omod in affected programs: 6 -> 8 (33.33%)
helped: 1
HURT: 3
total temps in shared programs: 16904 -> 16841 (-0.37%)
temps in affected programs: 1377 -> 1314 (-4.58%)
helped: 81
HURT: 52
total lits in shared programs: 3555 -> 3550 (-0.14%)
lits in affected programs: 294 -> 289 (-1.70%)
helped: 13
HURT: 11
total cycles in shared programs: 194771 -> 193734 (-0.53%)
cycles in affected programs: 79079 -> 78042 (-1.31%)
helped: 452
HURT: 84
GAINED: shaders/glamor/82.shader_test FS

RV370 shader-db:
total instructions in shared programs: 82116 -> 81600 (-0.63%)
instructions in affected programs: 11888 -> 11372 (-4.34%)
helped: 273
HURT: 40
total temps in shared programs: 12438 -> 12441 (0.02%)
temps in affected programs: 692 -> 695 (0.43%)
helped: 36
HURT: 39
total cycles in shared programs: 128140 -> 127630 (-0.40%)
cycles in affected programs: 25838 -> 25328 (-1.97%)
helped: 266
HURT: 41
GAINED: shaders/0ad/12.shader_test FS
GAINED: shaders/CC3-tiberium-wars/314.shader_test FS
GAINED: shaders/lightsmark/16.shader_test FS
GAINED: shaders/sanctuary/159.shader_test FS
GAINED: shaders/sanctuary/162.shader_test FS
GAINED: shaders/sanctuary/51.shader_test FS
GAINED: shaders/sanctuary/54.shader_test FS
GAINED: shaders/trine/fp-422.shader_test FS

Partial fix for: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6661

Reviewed-by: Filip Gawin <filip.gawin@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24889>
This commit is contained in:
Pavel Ondračka 2023-08-23 13:55:48 +02:00 committed by Marge Bot
parent 326080428e
commit 58f1931104

View file

@ -30,6 +30,8 @@
#include "radeon_program_tex.h"
#include "radeon_rename_regs.h"
#include "radeon_remove_constants.h"
#include "radeon_variable.h"
#include "radeon_list.h"
#include "r300_fragprog.h"
#include "r300_fragprog_swizzle.h"
#include "r500_fragprog.h"
@ -65,6 +67,48 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user)
}
}
/**
* This function will try to convert rgb instructions into alpha instructions
* and vice versa. While this is already attempted during the pair scheduling,
* it is much simpler to do it before pair conversion, so do it here at least for
* the simple cases.
*
* Currently only math opcodes writing to rgb (and with no friends) are
* converted to alpha.
*
* This function assumes all the instructions are still of type
* RC_INSTRUCTION_NORMAL, the conversion is much simpler.
*
* Beware that this needs to be also called before doing presubtract, because
* rc_get_variables can't get properly readers for normal instructions if presubtract
* is present (it works fine for pair instructions).
*/
static void rc_convert_rgb_alpha(struct radeon_compiler *c, void *user)
{
struct rc_list * variables;
struct rc_list * var_ptr;
variables = rc_get_variables(c);
for (var_ptr = variables; var_ptr; var_ptr = var_ptr->Next) {
struct rc_variable * var = var_ptr->Item;
if (var->Inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
continue;
}
/* Only rewrite scalar opcodes that are used separatelly for now. */
if (var->Friend)
continue;
const struct rc_opcode_info * opcode = rc_get_opcode_info(var->Inst->U.I.Opcode);
if (opcode->IsStandardScalar && var->Dst.WriteMask != RC_MASK_W) {
unsigned index = rc_find_free_temporary(c);
rc_variable_change_dst(var, index, RC_MASK_W);
}
}
}
void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
{
int is_r500 = c->Base.is_r500;
@ -104,6 +148,7 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
{"native rewrite", 1, is_r500, rc_local_transform, native_rewrite_r500},
{"native rewrite", 1, !is_r500, rc_local_transform, native_rewrite_r300},
{"deadcode", 1, opt, rc_dataflow_deadcode, NULL},
{"convert rgb<->alpha", 1, opt, rc_convert_rgb_alpha, NULL},
{"register rename", 1, !is_r500 || opt, rc_rename_regs, NULL},
{"dataflow optimize", 1, opt, rc_optimize, NULL},
{"inline literals", 1, is_r500 && opt, rc_inline_literals, NULL},