r300: add partial CMP support on R5xx

VE_COND_MUX_GTE is a good match for the TGSI CMP opcode. However,
it is of limited use because of a general restriction of the vertex
shader engine: any single instruction can read at most two different
temporary registers. So we still have to lower CMP in some cases.

Shader-db RV530:
total instructions in shared programs: 130872 -> 130333 (-0.41%)
instructions in affected programs: 29854 -> 29315 (-1.81%)
helped: 294
HURT: 83
total temps in shared programs: 16747 -> 16775 (0.17%)
temps in affected programs: 407 -> 435 (6.88%)
helped: 10
HURT: 38

Reviewed-by: Filip Gawin <filip.gawin@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23691>
This commit is contained in:
Pavel Ondračka 2023-06-15 15:10:01 +02:00 committed by Marge Bot
parent e15a4e6e1a
commit 9db5da0f38
4 changed files with 44 additions and 2 deletions

View file

@ -237,6 +237,36 @@ static void ei_math1(struct r300_vertex_program_code *vp,
inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
}
/**
 * Encode a CMP sub-instruction as a single VE_COND_MUX_GTE vector
 * instruction.
 *
 * \param vp   vertex program code, forwarded to the operand translators
 * \param vpi  CMP sub-instruction to encode; the Index of one of its
 *             RC_FILE_NONE sources may be rewritten in place (see below)
 * \param inst output, four dwords: inst[0] receives the opcode/destination
 *             word and inst[1..3] the three source operand words
 */
static void ei_cmp(struct r300_vertex_program_code *vp,
		   struct rc_sub_instruction *vpi,
		   unsigned int * inst)
{
	/* Saturate bit of the destination word: set only for a [0, 1] clamp. */
	const int clamp01 = vpi->SaturateMode == RC_SATURATE_ZERO_ONE;

	inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE,
				     0,
				     0,
				     t_dst_index(vp, &vpi->DstReg),
				     t_dst_mask(vpi->DstReg.WriteMask),
				     t_dst_class(vpi->DstReg.File),
				     clamp01);

	/* A source that only carries a constant swizzle (RC_FILE_NONE) is
	 * still counted as a distinct temporary register. To avoid that,
	 * the first such source whose cyclic successor is also
	 * RC_FILE_NONE or is a real temporary borrows that successor's
	 * register index. At most one source is patched. */
	for (unsigned src = 0; src < 3; src++) {
		const unsigned neighbour = (src == 2) ? 0 : src + 1;

		if (vpi->SrcReg[src].File != RC_FILE_NONE) {
			continue;
		}
		if (vpi->SrcReg[neighbour].File != RC_FILE_NONE &&
		    vpi->SrcReg[neighbour].File != RC_FILE_TEMPORARY) {
			continue;
		}

		vpi->SrcReg[src].Index = vpi->SrcReg[neighbour].Index;
		break;
	}

	/* Sources 1 and 2 are emitted in swapped order relative to the CMP
	 * operands. NOTE(review): presumably because the hardware opcode
	 * selects on ">= 0" whereas CMP selects on "< 0" -- confirm against
	 * the PVS opcode documentation. */
	inst[1] = t_src(vp, &vpi->SrcReg[0]);
	inst[2] = t_src(vp, &vpi->SrcReg[2]);
	inst[3] = t_src(vp, &vpi->SrcReg[1]);
}
static void ei_lit(struct r300_vertex_program_code *vp,
struct rc_sub_instruction *vpi,
unsigned int * inst)
@ -414,6 +444,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break;
case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;

View file

@ -395,7 +395,18 @@ int radeonTransformALU(
static void transform_r300_vertex_CMP(struct radeon_compiler* c,
struct rc_instruction* inst)
{
/* There is no decent CMP available, so let's rig one up.
/* R5xx has a CMP, but we can use it only if it reads from less than
* three different temps. */
if (c->is_r500 &&
(inst->U.I.SrcReg[0].File != RC_FILE_TEMPORARY ||
inst->U.I.SrcReg[1].File != RC_FILE_TEMPORARY ||
inst->U.I.SrcReg[2].File != RC_FILE_TEMPORARY ||
inst->U.I.SrcReg[0].Index == inst->U.I.SrcReg[1].Index ||
inst->U.I.SrcReg[1].Index == inst->U.I.SrcReg[2].Index ||
inst->U.I.SrcReg[0].Index == inst->U.I.SrcReg[2].Index))
return;
/* There is no decent CMP available on r300, so let's rig one up.
* CMP is defined as dst = src0 < 0.0 ? src1 : src2
* The following sequence consumes zero to two temps and two extra slots
* (the second temp and the second slot is consumed by transform_LRP),

View file

@ -512,6 +512,7 @@ static int r300_get_video_param(struct pipe_screen *screen,
static const nir_shader_compiler_options r500_vs_compiler_options = {
COMMON_NIR_OPTIONS,
.has_fused_comp_and_csel = true,
/* Have HW loops support and 1024 max instr count, but don't unroll *too*
* hard.

View file

@ -1952,7 +1952,6 @@ static void* r300_create_vs_state(struct pipe_context* pipe,
.ubo_vec4_max = 0x00ff,
};
static const struct nir_to_tgsi_options hwtcl_r500_options = {
.lower_cmp = true,
.ubo_vec4_max = 0x00ff,
};
const struct nir_to_tgsi_options *ntt_options;