aco/ra: convert bitwise instruction to gfx11+ 16bit on demand
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

The 32bit versions are smaller, allow more optimizations and VOPD,
so only use the 16bit opcodes if necessary.

Foz-DB Navi31:
Totals from 84 (0.10% of 80237) affected shaders:
Instrs: 176673 -> 176347 (-0.18%); split: -0.20%, +0.01%
CodeSize: 970148 -> 969716 (-0.04%); split: -0.08%, +0.03%
VGPRs: 5876 -> 5864 (-0.20%)
Latency: 2805974 -> 2805674 (-0.01%); split: -0.02%, +0.01%
InvThroughput: 769007 -> 768738 (-0.03%); split: -0.04%, +0.01%
VClause: 2593 -> 2597 (+0.15%)
Copies: 23749 -> 23487 (-1.10%); split: -1.11%, +0.00%
VALU: 107124 -> 106862 (-0.24%); split: -0.25%, +0.00%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35919>
This commit is contained in:
Georg Lehmann 2025-07-03 16:34:05 +02:00 committed by Marge Bot
parent 404e1f13e8
commit a6a6c2f691
2 changed files with 60 additions and 0 deletions

View file

@ -271,6 +271,12 @@ withoutVOP3(Format format)
return (Format)((uint32_t)format & ~((uint32_t)Format::VOP3));
}
/* Returns `format` with every bit of Format::VOP2 cleared; all other bits
 * of the format value are passed through unchanged.
 */
constexpr Format
withoutVOP2(Format format)
{
   const uint32_t vop2_bits = (uint32_t)Format::VOP2;
   const uint32_t format_bits = (uint32_t)format;
   return (Format)(format_bits & ~vop2_bits);
}
enum class RegType {
sgpr,
vgpr,

View file

@ -568,6 +568,36 @@ is_sgpr_writable_without_side_effects(amd_gfx_level gfx_level, PhysReg reg)
(!has_flat_scr_lo_gfx7_or_xnack_mask || (reg != 104 || reg != 105));
}
/* Rewrites, in place, one of the 32-bit opcodes v_cndmask_b32, v_mov_b32,
 * v_not_b32, v_and_b32, v_or_b32 or v_xor_b32 into the matching *_b16 opcode.
 *
 * Returns true if the instruction was rewritten and false (leaving the
 * instruction untouched) for any other opcode.
 */
static bool
convert_bitwise_to_16bit(Instruction* instr)
{
   aco_opcode opcode_16bit;
   /* Whether the format has to be switched to VOP3 with the VOP2 bits dropped. */
   bool switch_to_vop3 = false;
   /* Whether the abs/neg modifiers have to be reset. */
   bool reset_abs_neg = false;

   switch (instr->opcode) {
   case aco_opcode::v_cndmask_b32:
      opcode_16bit = aco_opcode::v_cndmask_b16;
      switch_to_vop3 = true;
      reset_abs_neg = true;
      break;
   case aco_opcode::v_mov_b32:
      opcode_16bit = aco_opcode::v_mov_b16;
      reset_abs_neg = true;
      break;
   case aco_opcode::v_not_b32:
      opcode_16bit = aco_opcode::v_not_b16;
      break;
   case aco_opcode::v_and_b32:
      opcode_16bit = aco_opcode::v_and_b16;
      switch_to_vop3 = true;
      break;
   case aco_opcode::v_or_b32:
      opcode_16bit = aco_opcode::v_or_b16;
      switch_to_vop3 = true;
      break;
   case aco_opcode::v_xor_b32:
      opcode_16bit = aco_opcode::v_xor_b16;
      switch_to_vop3 = true;
      break;
   default:
      /* Not an opcode this helper knows how to convert. */
      return false;
   }

   instr->opcode = opcode_16bit;

   if (switch_to_vop3)
      instr->format = withoutVOP2(asVOP3(instr->format));

   if (reset_abs_neg) {
      /* NOTE(review): presumably abs/neg would be interpreted differently by
       * the 16-bit opcodes, hence they are cleared here - confirm.
       */
      instr->valu().abs = 0;
      instr->valu().neg = 0;
   }

   return true;
}
unsigned
get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
unsigned idx, RegClass rc)
@ -593,6 +623,13 @@ get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>&
}
switch (instr->opcode) {
case aco_opcode::v_mov_b32:
case aco_opcode::v_not_b32:
case aco_opcode::v_and_b32:
case aco_opcode::v_or_b32:
case aco_opcode::v_xor_b32:
case aco_opcode::v_cndmask_b32:
return gfx_level >= GFX11 && instr->definitions[0].bytes() <= 2 ? 2 : 4;
case aco_opcode::v_cvt_f32_ubyte0: return 1;
case aco_opcode::ds_write_b8:
case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
@ -643,6 +680,8 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns
return;
}
convert_bitwise_to_16bit(instr.get());
assert(can_use_opsel(gfx_level, instr->opcode, idx));
instr->valu().opsel[idx] = true;
return;
@ -706,6 +745,16 @@ DefInfo::get_subdword_definition_info(Program* program, const aco_ptr<Instructio
can_use_opsel(gfx_level, instr->opcode, -1)) {
data_stride = 2;
stride = rc == v2b ? 2 : stride;
} else if ((instr->opcode == aco_opcode::v_cndmask_b32 ||
instr->opcode == aco_opcode::v_mov_b32 ||
instr->opcode == aco_opcode::v_not_b32 ||
instr->opcode == aco_opcode::v_and_b32 || instr->opcode == aco_opcode::v_or_b32 ||
instr->opcode == aco_opcode::v_xor_b32) &&
program->gfx_level >= GFX11) {
/* Convert to 16bit opcode on demand. */
rc = v2b;
data_stride = 2;
stride = 2;
}
return;
}
@ -785,6 +834,11 @@ add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg r
return;
}
if (convert_bitwise_to_16bit(instr.get())) {
if (reg.byte() == 0)
return;
}
/* use opsel */
assert(reg.byte() == 2);
assert(can_use_opsel(gfx_level, instr->opcode, -1));