mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-15 14:18:04 +02:00
aco: add helpers to get instruction subdword capabilities
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41018>
This commit is contained in:
parent
43aeeb8b88
commit
4249daeedd
3 changed files with 183 additions and 160 deletions
|
|
@ -285,7 +285,7 @@ get_sync_info(const Instruction* instr)
|
|||
}
|
||||
|
||||
bool
|
||||
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
|
||||
can_use_SDWA(amd_gfx_level gfx_level, const Instruction* instr, bool pre_ra)
|
||||
{
|
||||
if (!instr->isVALU())
|
||||
return false;
|
||||
|
|
@ -297,7 +297,7 @@ can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pr
|
|||
return true;
|
||||
|
||||
if (instr->isVOP3()) {
|
||||
VALU_instruction& vop3 = instr->valu();
|
||||
const VALU_instruction& vop3 = instr->valu();
|
||||
if (instr->format == Format::VOP3)
|
||||
return false;
|
||||
if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
|
||||
|
|
@ -351,6 +351,12 @@ can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pr
|
|||
instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
|
||||
}
|
||||
|
||||
bool
|
||||
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
|
||||
{
|
||||
return can_use_SDWA(gfx_level, instr.get(), pre_ra);
|
||||
}
|
||||
|
||||
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
|
||||
aco_ptr<Instruction>
|
||||
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
|
||||
|
|
@ -836,6 +842,159 @@ get_gfx11_true16_mask(aco_opcode op)
|
|||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
get_subdword_operand_stride(Program* program, const Instruction* instr, unsigned idx, RegClass rc)
|
||||
{
|
||||
assert(rc.is_subdword());
|
||||
assert(program->gfx_level >= GFX8);
|
||||
|
||||
/* Pseudo instructions are a bit special here, because instr might just be a dummy instruction,
|
||||
* so we shouldn't try accessing it's operands. */
|
||||
if (instr->isPseudo()) {
|
||||
/* v_readfirstlane_b32 cannot use SDWA */
|
||||
if (instr->opcode == aco_opcode::p_as_uniform ||
|
||||
instr->opcode == aco_opcode::p_permlane64_shared_vgpr)
|
||||
return 4;
|
||||
else
|
||||
return rc.bytes() % 2 == 0 ? 2 : 1;
|
||||
}
|
||||
|
||||
assert(instr->operands[idx].regClass() == rc);
|
||||
|
||||
if (rc.bytes() > 2)
|
||||
return 4;
|
||||
|
||||
if (instr->isVALU()) {
|
||||
if (can_use_SDWA(program->gfx_level, instr, false))
|
||||
return rc.bytes();
|
||||
if (can_use_opsel(program->gfx_level, instr->opcode, idx))
|
||||
return 2;
|
||||
if (instr->isVOP3P())
|
||||
return 2;
|
||||
}
|
||||
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_mov_b32:
|
||||
case aco_opcode::v_not_b32:
|
||||
case aco_opcode::v_and_b32:
|
||||
case aco_opcode::v_or_b32:
|
||||
case aco_opcode::v_xor_b32:
|
||||
case aco_opcode::v_cndmask_b32:
|
||||
return program->gfx_level >= GFX11 && instr->definitions[0].bytes() <= 2 ? 2 : 4;
|
||||
case aco_opcode::v_cvt_f32_ubyte0: return 1;
|
||||
case aco_opcode::ds_write_b8:
|
||||
case aco_opcode::ds_write_b16:
|
||||
case aco_opcode::buffer_store_byte:
|
||||
case aco_opcode::buffer_store_short:
|
||||
case aco_opcode::buffer_store_format_d16_x:
|
||||
case aco_opcode::flat_store_byte:
|
||||
case aco_opcode::flat_store_short:
|
||||
case aco_opcode::scratch_store_byte:
|
||||
case aco_opcode::scratch_store_short:
|
||||
case aco_opcode::global_store_byte:
|
||||
case aco_opcode::global_store_short: return program->gfx_level >= GFX9 ? 2 : 4;
|
||||
default: return 4;
|
||||
}
|
||||
}
|
||||
|
||||
SubdwordCaps
|
||||
get_subdword_definition_caps(Program* program, const Instruction* instr, unsigned idx, RegClass rc)
|
||||
{
|
||||
amd_gfx_level gfx_level = program->gfx_level;
|
||||
assert(rc.is_subdword());
|
||||
assert(gfx_level >= GFX8);
|
||||
|
||||
SubdwordCaps caps;
|
||||
caps.placement_stride = rc.bytes() % 2 == 0 ? 2 : 1;
|
||||
caps.overwrite_bytes = rc.bytes();
|
||||
|
||||
/* Pseudo instructions are a bit special here, because instr might just be a dummy instruction,
|
||||
* so we shouldn't try accessing it's definitions. Pseudo instructions are also the only ones
|
||||
* with multiple definitions, and "idx" isn't always correct. */
|
||||
if (instr->isPseudo()) {
|
||||
if (instr->opcode == aco_opcode::p_interp_gfx11 ||
|
||||
instr->opcode == aco_opcode::p_permlane64_shared_vgpr) {
|
||||
caps.overwrite_bytes = rc.size() * 4;
|
||||
caps.placement_stride = 4;
|
||||
}
|
||||
return caps;
|
||||
}
|
||||
|
||||
assert(instr->definitions[idx].regClass() == rc);
|
||||
|
||||
if (instr->isVALU()) {
|
||||
if (rc.bytes() == 3) {
|
||||
caps.overwrite_bytes = 4;
|
||||
caps.placement_stride = 4;
|
||||
return caps;
|
||||
}
|
||||
assert(rc.bytes() <= 2);
|
||||
|
||||
if (can_use_SDWA(gfx_level, instr, false))
|
||||
return caps;
|
||||
|
||||
if ((instr->opcode == aco_opcode::v_cndmask_b32 || instr->opcode == aco_opcode::v_mov_b32 ||
|
||||
instr->opcode == aco_opcode::v_not_b32 || instr->opcode == aco_opcode::v_and_b32 ||
|
||||
instr->opcode == aco_opcode::v_or_b32 || instr->opcode == aco_opcode::v_xor_b32) &&
|
||||
program->gfx_level >= GFX11) {
|
||||
/* Convert to 16bit opcode on demand. */
|
||||
caps.overwrite_bytes = 2;
|
||||
caps.placement_stride = 2;
|
||||
return caps;
|
||||
}
|
||||
|
||||
bool preserve = instr_is_16bit(gfx_level, instr->opcode);
|
||||
bool can_write_hi16 = instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
|
||||
instr->opcode == aco_opcode::p_v_fma_mixlo_f16_rtz ||
|
||||
can_use_opsel(gfx_level, instr->opcode, -1);
|
||||
caps.placement_stride = can_write_hi16 ? 2 : 4;
|
||||
caps.overwrite_bytes = preserve ? 2 : 4;
|
||||
return caps;
|
||||
}
|
||||
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_interp_p2_f16: assert(rc.bytes() == 2); return caps;
|
||||
/* D16 loads with _hi version */
|
||||
case aco_opcode::ds_read_u8_d16:
|
||||
case aco_opcode::ds_read_i8_d16:
|
||||
case aco_opcode::ds_read_u16_d16:
|
||||
case aco_opcode::flat_load_ubyte_d16:
|
||||
case aco_opcode::flat_load_sbyte_d16:
|
||||
case aco_opcode::flat_load_short_d16:
|
||||
case aco_opcode::global_load_ubyte_d16:
|
||||
case aco_opcode::global_load_sbyte_d16:
|
||||
case aco_opcode::global_load_short_d16:
|
||||
case aco_opcode::scratch_load_ubyte_d16:
|
||||
case aco_opcode::scratch_load_sbyte_d16:
|
||||
case aco_opcode::scratch_load_short_d16:
|
||||
case aco_opcode::buffer_load_ubyte_d16:
|
||||
case aco_opcode::buffer_load_sbyte_d16:
|
||||
case aco_opcode::buffer_load_short_d16:
|
||||
case aco_opcode::buffer_load_format_d16_x: {
|
||||
assert(gfx_level >= GFX9);
|
||||
caps.overwrite_bytes = program->dev.sram_ecc_enabled ? 4 : 2;
|
||||
caps.placement_stride = 2;
|
||||
return caps;
|
||||
}
|
||||
/* 3-component D16 loads */
|
||||
case aco_opcode::buffer_load_format_d16_xyz:
|
||||
case aco_opcode::tbuffer_load_format_d16_xyz: {
|
||||
assert(gfx_level >= GFX9);
|
||||
caps.overwrite_bytes = program->dev.sram_ecc_enabled ? 8 : 6;
|
||||
caps.placement_stride = 4;
|
||||
return caps;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
|
||||
caps.placement_stride = 4;
|
||||
if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled)
|
||||
assert(gfx_level >= GFX9);
|
||||
else
|
||||
caps.overwrite_bytes = rc.size() * 4;
|
||||
return caps;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
get_reduction_identity(ReduceOp op, unsigned idx)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -2052,7 +2052,19 @@ bool can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx);
|
|||
bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
|
||||
bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
|
||||
uint8_t get_gfx11_true16_mask(aco_opcode op);
|
||||
bool can_use_SDWA(amd_gfx_level gfx_level, const Instruction* instr, bool pre_ra);
|
||||
bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
|
||||
|
||||
struct SubdwordCaps {
|
||||
unsigned placement_stride;
|
||||
unsigned overwrite_bytes;
|
||||
};
|
||||
|
||||
unsigned get_subdword_operand_stride(Program* program, const Instruction* instr, unsigned idx,
|
||||
RegClass rc);
|
||||
SubdwordCaps get_subdword_definition_caps(Program* program, const Instruction* instr, unsigned idx,
|
||||
RegClass rc);
|
||||
|
||||
bool opcode_supports_dpp(amd_gfx_level gfx_level, aco_opcode opcode, bool vop3p);
|
||||
bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
|
||||
bool can_write_m0(const aco_ptr<Instruction>& instr);
|
||||
|
|
|
|||
|
|
@ -22,8 +22,6 @@ namespace {
|
|||
struct ra_ctx;
|
||||
struct DefInfo;
|
||||
|
||||
unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
|
||||
unsigned idx, RegClass rc);
|
||||
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
|
||||
RegClass rc);
|
||||
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
|
||||
|
|
@ -289,9 +287,12 @@ struct DefInfo {
|
|||
|
||||
if (rc.is_subdword() && operand >= 0) {
|
||||
/* stride in bytes */
|
||||
stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
|
||||
stride = get_subdword_operand_stride(ctx.program, instr.get(), operand, rc);
|
||||
} else if (rc.is_subdword()) {
|
||||
get_subdword_definition_info(ctx.program, instr);
|
||||
SubdwordCaps caps = get_subdword_definition_caps(ctx.program, instr.get(), 0, rc);
|
||||
stride = rc.bytes() == caps.overwrite_bytes ? caps.placement_stride : 4;
|
||||
rc = rc.resize(caps.overwrite_bytes);
|
||||
data_stride = caps.placement_stride;
|
||||
} else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
|
||||
/* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
|
||||
*
|
||||
|
|
@ -317,9 +318,6 @@ struct DefInfo {
|
|||
if (!data_stride)
|
||||
data_stride = stride;
|
||||
}
|
||||
|
||||
private:
|
||||
void get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr);
|
||||
};
|
||||
|
||||
class RegisterFile {
|
||||
|
|
@ -631,56 +629,6 @@ convert_bitwise_to_16bit(Instruction* instr)
|
|||
return true;
|
||||
}
|
||||
|
||||
unsigned
|
||||
get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
|
||||
unsigned idx, RegClass rc)
|
||||
{
|
||||
assert(gfx_level >= GFX8);
|
||||
if (instr->isPseudo()) {
|
||||
/* v_readfirstlane_b32 cannot use SDWA */
|
||||
if (instr->opcode == aco_opcode::p_as_uniform ||
|
||||
instr->opcode == aco_opcode::p_permlane64_shared_vgpr)
|
||||
return 4;
|
||||
else
|
||||
return rc.bytes() % 2 == 0 ? 2 : 1;
|
||||
}
|
||||
|
||||
if (rc.bytes() > 2)
|
||||
return 4;
|
||||
|
||||
if (instr->isVALU()) {
|
||||
if (can_use_SDWA(gfx_level, instr, false))
|
||||
return rc.bytes();
|
||||
if (can_use_opsel(gfx_level, instr->opcode, idx))
|
||||
return 2;
|
||||
if (instr->isVOP3P())
|
||||
return 2;
|
||||
}
|
||||
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_mov_b32:
|
||||
case aco_opcode::v_not_b32:
|
||||
case aco_opcode::v_and_b32:
|
||||
case aco_opcode::v_or_b32:
|
||||
case aco_opcode::v_xor_b32:
|
||||
case aco_opcode::v_cndmask_b32:
|
||||
return gfx_level >= GFX11 && instr->definitions[0].bytes() <= 2 ? 2 : 4;
|
||||
case aco_opcode::v_cvt_f32_ubyte0: return 1;
|
||||
case aco_opcode::ds_write_b8:
|
||||
case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
|
||||
case aco_opcode::buffer_store_byte:
|
||||
case aco_opcode::buffer_store_short:
|
||||
case aco_opcode::buffer_store_format_d16_x:
|
||||
case aco_opcode::flat_store_byte:
|
||||
case aco_opcode::flat_store_short:
|
||||
case aco_opcode::scratch_store_byte:
|
||||
case aco_opcode::scratch_store_short:
|
||||
case aco_opcode::global_store_byte:
|
||||
case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
|
||||
default: return 4;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
|
||||
RegClass rc)
|
||||
|
|
@ -750,102 +698,6 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns
|
|||
return;
|
||||
}
|
||||
|
||||
void
|
||||
DefInfo::get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
amd_gfx_level gfx_level = program->gfx_level;
|
||||
assert(gfx_level >= GFX8);
|
||||
|
||||
stride = rc.bytes() % 2 == 0 ? 2 : 1;
|
||||
|
||||
if (instr->isPseudo()) {
|
||||
if (instr->opcode == aco_opcode::p_interp_gfx11 ||
|
||||
instr->opcode == aco_opcode::p_permlane64_shared_vgpr) {
|
||||
rc = RegClass(RegType::vgpr, rc.size());
|
||||
stride = 4;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (instr->isVALU()) {
|
||||
if (rc.bytes() == 3) {
|
||||
rc = v1;
|
||||
stride = 4;
|
||||
return;
|
||||
}
|
||||
|
||||
if (can_use_SDWA(gfx_level, instr, false))
|
||||
return;
|
||||
|
||||
rc = instr_is_16bit(gfx_level, instr->opcode) ? v2b : v1;
|
||||
stride = 4;
|
||||
if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
|
||||
instr->opcode == aco_opcode::p_v_fma_mixlo_f16_rtz ||
|
||||
can_use_opsel(gfx_level, instr->opcode, -1)) {
|
||||
data_stride = 2;
|
||||
stride = rc == v2b ? 2 : stride;
|
||||
} else if ((instr->opcode == aco_opcode::v_cndmask_b32 ||
|
||||
instr->opcode == aco_opcode::v_mov_b32 ||
|
||||
instr->opcode == aco_opcode::v_not_b32 ||
|
||||
instr->opcode == aco_opcode::v_and_b32 || instr->opcode == aco_opcode::v_or_b32 ||
|
||||
instr->opcode == aco_opcode::v_xor_b32) &&
|
||||
program->gfx_level >= GFX11) {
|
||||
/* Convert to 16bit opcode on demand. */
|
||||
rc = v2b;
|
||||
data_stride = 2;
|
||||
stride = 2;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_interp_p2_f16: return;
|
||||
/* D16 loads with _hi version */
|
||||
case aco_opcode::ds_read_u8_d16:
|
||||
case aco_opcode::ds_read_i8_d16:
|
||||
case aco_opcode::ds_read_u16_d16:
|
||||
case aco_opcode::flat_load_ubyte_d16:
|
||||
case aco_opcode::flat_load_sbyte_d16:
|
||||
case aco_opcode::flat_load_short_d16:
|
||||
case aco_opcode::global_load_ubyte_d16:
|
||||
case aco_opcode::global_load_sbyte_d16:
|
||||
case aco_opcode::global_load_short_d16:
|
||||
case aco_opcode::scratch_load_ubyte_d16:
|
||||
case aco_opcode::scratch_load_sbyte_d16:
|
||||
case aco_opcode::scratch_load_short_d16:
|
||||
case aco_opcode::buffer_load_ubyte_d16:
|
||||
case aco_opcode::buffer_load_sbyte_d16:
|
||||
case aco_opcode::buffer_load_short_d16:
|
||||
case aco_opcode::buffer_load_format_d16_x: {
|
||||
assert(gfx_level >= GFX9);
|
||||
if (program->dev.sram_ecc_enabled) {
|
||||
rc = v1;
|
||||
stride = 4;
|
||||
data_stride = 2;
|
||||
} else {
|
||||
stride = 2;
|
||||
}
|
||||
return;
|
||||
}
|
||||
/* 3-component D16 loads */
|
||||
case aco_opcode::buffer_load_format_d16_xyz:
|
||||
case aco_opcode::tbuffer_load_format_d16_xyz: {
|
||||
assert(gfx_level >= GFX9);
|
||||
stride = 4;
|
||||
if (program->dev.sram_ecc_enabled)
|
||||
rc = v2;
|
||||
return;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
|
||||
stride = 4;
|
||||
if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled)
|
||||
assert(gfx_level >= GFX9);
|
||||
else
|
||||
rc = RegClass(RegType::vgpr, rc.size());
|
||||
}
|
||||
|
||||
void
|
||||
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
|
||||
bool allow_16bit_write)
|
||||
|
|
@ -2387,11 +2239,12 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
|
|||
}
|
||||
|
||||
bool
|
||||
operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
|
||||
operand_can_use_reg(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
|
||||
RegClass rc)
|
||||
{
|
||||
amd_gfx_level gfx_level = ctx.program->gfx_level;
|
||||
if (reg.byte()) {
|
||||
unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
|
||||
unsigned stride = get_subdword_operand_stride(ctx.program, instr.get(), idx, rc);
|
||||
if (reg.byte() % stride)
|
||||
return false;
|
||||
}
|
||||
|
|
@ -3699,8 +3552,7 @@ undo_renames(ra_ctx& ctx, std::vector<parallelcopy>& parallelcopies,
|
|||
}
|
||||
|
||||
bool use_original = !op.isPrecolored() && !op.isLateKill();
|
||||
use_original &= operand_can_use_reg(ctx.program->gfx_level, instr, i, copy.op.physReg(),
|
||||
copy.op.regClass());
|
||||
use_original &= operand_can_use_reg(ctx, instr, i, copy.op.physReg(), copy.op.regClass());
|
||||
|
||||
if (use_original) {
|
||||
const PhysRegInterval copy_reg = {copy.op.physReg(), copy.op.size()};
|
||||
|
|
@ -4128,7 +3980,7 @@ register_allocation(Program* program, ra_test_policy policy)
|
|||
}
|
||||
|
||||
PhysReg reg = ctx.assignments[operand.tempId()].reg;
|
||||
if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
|
||||
if (operand_can_use_reg(ctx, instr, i, reg, operand.regClass()))
|
||||
operand.setFixed(reg);
|
||||
else
|
||||
get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue