amd: add ac_cu_info::has_point_sample_accel flag and use in ACO

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38701>
This commit is contained in:
Daniel Schürmann 2025-11-27 13:27:00 +01:00 committed by Marge Bot
parent cfb745592d
commit 7b7bdb76ab
8 changed files with 17 additions and 11 deletions

View file

@ -304,6 +304,8 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_
cu_info->has_lds_bank_count_16 = info->family == CHIP_KABINI || info->family == CHIP_STONEY;
cu_info->has_sram_ecc_enabled = info->family == CHIP_VEGA20 || info->family == CHIP_MI100 ||
info->family == CHIP_MI200 || info->family == CHIP_GFX940;
cu_info->has_point_sample_accel = info->family == CHIP_STRIX1 || info->family == CHIP_STRIX_HALO ||
info->family == CHIP_KRACKAN1;
cu_info->has_fast_fma32 = info->gfx_level >= GFX9 || info->family == CHIP_TAHITI ||
info->family == CHIP_HAWAII || info->family == CHIP_CARRIZO;
cu_info->has_fma_mix = info->gfx_level >= GFX10 ||

View file

@ -47,6 +47,8 @@ struct ac_cu_info {
/* Flags */
bool has_lds_bank_count_16 : 1;
bool has_sram_ecc_enabled : 1;
/* Whether image_sample* instructions can be either a sampler or a no-sampler access. */
bool has_point_sample_accel : 1;
bool has_fast_fma32 : 1;
/* Whether chips support fused v_fma_mix* instructions.
* Otherwise, unfused v_mad_mix* is available on GFX9.

View file

@ -51,7 +51,7 @@ get_type(Program* program, aco_ptr<Instruction>& instr)
if (program->gfx_level >= GFX11) {
if (instr->isMIMG()) {
uint8_t vmem_type = get_vmem_type(program->gfx_level, program->family, instr.get());
uint8_t vmem_type = get_vmem_type(instr.get(), program->dev.has_point_sample_accel);
switch (vmem_type) {
case vmem_bvh: return clause_bvh;
case vmem_sampler: return clause_mimg_sample;

View file

@ -1687,7 +1687,7 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
} else {
uint8_t vmem_type =
state.program->gfx_level >= GFX12
? get_vmem_type(state.program->gfx_level, state.program->family, instr.get())
? get_vmem_type(instr.get(), state.program->dev.has_point_sample_accel)
: vmem_nosampler;
std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
if (vmem_type == vmem_sampler)

View file

@ -408,7 +408,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
* We can do this for GFX12 and different types for GFX11 if we know that the two
* VMEM loads do not write the same register half or the same lanes.
*/
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, ctx.program->family, instr);
uint8_t vmem_type = get_vmem_type(instr, ctx.program->dev.has_point_sample_accel);
if (vmem_type) {
wait_event event = get_vmem_event(ctx, instr, vmem_type);
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
@ -834,7 +834,7 @@ gen(Instruction* instr, wait_ctx& ctx)
case Format::MIMG:
case Format::GLOBAL:
case Format::SCRATCH: {
uint8_t type = get_vmem_type(ctx.gfx_level, ctx.program->family, instr);
uint8_t type = get_vmem_type(instr, ctx.program->dev.has_point_sample_accel);
wait_event ev = get_vmem_event(ctx, instr, type);
uint32_t mask = ev == event_vmem ? get_vmem_mask(ctx, instr) : 0;

View file

@ -107,6 +107,7 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
program->dev.xnack_enabled = false;
program->dev.sram_ecc_enabled = options->cu_info->has_sram_ecc_enabled;
program->dev.has_point_sample_accel = options->cu_info->has_point_sample_accel;
program->dev.has_fast_fma32 = options->cu_info->has_fast_fma32;
program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level == GFX10;
@ -1470,7 +1471,7 @@ get_tied_defs(Instruction* instr)
}
uint8_t
get_vmem_type(amd_gfx_level gfx_level, radeon_family family, Instruction* instr)
get_vmem_type(Instruction* instr, bool has_point_sample_accel)
{
if (instr->opcode == aco_opcode::image_bvh_intersect_ray ||
instr->opcode == aco_opcode::image_bvh64_intersect_ray ||
@ -1481,10 +1482,10 @@ get_vmem_type(amd_gfx_level gfx_level, radeon_family family, Instruction* instr)
return vmem_sampler;
} else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
instr->operands[1].regClass() == s4) {
bool point_sample_accel = gfx_level == GFX11_5 && family != CHIP_GFX1153 &&
(instr->opcode == aco_opcode::image_sample ||
instr->opcode == aco_opcode::image_sample_l ||
instr->opcode == aco_opcode::image_sample_lz);
bool point_sample_accel =
has_point_sample_accel && (instr->opcode == aco_opcode::image_sample ||
instr->opcode == aco_opcode::image_sample_l ||
instr->opcode == aco_opcode::image_sample_lz);
return vmem_sampler | (point_sample_accel ? vmem_nosampler : 0);
} else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) {
return vmem_nosampler;

View file

@ -2054,7 +2054,7 @@ enum vmem_type : uint8_t {
/* VMEM instructions of the same type return in-order. For GFX12+, this determines which counter
* is used.
*/
uint8_t get_vmem_type(amd_gfx_level gfx_level, radeon_family family, Instruction* instr);
uint8_t get_vmem_type(Instruction* instr, bool has_point_sample_accel);
/* For all of the counters, the maximum value means no wait.
* Some of the counters are larger than their bit field,
@ -2249,6 +2249,7 @@ struct DeviceInfo {
bool fused_mad_mix = false;
bool xnack_enabled = false;
bool sram_ecc_enabled = false;
bool has_point_sample_accel = false;
int32_t scratch_global_offset_min;
int32_t scratch_global_offset_max;

View file

@ -320,7 +320,7 @@ get_wait_counter_info(Program* program, aco_ptr<Instruction>& instr)
} else if (instr->isVMEM() && instr->definitions.empty() && program->gfx_level >= GFX10) {
info[wait_type_vs] = 320;
} else if (instr->isVMEM()) {
uint8_t vm_type = get_vmem_type(program->gfx_level, program->family, instr.get());
uint8_t vm_type = get_vmem_type(instr.get(), program->dev.has_point_sample_accel);
wait_type type = wait_type_vm;
if (program->gfx_level >= GFX12 && vm_type == vmem_bvh)
type = wait_type_bvh;