radv: select frag_coord_xy and pixel_coord conditionally based on dynamic state

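The rationale is documented in the code comments: only the PS input VGPRs for frag_coord_xy or for pixel_coord (whichever the current dynamic state requires) are initialized, which increases the PS wave launch rate.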

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> (shader parts)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41689>
This commit is contained in:
Marek Olšák 2026-05-18 17:33:08 -04:00 committed by Marge Bot
parent 22e40edfb9
commit 1b45a8aee2
9 changed files with 92 additions and 13 deletions

View file

@ -441,6 +441,9 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
/* Load the primitive topology from a user SGPR when it's unknown at compile time (GPL). */
replacement = GET_SGPR_FIELD_NIR(s->args->ps_state, PS_STATE_RAST_PRIM);
break;
case nir_intrinsic_load_use_float_frag_coord_xy_amd:
replacement = nir_ine_imm(b, GET_SGPR_FIELD_NIR(s->args->ps_state, PS_STATE_USE_FLOAT_FRAG_COORD_XY), 0);
break;
default:
progress = false;
break;

View file

@ -9,6 +9,20 @@
* not both.
*
* sample_pos counts as a frag_coord_xy use and is lowered to frag_coord_xy here.
*
* If frag_coord_xy survives and both components are used and sample_pos isn't used, it becomes:
* load_use_float_frag_coord_xy_amd() ? load_frag_coord_xy() : u2f32(load_pixel_coord()) + 0.5;
*
* If load_use_float_frag_coord_xy_amd==false, load_frag_coord_xy() returns uninitialized values.
* If load_use_float_frag_coord_xy_amd==true, load_pixel_coord() returns uninitialized values.
*
* SPI_PS_INPUT_ENA is used to disable VGPR initialization for frag_coord_xy (POS_X_FLOAT,
* POS_Y_FLOAT) or pixel_coord (POS_FIXED_PT) while SPI_PS_INPUT_ADDR keeps them at the same
* VGPR locations. Reducing the number of initialized VGPRs increases the PS wave launch rate,
* which increases observed pixel throughput depending on other states.
*
* load_use_float_frag_coord_xy_amd() comes from a user SGPR, and determines which VGPRs are
* initialized at PS wave launch.
*/
#include "nir_builder.h"
@ -21,10 +35,12 @@ typedef struct {
bool has_frag_coord_xy_float_use;
bool has_pixel_coord;
bool has_sample_pos;
uint8_t comp_usage_mask;
/* lower_frag_coord_and_pixel_coord */
bool lower_to_pixel_coord;
bool lower_to_frag_coord_xy;
bool select_frag_coord_xy_dynamically;
} opt_fs_frag_coord_and_pixel_coord_state;
static bool
@ -45,10 +61,12 @@ gather_fs_frag_pos(nir_builder *b, nir_intrinsic_instr *intr, void *data)
state->has_frag_coord_xy |= intr->intrinsic == nir_intrinsic_load_frag_coord_xy;
state->has_sample_pos |= intr->intrinsic == nir_intrinsic_load_sample_pos;
state->comp_usage_mask |= nir_def_components_read(&intr->def);
return false;
case nir_intrinsic_load_pixel_coord:
state->has_pixel_coord = true;
state->comp_usage_mask |= nir_def_components_read(&intr->def);
return false;
default:
@ -68,6 +86,10 @@ lower_fs_frag_pos(nir_builder *b, nir_intrinsic_instr *intr, void *data)
if (state->lower_to_pixel_coord) {
nir_def_replace(&intr->def, nir_fadd_imm(b, nir_u2f32(b, nir_load_pixel_coord(b)), 0.5));
return true;
} else if (state->select_frag_coord_xy_dynamically) {
nir_def_replace(&intr->def, nir_bcsel(b, nir_load_use_float_frag_coord_xy_amd(b), nir_load_frag_coord_xy(b),
nir_fadd_imm(b, nir_u2f32(b, nir_load_pixel_coord(b)), 0.5)));
return true;
}
return false;
@ -82,7 +104,12 @@ lower_fs_frag_pos(nir_builder *b, nir_intrinsic_instr *intr, void *data)
case nir_intrinsic_load_pixel_coord:
if (state->lower_to_frag_coord_xy) {
nir_def_replace(&intr->def, nir_f2u16(b, nir_load_frag_coord_xy(b)));
if (state->select_frag_coord_xy_dynamically) {
nir_def_replace(&intr->def, nir_bcsel(b, nir_load_use_float_frag_coord_xy_amd(b),
nir_f2u16(b, nir_load_frag_coord_xy(b)), nir_load_pixel_coord(b)));
} else {
nir_def_replace(&intr->def, nir_f2u16(b, nir_load_frag_coord_xy(b)));
}
return true;
}
return false;
@ -109,8 +136,10 @@ radv_nir_lower_opt_fs_frag_pos(nir_shader *shader, bool force_pixel_coord)
(state.has_frag_coord_xy || state.has_sample_pos) && !state.has_frag_coord_xy_float_use;
state.lower_to_frag_coord_xy =
(state.has_pixel_coord || state.has_sample_pos) && state.has_frag_coord_xy_float_use;
state.select_frag_coord_xy_dynamically =
state.has_frag_coord_xy_float_use && !state.has_sample_pos && state.comp_usage_mask == 0x3;
if (!state.lower_to_pixel_coord && !state.lower_to_frag_coord_xy)
if (!state.lower_to_pixel_coord && !state.lower_to_frag_coord_xy && !state.select_frag_coord_xy_dynamically)
return false;
return nir_shader_intrinsics_pass(shader, lower_fs_frag_pos, nir_metadata_control_flow, &state);

View file

@ -8786,6 +8786,16 @@ radv_bind_pre_rast_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
}
if (cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT] &&
cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]->info.ps.selects_frag_coord_xy_dynamically &&
(!cmd_buffer->state.last_vgt_shader ||
/* We just want to know whether the VRS output changes between enabled and disabled. */
(cmd_buffer->state.last_vgt_shader->info.outinfo.writes_primitive_shading_rate ||
cmd_buffer->state.last_vgt_shader->info.outinfo.writes_primitive_shading_rate_per_primitive) !=
(shader->info.outinfo.writes_primitive_shading_rate ||
shader->info.outinfo.writes_primitive_shading_rate_per_primitive)))
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PS_STATE;
cmd_buffer->state.last_vgt_shader = (struct radv_shader *)shader;
}
}
@ -11691,7 +11701,29 @@ radv_emit_ps_state(struct radv_cmd_buffer *cmd_buffer)
if (!ps)
return;
const uint32_t spi_ps_input_ena = ps->config.spi_ps_input_ena;
uint32_t spi_ps_input_ena = ps->config.spi_ps_input_ena;
bool use_float_frag_coord_xy = false;
if (ps->info.ps.selects_frag_coord_xy_dynamically) {
/* The shader selects frag_coord_xy or pixel_coord dynamically based on a flag in PS_STATE.
* The flag is derived from the dynamic states below. pixel_coord (POS_FIXED_PT) is preferred
* whenever possible because its VGPR initialization cost is lower.
*/
use_float_frag_coord_xy =
/* Whether VRS can be other than 1x1, or sample shading is enabled. */
cmd_buffer->state.dynamic.vk.fsr.fragment_size.width != 1 ||
cmd_buffer->state.dynamic.vk.fsr.fragment_size.height != 1 || cmd_buffer->state.render.vrs_att.iview ||
cmd_buffer->state.last_vgt_shader->info.outinfo.writes_primitive_shading_rate ||
cmd_buffer->state.last_vgt_shader->info.outinfo.writes_primitive_shading_rate_per_primitive ||
radv_is_sample_shading_enabled(cmd_buffer, NULL);
/* Disable initialization of the PS input VGPRs that the shader will not read. */
if (use_float_frag_coord_xy)
spi_ps_input_ena &= C_0286CC_POS_FIXED_PT_ENA;
else
spi_ps_input_ena &= C_0286CC_POS_X_FLOAT_ENA & C_0286CC_POS_Y_FLOAT_ENA;
}
struct radv_cmd_stream *cs = cmd_buffer->cs;
radeon_begin(cs);
@ -11711,7 +11743,8 @@ radv_emit_ps_state(struct radv_cmd_buffer *cmd_buffer)
const unsigned ps_state = SET_SGPR_FIELD(PS_STATE_NUM_SAMPLES, rasterization_samples) |
SET_SGPR_FIELD(PS_STATE_PS_ITER_MASK, ps_iter_mask) |
SET_SGPR_FIELD(PS_STATE_LINE_RAST_MODE, line_rast_mode) |
SET_SGPR_FIELD(PS_STATE_RAST_PRIM, vgt_outprim_type);
SET_SGPR_FIELD(PS_STATE_RAST_PRIM, vgt_outprim_type) |
SET_SGPR_FIELD(PS_STATE_USE_FLOAT_FRAG_COORD_XY, use_float_frag_coord_xy);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(ps_state_offset, ps_state);
@ -12830,7 +12863,7 @@ radv_validate_dynamic_states(struct radv_cmd_buffer *cmd_buffer, uint64_t dynami
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
if (dynamic_states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE)
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FSR_STATE;
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FSR_STATE | RADV_CMD_DIRTY_PS_STATE;
if (dynamic_states & RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE)
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RAST_SAMPLES_STATE;

View file

@ -237,14 +237,16 @@ struct radv_llvm_compiler_options {
#define NGG_STATE_QUERY__SHIFT 6
#define NGG_STATE_QUERY__MASK 0x7
#define PS_STATE_NUM_SAMPLES__SHIFT 0
#define PS_STATE_NUM_SAMPLES__MASK 0xf
#define PS_STATE_LINE_RAST_MODE__SHIFT 4
#define PS_STATE_LINE_RAST_MODE__MASK 0x3
#define PS_STATE_PS_ITER_MASK__SHIFT 6
#define PS_STATE_PS_ITER_MASK__MASK 0xffff
#define PS_STATE_RAST_PRIM__SHIFT 22
#define PS_STATE_RAST_PRIM__MASK 0x3
#define PS_STATE_NUM_SAMPLES__SHIFT 0
#define PS_STATE_NUM_SAMPLES__MASK 0xf
#define PS_STATE_LINE_RAST_MODE__SHIFT 4
#define PS_STATE_LINE_RAST_MODE__MASK 0x3
#define PS_STATE_PS_ITER_MASK__SHIFT 6
#define PS_STATE_PS_ITER_MASK__MASK 0xffff
#define PS_STATE_RAST_PRIM__SHIFT 22
#define PS_STATE_RAST_PRIM__MASK 0x3
#define PS_STATE_USE_FLOAT_FRAG_COORD_XY__SHIFT 24
#define PS_STATE_USE_FLOAT_FRAG_COORD_XY__MASK 0x1
struct radv_shader_layout {
uint32_t num_sets;

View file

@ -420,6 +420,9 @@ radv_ps_needs_state_sgpr(const struct radv_shader_info *info, const struct radv_
if (info->ps.load_rasterization_prim && gfx_state->unknown_rast_prim)
return true;
if (info->ps.selects_frag_coord_xy_dynamically)
return true;
return false;
}

View file

@ -312,6 +312,9 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, s
case nir_intrinsic_begin_invocation_interlock:
info->ps.pops = true;
break;
case nir_intrinsic_load_use_float_frag_coord_xy_amd:
info->ps.selects_frag_coord_xy_dynamically = true;
break;
default:
break;
}

View file

@ -213,6 +213,7 @@ struct radv_shader_info {
bool load_rasterization_prim : 1;
bool force_sample_iter_shading_rate : 1;
bool allow_flat_shading : 1;
bool selects_frag_coord_xy_dynamically : 1;
bool has_epilog : 1;
} ps;

View file

@ -351,6 +351,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_fbfetch_image_fmask_desc_amd:
case nir_intrinsic_load_fbfetch_image_desc_amd:
case nir_intrinsic_load_polygon_stipple_buffer_amd:
case nir_intrinsic_load_use_float_frag_coord_xy_amd:
case nir_intrinsic_load_tcs_mem_attrib_stride:
case nir_intrinsic_load_printf_buffer_address:
case nir_intrinsic_load_printf_buffer_size:

View file

@ -1143,6 +1143,10 @@ system_value("blend_const_color_aaaa8888_unorm", 1)
# System value for internal compute shaders in radeonsi.
system_value("user_data_amd", 8)
# Whether to use load_frag_coord_xy() (true) or load_pixel_coord() (false), based on dynamic states.
intrinsic("load_use_float_frag_coord_xy_amd", dest_comp=1, bit_sizes=[1],
flags=[CAN_ELIMINATE, CAN_REORDER])
# Loads for gl_Color, for radeonsi which interpolates these in the shader
# prolog to handle flatshading and front/back color selection without
# recompiles and therefore doesn't handle them like normal varyings.