diff --git a/src/amd/vulkan/nir/radv_nir_lower_abi.c b/src/amd/vulkan/nir/radv_nir_lower_abi.c index 82e91040c1f..3da113266fe 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_abi.c +++ b/src/amd/vulkan/nir/radv_nir_lower_abi.c @@ -441,6 +441,9 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) /* Load the primitive topology from an user SGPR when it's unknown at compile time (GPL). */ replacement = GET_SGPR_FIELD_NIR(s->args->ps_state, PS_STATE_RAST_PRIM); break; + case nir_intrinsic_load_use_float_frag_coord_xy_amd: + replacement = nir_ine_imm(b, GET_SGPR_FIELD_NIR(s->args->ps_state, PS_STATE_USE_FLOAT_FRAG_COORD_XY), 0); + break; default: progress = false; break; diff --git a/src/amd/vulkan/nir/radv_nir_lower_opt_fs_frag_pos.c b/src/amd/vulkan/nir/radv_nir_lower_opt_fs_frag_pos.c index 62f05646802..c3c523b5d9c 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_opt_fs_frag_pos.c +++ b/src/amd/vulkan/nir/radv_nir_lower_opt_fs_frag_pos.c @@ -9,6 +9,20 @@ * not both. * * sample_pos counts as a frag_coord_xy use and is lowered to frag_coord_xy here. + * + * If frag_coord_xy survives and both components are used and sample_pos isn't used, it becomes: + * load_use_float_frag_coord_xy_amd() ? load_frag_coord_xy() : u2f32(load_pixel_coord()) + 0.5; + * + * If load_use_float_frag_coord_xy_amd==false, load_frag_coord_xy() returns uninitialized values. + * If load_use_float_frag_coord_xy_amd==true, load_pixel_coord() returns uninitialized values. + * + * SPI_PS_INPUT_ENA is used to disable VGPR initialization for frag_coord_xy (POS_X_FLOAT, + * POS_Y_FLOAT) or pixel_coord (POS_FIXED_PT) while SPI_PS_INPUT_ADDR keeps them at the same + * VGPR locations. Reducing the number of initialized VGPRs increases the PS wave launch rate, + * which increases observed pixel throughput depending on other states. + * + * load_use_float_frag_coord_xy_amd() comes from a user SGPR, and determines which VGPRs are + * initialized at PS wave launch. 
*/ #include "nir_builder.h" @@ -21,10 +35,12 @@ typedef struct { bool has_frag_coord_xy_float_use; bool has_pixel_coord; bool has_sample_pos; + uint8_t comp_usage_mask; /* lower_frag_coord_and_pixel_coord */ bool lower_to_pixel_coord; bool lower_to_frag_coord_xy; + bool select_frag_coord_xy_dynamically; } opt_fs_frag_coord_and_pixel_coord_state; static bool @@ -45,10 +61,12 @@ gather_fs_frag_pos(nir_builder *b, nir_intrinsic_instr *intr, void *data) state->has_frag_coord_xy |= intr->intrinsic == nir_intrinsic_load_frag_coord_xy; state->has_sample_pos |= intr->intrinsic == nir_intrinsic_load_sample_pos; + state->comp_usage_mask |= nir_def_components_read(&intr->def); return false; case nir_intrinsic_load_pixel_coord: state->has_pixel_coord = true; + state->comp_usage_mask |= nir_def_components_read(&intr->def); return false; default: @@ -68,6 +86,10 @@ lower_fs_frag_pos(nir_builder *b, nir_intrinsic_instr *intr, void *data) if (state->lower_to_pixel_coord) { nir_def_replace(&intr->def, nir_fadd_imm(b, nir_u2f32(b, nir_load_pixel_coord(b)), 0.5)); return true; + } else if (state->select_frag_coord_xy_dynamically) { + nir_def_replace(&intr->def, nir_bcsel(b, nir_load_use_float_frag_coord_xy_amd(b), nir_load_frag_coord_xy(b), + nir_fadd_imm(b, nir_u2f32(b, nir_load_pixel_coord(b)), 0.5))); + return true; } return false; @@ -82,7 +104,12 @@ lower_fs_frag_pos(nir_builder *b, nir_intrinsic_instr *intr, void *data) case nir_intrinsic_load_pixel_coord: if (state->lower_to_frag_coord_xy) { - nir_def_replace(&intr->def, nir_f2u16(b, nir_load_frag_coord_xy(b))); + if (state->select_frag_coord_xy_dynamically) { + nir_def_replace(&intr->def, nir_bcsel(b, nir_load_use_float_frag_coord_xy_amd(b), + nir_f2u16(b, nir_load_frag_coord_xy(b)), nir_load_pixel_coord(b))); + } else { + nir_def_replace(&intr->def, nir_f2u16(b, nir_load_frag_coord_xy(b))); + } return true; } return false; @@ -109,8 +136,10 @@ radv_nir_lower_opt_fs_frag_pos(nir_shader *shader, bool force_pixel_coord) 
(state.has_frag_coord_xy || state.has_sample_pos) && !state.has_frag_coord_xy_float_use; state.lower_to_frag_coord_xy = (state.has_pixel_coord || state.has_sample_pos) && state.has_frag_coord_xy_float_use; + state.select_frag_coord_xy_dynamically = + state.has_frag_coord_xy_float_use && !state.has_sample_pos && state.comp_usage_mask == 0x3; - if (!state.lower_to_pixel_coord && !state.lower_to_frag_coord_xy) + if (!state.lower_to_pixel_coord && !state.lower_to_frag_coord_xy && !state.select_frag_coord_xy_dynamically) return false; return nir_shader_intrinsics_pass(shader, lower_fs_frag_pos, nir_metadata_control_flow, &state); diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index a412e298102..5349c01ce3a 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -8786,6 +8786,16 @@ radv_bind_pre_rast_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_ cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH; } + if (cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT] && + cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]->info.ps.selects_frag_coord_xy_dynamically && + (!cmd_buffer->state.last_vgt_shader || + /* We just want to know whether the VRS output changes between enabled and disabled. 
*/ + (cmd_buffer->state.last_vgt_shader->info.outinfo.writes_primitive_shading_rate || + cmd_buffer->state.last_vgt_shader->info.outinfo.writes_primitive_shading_rate_per_primitive) != + (shader->info.outinfo.writes_primitive_shading_rate || + shader->info.outinfo.writes_primitive_shading_rate_per_primitive))) + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PS_STATE; + cmd_buffer->state.last_vgt_shader = (struct radv_shader *)shader; } } @@ -11691,7 +11701,29 @@ radv_emit_ps_state(struct radv_cmd_buffer *cmd_buffer) if (!ps) return; - const uint32_t spi_ps_input_ena = ps->config.spi_ps_input_ena; + uint32_t spi_ps_input_ena = ps->config.spi_ps_input_ena; + bool use_float_frag_coord_xy = false; + + if (ps->info.ps.selects_frag_coord_xy_dynamically) { + /* The shader selects frag_coord_xy/pixel_coord dynamically depending on a flag in PS_STATE + * that depends on the following dynamic state while preferring pixel_coord (POS_FIXED_PT) + * if possible due to lower VGPR initialization cost. + */ + use_float_frag_coord_xy = + /* Whether VRS can be other than 1x1. */ + cmd_buffer->state.dynamic.vk.fsr.fragment_size.width != 1 || + cmd_buffer->state.dynamic.vk.fsr.fragment_size.height != 1 || cmd_buffer->state.render.vrs_att.iview || + cmd_buffer->state.last_vgt_shader->info.outinfo.writes_primitive_shading_rate || + cmd_buffer->state.last_vgt_shader->info.outinfo.writes_primitive_shading_rate_per_primitive || + radv_is_sample_shading_enabled(cmd_buffer, NULL); + + /* Disable the initialized PS VGPRs that the shader doesn't use. 
*/ + if (use_float_frag_coord_xy) + spi_ps_input_ena &= C_0286CC_POS_FIXED_PT_ENA; + else + spi_ps_input_ena &= C_0286CC_POS_X_FLOAT_ENA & C_0286CC_POS_Y_FLOAT_ENA; + } + struct radv_cmd_stream *cs = cmd_buffer->cs; radeon_begin(cs); @@ -11711,7 +11743,8 @@ radv_emit_ps_state(struct radv_cmd_buffer *cmd_buffer) const unsigned ps_state = SET_SGPR_FIELD(PS_STATE_NUM_SAMPLES, rasterization_samples) | SET_SGPR_FIELD(PS_STATE_PS_ITER_MASK, ps_iter_mask) | SET_SGPR_FIELD(PS_STATE_LINE_RAST_MODE, line_rast_mode) | - SET_SGPR_FIELD(PS_STATE_RAST_PRIM, vgt_outprim_type); + SET_SGPR_FIELD(PS_STATE_RAST_PRIM, vgt_outprim_type) | + SET_SGPR_FIELD(PS_STATE_USE_FLOAT_FRAG_COORD_XY, use_float_frag_coord_xy); if (pdev->info.gfx_level >= GFX12) { gfx12_push_sh_reg(ps_state_offset, ps_state); @@ -12830,7 +12863,7 @@ radv_validate_dynamic_states(struct radv_cmd_buffer *cmd_buffer, uint64_t dynami cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL; if (dynamic_states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FSR_STATE; + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FSR_STATE | RADV_CMD_DIRTY_PS_STATE; if (dynamic_states & RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE) cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RAST_SAMPLES_STATE; diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 88153bae79d..a6be2a72113 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -237,14 +237,16 @@ struct radv_llvm_compiler_options { #define NGG_STATE_QUERY__SHIFT 6 #define NGG_STATE_QUERY__MASK 0x7 -#define PS_STATE_NUM_SAMPLES__SHIFT 0 -#define PS_STATE_NUM_SAMPLES__MASK 0xf -#define PS_STATE_LINE_RAST_MODE__SHIFT 4 -#define PS_STATE_LINE_RAST_MODE__MASK 0x3 -#define PS_STATE_PS_ITER_MASK__SHIFT 6 -#define PS_STATE_PS_ITER_MASK__MASK 0xffff -#define PS_STATE_RAST_PRIM__SHIFT 22 -#define PS_STATE_RAST_PRIM__MASK 0x3 +#define PS_STATE_NUM_SAMPLES__SHIFT 0 +#define PS_STATE_NUM_SAMPLES__MASK 0xf +#define 
PS_STATE_LINE_RAST_MODE__SHIFT 4 +#define PS_STATE_LINE_RAST_MODE__MASK 0x3 +#define PS_STATE_PS_ITER_MASK__SHIFT 6 +#define PS_STATE_PS_ITER_MASK__MASK 0xffff +#define PS_STATE_RAST_PRIM__SHIFT 22 +#define PS_STATE_RAST_PRIM__MASK 0x3 +#define PS_STATE_USE_FLOAT_FRAG_COORD_XY__SHIFT 24 +#define PS_STATE_USE_FLOAT_FRAG_COORD_XY__MASK 0x1 struct radv_shader_layout { uint32_t num_sets; diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index d23fad19dab..6ce9a50fdcb 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -420,6 +420,9 @@ radv_ps_needs_state_sgpr(const struct radv_shader_info *info, const struct radv_ if (info->ps.load_rasterization_prim && gfx_state->unknown_rast_prim) return true; + if (info->ps.selects_frag_coord_xy_dynamically) + return true; + return false; } diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index b90d4ce4b28..6037d22f9b9 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -312,6 +312,9 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, s case nir_intrinsic_begin_invocation_interlock: info->ps.pops = true; break; + case nir_intrinsic_load_use_float_frag_coord_xy_amd: + info->ps.selects_frag_coord_xy_dynamically = true; + break; default: break; } diff --git a/src/amd/vulkan/radv_shader_info.h b/src/amd/vulkan/radv_shader_info.h index 043e515df97..916075e3465 100644 --- a/src/amd/vulkan/radv_shader_info.h +++ b/src/amd/vulkan/radv_shader_info.h @@ -213,6 +213,7 @@ struct radv_shader_info { bool load_rasterization_prim : 1; bool force_sample_iter_shading_rate : 1; bool allow_flat_shading : 1; + bool selects_frag_coord_xy_dynamically : 1; bool has_epilog : 1; } ps; diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index f658e65ed87..49979136dda 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ 
b/src/compiler/nir/nir_divergence_analysis.c @@ -351,6 +351,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_fbfetch_image_fmask_desc_amd: case nir_intrinsic_load_fbfetch_image_desc_amd: case nir_intrinsic_load_polygon_stipple_buffer_amd: + case nir_intrinsic_load_use_float_frag_coord_xy_amd: case nir_intrinsic_load_tcs_mem_attrib_stride: case nir_intrinsic_load_printf_buffer_address: case nir_intrinsic_load_printf_buffer_size: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 9188fab1f3a..19443980525 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1143,6 +1143,10 @@ system_value("blend_const_color_aaaa8888_unorm", 1) # System value for internal compute shaders in radeonsi. system_value("user_data_amd", 8) +# Whether to use load_frag_coord_xy() or load_pixel_coord() based on dynamic states. +intrinsic("load_use_float_frag_coord_xy_amd", dest_comp=1, bit_sizes=[1], + flags=[CAN_ELIMINATE, CAN_REORDER]) + # Loads for gl_Color, for radeonsi which interpolates these in the shader # prolog to handle flatshading and front/back color selection without # recompiles and therefore doesn't handle them like normal varyings.