Merge branch 'radv_linking_cleanup' into 'main'

radv: Cleanup radv_link_shaders to remove some passes that aren't needed anymore.

See merge request mesa/mesa!33979
Timur Kristóf 2025-12-20 00:06:19 +00:00
commit e23c9ef9a1
3 changed files with 65 additions and 62 deletions


@@ -149,13 +149,8 @@ radv_nir_lower_io(struct radv_device *device, nir_shader *nir)
       NIR_PASS(_, nir, nir_lower_tess_level_array_vars_to_vec);
    }
 
-   if (nir->info.stage == MESA_SHADER_VERTEX) {
-      NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0);
-      NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4, nir_lower_io_lower_64bit_to_32);
-   } else {
-      NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4,
-               nir_lower_io_lower_64bit_to_32 | nir_lower_io_use_interpolated_input_intrinsics);
-   }
+   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4,
+            nir_lower_io_lower_64bit_to_32 | nir_lower_io_use_interpolated_input_intrinsics);
 
    /* Fold constant offset srcs for IO. */
    NIR_PASS(_, nir, nir_opt_constant_folding);


@@ -184,6 +184,22 @@ adjust_vertex_fetch_alpha(nir_builder *b, enum ac_vs_input_alpha_adjust alpha_ad
    return alpha;
 }
 
+static enum pipe_format
+adjust_format(const enum pipe_format attrib_format)
+{
+   if (util_format_get_max_channel_size(attrib_format) <= 32)
+      return attrib_format;
+
+   const struct util_format_description *f = util_format_description(attrib_format);
+
+   /* 1x 64-bit channel ~ 2x 32-bit channel */
+   if (f->nr_channels == 1)
+      return PIPE_FORMAT_R32G32_UINT;
+
+   /* 2x 64-bit channel ~ 4x 32-bit channel */
+   return PIPE_FORMAT_R32G32B32A32_UINT;
+}
+
 static nir_def *
 lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
 {
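For illustration only (not part of the patch): the mapping in adjust_format() is plain byte-size arithmetic. Below is a hedged, self-contained sketch using a hypothetical helper equivalent_32bit_channels() in place of Mesa's util_format queries; it shows why a single 64-bit channel becomes R32G32 and two 64-bit channels become R32G32B32A32.

#include <assert.h>

static unsigned
equivalent_32bit_channels(unsigned nr_channels, unsigned channel_bits)
{
   /* Formats whose widest channel already fits in 32 bits are kept as-is. */
   if (channel_bits <= 32)
      return nr_channels;

   /* Each 64-bit channel is re-expressed as two 32-bit channels. */
   return nr_channels * (channel_bits / 32);
}

int
main(void)
{
   assert(equivalent_32bit_channels(1, 64) == 2); /* R64    -> R32G32        */
   assert(equivalent_32bit_channels(2, 64) == 4); /* R64G64 -> R32G32B32A32  */
   assert(equivalent_32bit_channels(4, 32) == 4); /* 32-bit formats unchanged */
   return 0;
}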
@@ -192,10 +208,19 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
 
    const unsigned base_offset = nir_src_as_uint(*offset_src);
-   const unsigned location = io_sem.location + base_offset - VERT_ATTRIB_GENERIC0;
+   const unsigned loc = io_sem.location + base_offset - VERT_ATTRIB_GENERIC0;
    const unsigned bit_size = intrin->def.bit_size;
    const unsigned dest_num_components = intrin->def.num_components;
 
+   /* Check if the current slot is the high part of a 64-bit input.
+    * If so, correct the location and remember to add an offset.
+    */
+   const unsigned location =
+      loc > 0 && (s->gfx_state->vi.attributes_valid & (1 << (loc - 1))) &&
+            util_format_get_max_channel_size(s->gfx_state->vi.vertex_attribute_formats[loc - 1]) == 64
+         ? loc - 1 : loc;
+   const unsigned high_dvec2 = location == loc - 1;
+
    if (!(s->gfx_state->vi.attributes_valid & (1 << location))) {
       /* Return early for unassigned attribute reads. */
      return nir_imm_zero(b, intrin->def.num_components, intrin->def.bit_size);
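A hedged worked example of the detection above (the struct fake_vi_state and resolve_location() below are hypothetical stand-ins, not RADV's real state): a dvec3/dvec4 attribute bound at location N also spills into location N + 1, but only location N is marked valid with a 64-bit format, so a load addressed to N + 1 is folded back to N and flagged as the high half.

#include <assert.h>
#include <stdbool.h>

struct fake_vi_state {
   unsigned attributes_valid;     /* bitmask of valid attribute locations        */
   unsigned max_channel_bits[32]; /* widest channel of each attribute's format   */
};

static unsigned
resolve_location(const struct fake_vi_state *vi, unsigned loc, bool *high_dvec2)
{
   /* The slot is the high half if the previous slot holds a valid 64-bit attribute. */
   const bool prev_is_64bit =
      loc > 0 && (vi->attributes_valid & (1u << (loc - 1))) && vi->max_channel_bits[loc - 1] == 64;
   *high_dvec2 = prev_is_64bit;
   return prev_is_64bit ? loc - 1 : loc;
}

int
main(void)
{
   /* One dvec4 attribute at location 2: only location 2 is marked valid. */
   struct fake_vi_state vi = { .attributes_valid = 1u << 2 };
   vi.max_channel_bits[2] = 64;

   bool high;
   assert(resolve_location(&vi, 2, &high) == 2 && !high); /* low half  (x, y) */
   assert(resolve_location(&vi, 3, &high) == 2 && high);  /* high half (z, w) */
   return 0;
}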
@@ -209,7 +234,7 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
     * 64-bit variables must not have a component of 1 or 3.
     * (See VK spec 15.1.5 "Component Assignment")
     */
-   const unsigned component = nir_intrinsic_component(intrin) / (MAX2(32, bit_size) / 32);
+   const unsigned component = nir_intrinsic_component(intrin);
 
    /* Bitmask of components in bit_size units
    * of the current input load that are actually used.
@@ -225,7 +250,7 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    const uint32_t attrib_binding = s->gfx_state->vi.vertex_attribute_bindings[location];
    const uint32_t attrib_offset = s->gfx_state->vi.vertex_attribute_offsets[location];
    const uint32_t attrib_stride = s->gfx_state->vi.vertex_attribute_strides[location];
-   const enum pipe_format attrib_format = s->gfx_state->vi.vertex_attribute_formats[location];
+   const enum pipe_format attrib_format = adjust_format(s->gfx_state->vi.vertex_attribute_formats[location]);
    const struct util_format_description *f = util_format_description(attrib_format);
    const struct ac_vtx_format_info *vtx_info =
       ac_get_vtx_format_info(s->gpu_info->gfx_level, s->gpu_info->family, attrib_format);
@@ -255,14 +280,16 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
     * Beneficial because the backend may be able to emit fewer HW instructions.
     * Only possible with array formats.
     */
-   const unsigned first_used_channel = first_used_swizzled_channel(f, dest_use_mask, false);
+   const unsigned first_used_channel =
+      needs_swizzle ? first_used_swizzled_channel(f, dest_use_mask, false) : (ffs(dest_use_mask) - 1);
    const unsigned skipped_start = f->is_array ? first_used_channel : 0;
 
    /* Number of channels we actually use and load.
    * Don't shrink the format here because this might allow the backend to
    * emit fewer (but larger than needed) HW instructions.
    */
-   const unsigned first_trailing_unused_channel = first_used_swizzled_channel(f, dest_use_mask, true) + 1;
+   const unsigned first_trailing_unused_channel =
+      needs_swizzle ? (first_used_swizzled_channel(f, dest_use_mask, true) + 1) : util_last_bit(dest_use_mask);
    const unsigned max_loaded_channels = MIN2(first_trailing_unused_channel, f->nr_channels);
    const unsigned fetch_num_channels =
       first_used_channel >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;
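For the non-swizzled path added above, the first and one-past-last used channels come straight off the component mask. A small hedged sketch (using GCC/Clang builtins as stand-ins for Mesa's ffs() and util_last_bit()):

#include <assert.h>

static unsigned
first_set_bit(unsigned mask) /* behaves like ffs(mask) - 1 for mask != 0 */
{
   return (unsigned)__builtin_ctz(mask);
}

static unsigned
last_bit(unsigned mask) /* behaves like util_last_bit(mask) */
{
   return mask ? 32u - (unsigned)__builtin_clz(mask) : 0u;
}

int
main(void)
{
   /* Only .y and .z of the destination are used: mask = 0b0110. */
   const unsigned dest_use_mask = 0x6;
   assert(first_set_bit(dest_use_mask) == 1); /* first used channel            */
   assert(last_bit(dest_use_mask) == 3);      /* first trailing unused channel */
   return 0;
}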
@@ -287,7 +314,7 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    nir_def *index = base_index;
 
    /* Add excess constant offset to the index. */
-   unsigned const_off = attrib_offset + count_format_bytes(f, 0, start);
+   unsigned const_off = attrib_offset + high_dvec2 * 16 + count_format_bytes(f, 0, start);
    if (attrib_stride && const_off >= attrib_stride) {
       index = nir_iadd_imm(b, base_index, const_off / attrib_stride);
       const_off %= attrib_stride;
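Worked byte arithmetic for the "high_dvec2 * 16" term (illustrative sketch with a hypothetical split_offset() helper, not the driver's code): the low dvec2 half of a dvec3/dvec4 occupies 2 x 8 = 16 bytes, so the high half starts 16 bytes into the attribute; if the resulting constant offset reaches the binding stride, whole strides are folded into the fetch index as in the hunk above.

#include <assert.h>

struct fetch_addr {
   unsigned index_add; /* whole vertex strides folded into the fetch index */
   unsigned const_off; /* remaining constant byte offset                   */
};

static struct fetch_addr
split_offset(unsigned attrib_offset, unsigned high_dvec2, unsigned stride)
{
   /* The high dvec2 half starts 16 bytes after the attribute's base offset. */
   unsigned const_off = attrib_offset + high_dvec2 * 16;
   struct fetch_addr a = { 0, const_off };
   if (stride && const_off >= stride) {
      a.index_add = const_off / stride;
      a.const_off = const_off % stride;
   }
   return a;
}

int
main(void)
{
   /* dvec4 attribute at byte offset 8 in a 32-byte-stride binding. */
   struct fetch_addr lo = split_offset(8, 0, 32);
   struct fetch_addr hi = split_offset(8, 1, 32);
   assert(lo.index_add == 0 && lo.const_off == 8);  /* low half:  offset 8      */
   assert(hi.index_add == 0 && hi.const_off == 24); /* high half: offset 8 + 16 */

   /* With a 16-byte stride, the high half wraps one whole stride into the index. */
   struct fetch_addr wrap = split_offset(8, 1, 16);
   assert(wrap.index_add == 1 && wrap.const_off == 8);
   return 0;
}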


@@ -1342,30 +1342,6 @@ radv_link_shaders(const struct radv_device *device, struct radv_shader_stage *pr
    if (gfx_state->enable_remove_point_size)
       radv_remove_point_size(gfx_state, producer, consumer);
 
-   if (nir_link_opt_varyings(producer, consumer)) {
-      nir_validate_shader(producer, "after nir_link_opt_varyings");
-      nir_validate_shader(consumer, "after nir_link_opt_varyings");
-
-      NIR_PASS(_, consumer, nir_opt_constant_folding);
-      NIR_PASS(_, consumer, nir_opt_algebraic);
-      NIR_PASS(_, consumer, nir_opt_dce);
-   }
-
-   NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
-   NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
-
-   nir_remove_unused_varyings(producer, consumer);
-   nir_compact_varyings(producer, consumer, true);
-
-   nir_validate_shader(producer, "after nir_compact_varyings");
-   nir_validate_shader(consumer, "after nir_compact_varyings");
-
-   if (producer->info.stage == MESA_SHADER_MESH) {
-      /* nir_compact_varyings can change the location of per-vertex and per-primitive outputs */
-      nir_shader_gather_info(producer, nir_shader_get_entrypoint(producer));
-   }
-
    const bool has_geom_or_tess =
       consumer->info.stage == MESA_SHADER_GEOMETRY || consumer->info.stage == MESA_SHADER_TESS_CTRL;
    const bool merged_gs = consumer->info.stage == MESA_SHADER_GEOMETRY && gfx_level >= GFX9;
@@ -1374,11 +1350,6 @@ radv_link_shaders(const struct radv_device *device, struct radv_shader_stage *pr
        (producer->info.stage == MESA_SHADER_VERTEX && has_geom_or_tess) ||
        (producer->info.stage == MESA_SHADER_TESS_EVAL && merged_gs)) {
       NIR_PASS(_, producer, nir_opt_vectorize_io_vars, nir_var_shader_out);
-
-      if (producer->info.stage == MESA_SHADER_TESS_CTRL)
-         NIR_PASS(_, producer, nir_lower_tess_level_array_vars_to_vec);
-
-      NIR_PASS(_, producer, nir_opt_combine_stores, nir_var_shader_out);
    }
 
    if (consumer->info.stage == MESA_SHADER_GEOMETRY || consumer->info.stage == MESA_SHADER_TESS_CTRL ||
@@ -1703,29 +1674,33 @@ radv_graphics_shaders_link_varyings(struct radv_shader_stage *stages, enum amd_g
    /* Prepare shaders before running nir_opt_varyings. */
    for (int i = 0; i < ARRAY_SIZE(graphics_shader_order); ++i) {
       const mesa_shader_stage s = graphics_shader_order[i];
-      const mesa_shader_stage next = stages[s].info.next_stage;
-      if (!stages[s].nir || next == MESA_SHADER_NONE || !stages[next].nir)
+      if (!stages[s].nir)
          continue;
-      if (stages[s].key.optimisations_disabled || stages[next].key.optimisations_disabled)
+      if (stages[s].key.optimisations_disabled)
         continue;
 
       nir_shader *producer = stages[s].nir;
-      nir_shader *consumer = stages[next].nir;
 
       /* It is expected by nir_opt_varyings that no undefined stores are present in the shader. */
       NIR_PASS(_, producer, nir_opt_undef);
 
       /* Update load/store alignments because inter-stage code motion may move instructions used to deduce this info. */
-      NIR_PASS(_, consumer, nir_opt_load_store_update_alignments);
+      NIR_PASS(_, producer, nir_opt_load_store_update_alignments);
 
       /* Scalarize all I/O, because nir_opt_varyings and nir_opt_vectorize_io expect all I/O to be scalarized. */
-      NIR_PASS(_, producer, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
-      NIR_PASS(_, consumer, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
+      nir_variable_mode sca_mode = nir_var_shader_in;
+      bool sca_progress;
+
+      if (s != MESA_SHADER_FRAGMENT)
+         sca_mode |= nir_var_shader_out;
+
+      NIR_PASS(sca_progress, producer, nir_lower_io_to_scalar, sca_mode, NULL, NULL);
 
-      /* Eliminate useless vec->mov copies resulting from scalarization. */
-      NIR_PASS(_, producer, nir_opt_copy_prop);
-      NIR_PASS(_, producer, nir_opt_constant_folding);
+      if (sca_progress) {
+         /* Eliminate useless vec->mov copies resulting from scalarization. */
+         NIR_PASS(_, producer, nir_opt_copy_prop);
+         NIR_PASS(_, producer, nir_opt_constant_folding);
+      }
    }
 
    int highest_changed_producer = -1;
@@ -1787,22 +1762,28 @@ radv_graphics_shaders_link_varyings(struct radv_shader_stage *stages, enum amd_g
    /* Run optimizations and fixups after linking. */
    for (int i = 0; i < ARRAY_SIZE(graphics_shader_order); ++i) {
       const mesa_shader_stage s = graphics_shader_order[i];
-      const mesa_shader_stage next = stages[s].info.next_stage;
       if (!stages[s].nir)
          continue;
 
       nir_shader *producer = stages[s].nir;
 
-      /* Re-vectorize I/O for stages that output to memory (LDS or VRAM).
-       * Don't vectorize FS inputs, doing so just regresses shader stats without any benefit.
-       * There is also no benefit from re-vectorizing the outputs of the last pre-rasterization
-       * stage here, because ac_nir_lower_ngg/legacy already takes care of that.
+      /* Re-vectorize I/O for stages that use memory for I/O (LDS or VRAM).
+       * Don't vectorize FS I/O, doing so just regresses shader stats without any benefit.
        */
-      if (next != MESA_SHADER_NONE && stages[next].nir && next != MESA_SHADER_FRAGMENT &&
-          !stages[s].key.optimisations_disabled && !stages[next].key.optimisations_disabled) {
-         nir_shader *consumer = stages[next].nir;
-         NIR_PASS(_, producer, nir_opt_vectorize_io, nir_var_shader_out, false);
-         NIR_PASS(_, consumer, nir_opt_vectorize_io, nir_var_shader_in, false);
+      if (s != MESA_SHADER_FRAGMENT && !stages[s].key.optimisations_disabled) {
+         /* Delete dead instructions to prevent them from being vectorized. */
+         NIR_PASS(_, producer, nir_opt_dce);
+
+         /* Vectorize all inputs. Non-FS inputs are always read from memory. */
+         nir_variable_mode vec_mode = nir_var_shader_in;
+
+         /* There is also no benefit from re-vectorizing the outputs of the last pre-rasterization
+          * stage here, because ac_nir_lower_ngg/legacy already takes care of that.
+          */
+         if (!radv_is_last_vgt_stage(&stages[s]))
+            vec_mode |= nir_var_shader_out;
+
+         NIR_PASS(_, producer, nir_opt_vectorize_io, vec_mode, true);
       }
 
       /* Gather shader info; at least the I/O info likely changed