diff --git a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c index 694ec55a5a8..970d32eeb6b 100644 --- a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c +++ b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c @@ -144,12 +144,6 @@ typedef struct { */ bool tcs_out_patch_fits_subgroup; - /* Save TCS tess factor for tess factor writer. */ - nir_variable *tcs_tess_level_outer; - nir_variable *tcs_tess_level_inner; - unsigned tcs_tess_level_outer_mask; - unsigned tcs_tess_level_inner_mask; - /* TCS output values, 8 channels per slot. The last 4 channels are high 16 bits of the first 4 channels. * Output values that are not stored with cross-invocation access and indirect indexing are stored here. * Output values stored with cross-invocation access or indirect indexing are stored in LDS. @@ -158,6 +152,13 @@ typedef struct { nir_variable *tcs_per_vertex_outputs[VARYING_SLOT_MAX][8]; /* Max. 4 channels, always 32 bits per channel. */ uint8_t tcs_per_vertex_output_vmem_chan_mask[VARYING_SLOT_MAX]; + + /* Same, but for tess levels. LDS isn't used if only invocation 0 writes and reads tess levels or + * if all invocations write tess levels. + */ + nir_variable *tcs_tess_level[2]; /* outer, inner */ + /* We can't use uint8_t due to a buggy gcc warning. */ + uint16_t tcs_tess_level_chan_mask[2]; /* outer, inner */ } lower_tess_io_state; typedef struct { @@ -179,15 +180,41 @@ ac_nir_get_tess_io_info(const nir_shader *tcs, const nir_tcs_info *tcs_info, uin io_info->vram_output_mask &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_ID | VARYING_BIT_PRIMITIVE_SHADING_RATE); + /* Convert tess levels from 2-bit masks to 32-bit varying slot masks. 
*/ + uint32_t tess_levels_defined_by_all_invoc = + (uint32_t)tcs_info->tess_levels_defined_by_all_invoc << VARYING_SLOT_TESS_LEVEL_OUTER; + uint32_t tess_levels_only_written_by_invoc0 = + (uint32_t)tcs_info->tess_levels_only_written_by_invoc0 << VARYING_SLOT_TESS_LEVEL_OUTER; + uint32_t tess_levels_only_read_by_invoc0 = + (uint32_t)tcs_info->tess_levels_only_read_by_invoc0 << VARYING_SLOT_TESS_LEVEL_OUTER; + + /* Per-patch outputs and tess levels don't need LDS if: + * - There is no indirect indexing + * AND + * - only written by invocation 0 and never read or only read by invocation 0 + * (always true when the number of output patch vertices is 1) + * OR + * - written by all invocations in all execution paths (so that output reads can always + * return values from VGPRs instead of LDS) + */ + uint32_t tess_levels_written = tcs->info.outputs_written & TESS_LVL_MASK; + uint32_t tess_levels_dont_need_lds = + tess_levels_written & ~tcs->info.outputs_read_indirectly & ~tcs->info.outputs_written_indirectly & + ((tess_levels_only_written_by_invoc0 & ~tcs->info.outputs_read) | + (tess_levels_only_written_by_invoc0 & tess_levels_only_read_by_invoc0) | + tess_levels_defined_by_all_invoc); + + /* Determine which outputs use LDS. */ io_info->lds_output_mask = (((tcs->info.outputs_read & tcs->info.outputs_written) | tcs->info.tess.tcs_cross_invocation_outputs_written | tcs->info.outputs_written_indirectly) & ~TESS_LVL_MASK) | - (tcs_info->all_invocations_define_tess_levels ? 
- 0 : (tcs->info.outputs_written & TESS_LVL_MASK)); + (tess_levels_written & ~tess_levels_dont_need_lds); io_info->lds_patch_output_mask = tcs->info.patch_outputs_read & tcs->info.patch_outputs_written; io_info->vgpr_output_mask = (tcs->info.outputs_written & ~(tcs->info.tess.tcs_cross_invocation_outputs_written | - tcs->info.outputs_written_indirectly) & ~TESS_LVL_MASK); + tcs->info.outputs_written_indirectly) & ~TESS_LVL_MASK) | + (tess_levels_written & + (tess_levels_defined_by_all_invoc | tess_levels_only_written_by_invoc0)); io_info->highest_remapped_vram_output = 0; io_info->highest_remapped_vram_patch_output = 0; @@ -674,24 +701,15 @@ lower_hs_output_store(nir_builder *b, st->tcs_per_vertex_outputs[semantics.location]); } - /* Save tess factor to be used by tess factor writer or reconstruct - * store output instruction later. - */ + /* Save tess levels that don't need to be stored in LDS into local variables. */ if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER || semantics.location == VARYING_SLOT_TESS_LEVEL_OUTER) { - if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER) { - st->tcs_tess_level_inner_mask |= write_mask << component; + unsigned i = semantics.location - VARYING_SLOT_TESS_LEVEL_OUTER; - if (st->tcs_info.all_invocations_define_tess_levels) - ac_nir_store_var_components(b, st->tcs_tess_level_inner, store_val, - component, write_mask); - } else { - st->tcs_tess_level_outer_mask |= write_mask << component; + st->tcs_tess_level_chan_mask[i] |= write_mask << component; - if (st->tcs_info.all_invocations_define_tess_levels) - ac_nir_store_var_components(b, st->tcs_tess_level_outer, store_val, - component, write_mask); - } + if (st->io_info.vgpr_output_mask & BITFIELD64_BIT(semantics.location)) + ac_nir_store_var_components(b, st->tcs_tess_level[i], store_val, component, write_mask); } return NIR_LOWER_INSTR_PROGRESS_REPLACE; @@ -703,19 +721,16 @@ lower_hs_output_load(nir_builder *b, lower_tess_io_state *st) { const nir_io_semantics io_sem = 
nir_intrinsic_io_semantics(intrin); - const bool is_tess_factor = io_sem.location == VARYING_SLOT_TESS_LEVEL_INNER || - io_sem.location == VARYING_SLOT_TESS_LEVEL_OUTER; - if (is_tess_factor && st->tcs_info.all_invocations_define_tess_levels) { + if ((io_sem.location == VARYING_SLOT_TESS_LEVEL_INNER || + io_sem.location == VARYING_SLOT_TESS_LEVEL_OUTER) && + !tcs_output_needs_lds(intrin, b->shader, st)) { const unsigned component = nir_intrinsic_component(intrin); const unsigned num_components = intrin->def.num_components; const unsigned bit_size = intrin->def.bit_size; + unsigned i = io_sem.location - VARYING_SLOT_TESS_LEVEL_OUTER; - nir_def *var = - io_sem.location == VARYING_SLOT_TESS_LEVEL_OUTER - ? nir_load_var(b, st->tcs_tess_level_outer) - : nir_load_var(b, st->tcs_tess_level_inner); - + nir_def *var = nir_load_var(b, st->tcs_tess_level[i]); return nir_extract_bits(b, &var, 1, component * bit_size, num_components, bit_size); } @@ -783,51 +798,41 @@ static tess_levels hs_load_tess_levels(nir_builder *b, lower_tess_io_state *st) { - unsigned outer_comps, inner_comps; + unsigned output_comps[2]; mesa_count_tess_level_components(b->shader->info.tess._primitive_mode, - &outer_comps, &inner_comps); + &output_comps[0], &output_comps[1]); - nir_def *outer = NULL; - nir_def *inner = NULL; + nir_def *outputs[2] = {0}; + nir_def *lds_base = NULL; - if (st->tcs_info.all_invocations_define_tess_levels) { - if (st->tcs_tess_level_outer_mask) { - outer = nir_load_var(b, st->tcs_tess_level_outer); - outer = nir_trim_vector(b, outer, outer_comps); + for (unsigned i = 0; i < 2; i++) { + if (!output_comps[i] || !st->tcs_tess_level_chan_mask[i]) { + /* Set tess levels to zero if the shader doesn't write them. 
*/ + if (output_comps[i]) + outputs[i] = nir_imm_zero(b, output_comps[i], 32); + continue; } - if (inner_comps && st->tcs_tess_level_inner_mask) { - inner = nir_load_var(b, st->tcs_tess_level_inner); - inner = nir_trim_vector(b, inner, inner_comps); + if (st->io_info.vgpr_output_mask & BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER + i)) { + outputs[i] = nir_load_var(b, st->tcs_tess_level[i]); + outputs[i] = nir_trim_vector(b, outputs[i], output_comps[i]); + continue; } - } else { + /* Base LDS address of per-patch outputs in the current patch. */ - nir_def *lds_base = hs_output_lds_offset(b, st, 0, 0, NULL, NULL); + if (!lds_base) + lds_base = hs_output_lds_offset(b, st, 0, 0, NULL, NULL); - /* Load all tessellation factors (aka. tess levels) from LDS. */ - if (st->tcs_tess_level_outer_mask) { - const unsigned mapped = hs_output_lds_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_OUTER, st); - outer = nir_load_shared(b, outer_comps, 32, lds_base, .base = mapped * 16); - } - - if (inner_comps && st->tcs_tess_level_inner_mask) { - const unsigned mapped = hs_output_lds_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_INNER, st); - inner = nir_load_shared(b, inner_comps, 32, lds_base, .base = mapped * 16); - } + /* Load tessellation levels from LDS. */ + const unsigned mapped = hs_output_lds_map_io_location(b->shader, false, + VARYING_SLOT_TESS_LEVEL_OUTER + i, st); + outputs[i] = nir_load_shared(b, output_comps[i], 32, lds_base, .base = mapped * 16); } - /* Set tess factor to zero if the shader did not write them. 
*/ - if (!outer) - outer = nir_imm_zero(b, outer_comps, 32); - if (inner_comps && !inner) - inner = nir_imm_zero(b, inner_comps, 32); - - tess_levels r = { - .outer = outer, - .inner = inner, + return (tess_levels){ + .outer = outputs[0], + .inner = outputs[1], }; - - return r; } static void @@ -1136,23 +1141,19 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes /* For linked shaders, we must only write the tess factors that the TES actually reads, * otherwise we would write to a memory location reserved for another per-patch output. */ - const bool tes_reads_outer = st->io_info.vram_output_mask & VARYING_BIT_TESS_LEVEL_OUTER; - const bool tes_reads_inner = st->io_info.vram_output_mask & VARYING_BIT_TESS_LEVEL_INNER; + for (unsigned i = 0; i < 2; i++) { + nir_def *output_value = i ? tessfactors.inner : tessfactors.outer; - if (st->tcs_tess_level_outer_mask && tes_reads_outer) { - nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_OUTER, 0, zero, 0, NULL); + if (!output_value || !(st->io_info.vram_output_mask & (VARYING_BIT_TESS_LEVEL_OUTER << i))) + continue; - nir_store_buffer_amd(b, tessfactors.outer, hs_ring_tess_offchip, - vmem_off_outer, offchip_offset, zero, - .memory_modes = nir_var_shader_out, - .access = ACCESS_COHERENT); - } + nir_def *vmem_off = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_OUTER + i, 0, zero, 0, NULL); - if (tessfactors.inner && st->tcs_tess_level_inner_mask && tes_reads_inner) { - nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_INNER, 0, zero, 0, NULL); - - nir_store_buffer_amd(b, tessfactors.inner, hs_ring_tess_offchip, - vmem_off_inner, offchip_offset, zero, + /* Always store whole vec4s to get cached bandwidth. Non-vec4 stores cause implicit memory loads + * to fill the rest of cache lines with this layout. 
 + */ + nir_store_buffer_amd(b, nir_pad_vec4(b, output_value), hs_ring_tess_offchip, vmem_off, + offchip_offset, zero, .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT); } @@ -1200,8 +1201,7 @@ hs_finale(nir_shader *shader, lower_tess_io_state *st) nir_builder *b = &builder; /* This is to avoid the & */ /* Insert a barrier to wait for output stores to LDS. */ - if (!st->tcs_info.all_invocations_define_tess_levels || - shader->info.outputs_written & ~st->io_info.vgpr_output_mask) { + if (shader->info.outputs_written & ~st->io_info.vgpr_output_mask) { mesa_scope scope = st->tcs_out_patch_fits_subgroup ? SCOPE_SUBGROUP : SCOPE_WORKGROUP; nir_barrier(b, .execution_scope = scope, .memory_scope = scope, .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_shared); @@ -1488,12 +1488,9 @@ ac_nir_lower_hs_outputs_to_mem(nir_shader *shader, const nir_tcs_info *info, .map_io = map, }; - if (state.tcs_info.all_invocations_define_tess_levels) { - nir_function_impl *impl = nir_shader_get_entrypoint(shader); - state.tcs_tess_level_outer = - nir_local_variable_create(impl, glsl_vec4_type(), "tess outer"); - state.tcs_tess_level_inner = - nir_local_variable_create(impl, glsl_vec4_type(), "tess inner"); + for (unsigned i = 0; i < 2; i++) { + state.tcs_tess_level[i] = + nir_local_variable_create(nir_shader_get_entrypoint(shader), glsl_vec4_type(), i ? "tess inner" : "tess outer"); } nir_shader_lower_instructions(shader,