ac/nir: add LDS layout info for GSVS and XFB to ac_nir_prerast_per_output_info

This will be used to reduce the NGG LDS size for uncompacted GS and XFB
outputs.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35351>
This commit is contained in:
Marek Olšák 2025-05-27 05:11:58 -04:00 committed by Marge Bot
parent 39a9dce5fc
commit ebdd97a993
2 changed files with 79 additions and 0 deletions

View file

@ -56,6 +56,19 @@ typedef struct
uint8_t as_varying_mask : 4;
/* Bitmask of components that are used as sysval, 1 bit per component. */
uint8_t as_sysval_mask : 4;
/* Prefix sum over all component masks. Used by the GS outputs in LDS for NGG GS.
* This is set even if components_mask is 0, in which case it's the offset after the last output.
*/
uint16_t packed_slot_gs_out_offset : 12;
/* Prefix sum over all component masks. Used by XFB outputs in LDS for NGG VS and TES.
* This is set even if xfb_lds_components_mask is 0, in which case it's the offset after the last output.
* For NGG GS, it's equal to packed_slot_gs_out_offset because NGG GS has all outputs in LDS.
*/
uint16_t packed_slot_xfb_lds_offset : 12;
/* Bitmask of components written by XFB: 4 bits per slot, 1 bit per component.
* For NGG GS, it's equal to components_mask because NGG GS has all outputs in LDS.
*/
uint8_t xfb_lds_components_mask : 4;
} ac_nir_prerast_per_output_info;
typedef struct
@ -71,6 +84,10 @@ typedef struct
ac_nir_prerast_per_output_info infos[VARYING_SLOT_MAX];
ac_nir_prerast_per_output_info infos_16bit_lo[16];
ac_nir_prerast_per_output_info infos_16bit_hi[16];
/* The size of all components, packed. */
uint16_t total_packed_gs_out_size;
uint16_t total_packed_xfb_lds_size;
} ac_nir_prerast_out;
typedef struct {
@ -226,6 +243,9 @@ ac_nir_repack_invocations_in_workgroup(nir_builder *b, nir_def **input_bool,
nir_def *lds_addr_base, unsigned max_num_waves,
unsigned wave_size);
/* Compute the packed output layout from the gathered component masks.
 *
 * Reads components_mask and xfb_lds_components_mask of every entry in infos,
 * infos_16bit_lo and infos_16bit_hi, and writes packed_slot_gs_out_offset and
 * packed_slot_xfb_lds_offset of each entry as a running prefix sum (4 per set
 * component). The final sums are stored in total_packed_gs_out_size and
 * total_packed_xfb_lds_size. For the 16-bit outputs, the lo and hi entries of
 * the same index share one offset and advance it by the union of their masks.
 */
void
ac_nir_compute_prerast_packed_output_info(ac_nir_prerast_out *pr_out);
#ifdef __cplusplus
}
#endif

View file

@ -168,6 +168,22 @@ void ac_nir_gather_prerast_store_output_info(nir_builder *b, nir_intrinsic_instr
type[c] = src_type;
}
}
/* GS stores all outputs in LDS, while VS/TES only store XFB outputs in LDS. */
if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
info->xfb_lds_components_mask |= write_mask << component_offset;
} else {
info->xfb_lds_components_mask |= nir_instr_xfb_write_mask(intrin) & (write_mask << component_offset);
/* For VS, we store edge flags in LDS where the LDS space is shared with XFB, so we need
* to include edge flags in the XFB LDS size even though XFB doesn't use it.
* Only the prim export uses it.
*/
if (b->shader->info.stage == MESA_SHADER_VERTEX && slot == VARYING_SLOT_EDGE) {
assert(write_mask == 0x1);
info->xfb_lds_components_mask |= write_mask;
}
}
}
static nir_intrinsic_instr *
@ -1399,3 +1415,46 @@ ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
.access = ACCESS_NON_TEMPORAL);
}
}
/* Assign a packed offset to every pre-rasterization output.
 *
 * Two independent running offsets are maintained while walking the 32-bit
 * slots (infos) and then the 16-bit slots (infos_16bit_lo/hi) in index order:
 * one advanced by components_mask (GS outputs) and one advanced by
 * xfb_lds_components_mask (XFB LDS). Every slot first records the offsets as
 * they stand before it, so slots with an empty mask still get the offset that
 * follows the previous output. Each set component advances an offset by 4
 * (presumably bytes of one 32-bit component -- confirm against the users).
 * The final offsets become the total packed sizes.
 */
void
ac_nir_compute_prerast_packed_output_info(ac_nir_prerast_out *pr_out)
{
   unsigned gs_cursor = 0;
   unsigned xfb_cursor = 0;
   unsigned slot;

   /* 32-bit outputs: each slot is sized by its own masks. */
   for (slot = 0; slot < ARRAY_SIZE(pr_out->infos); slot++) {
      const unsigned gs_mask = pr_out->infos[slot].components_mask;
      const unsigned xfb_mask = pr_out->infos[slot].xfb_lds_components_mask;

      /* The per-slot offset fields are 12-bit bitfields. */
      assert(gs_cursor < BITFIELD_BIT(12));
      assert(xfb_cursor < BITFIELD_BIT(12));

      pr_out->infos[slot].packed_slot_gs_out_offset = gs_cursor;
      pr_out->infos[slot].packed_slot_xfb_lds_offset = xfb_cursor;

      /* An empty mask has a bit count of 0 and therefore adds nothing. */
      gs_cursor += 4 * util_bitcount(gs_mask);
      xfb_cursor += 4 * util_bitcount(xfb_mask);
   }

   /* 16-bit outputs: the lo and hi halves of one index share a single offset
    * and are sized by the union of their masks.
    */
   for (slot = 0; slot < ARRAY_SIZE(pr_out->infos_16bit_lo); slot++) {
      const unsigned gs_mask = pr_out->infos_16bit_lo[slot].components_mask |
                               pr_out->infos_16bit_hi[slot].components_mask;
      const unsigned xfb_mask = pr_out->infos_16bit_lo[slot].xfb_lds_components_mask |
                                pr_out->infos_16bit_hi[slot].xfb_lds_components_mask;

      assert(gs_cursor < BITFIELD_BIT(12));
      assert(xfb_cursor < BITFIELD_BIT(12));

      pr_out->infos_16bit_lo[slot].packed_slot_gs_out_offset = gs_cursor;
      pr_out->infos_16bit_hi[slot].packed_slot_gs_out_offset = gs_cursor;
      pr_out->infos_16bit_lo[slot].packed_slot_xfb_lds_offset = xfb_cursor;
      pr_out->infos_16bit_hi[slot].packed_slot_xfb_lds_offset = xfb_cursor;

      gs_cursor += 4 * util_bitcount(gs_mask);
      xfb_cursor += 4 * util_bitcount(xfb_mask);
   }

   /* The totals are stored in uint16_t fields. */
   assert(gs_cursor < BITFIELD_BIT(16));
   assert(xfb_cursor < BITFIELD_BIT(16));

   pr_out->total_packed_gs_out_size = gs_cursor;
   pr_out->total_packed_xfb_lds_size = xfb_cursor;
}