mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 11:48:06 +02:00
ac/nir: add LDS layout info for GSVS and XFB to ac_nir_prerast_per_output_info
This will be used to reduce the NGG LDS size for uncompacted GS and XFB outputs.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35351>
This commit is contained in:
parent
39a9dce5fc
commit
ebdd97a993
2 changed files with 79 additions and 0 deletions
|
|
@ -56,6 +56,19 @@ typedef struct
|
|||
uint8_t as_varying_mask : 4;
|
||||
/* Bitmask of components that are used as sysval, 1 bit per component. */
|
||||
uint8_t as_sysval_mask : 4;
|
||||
/* Prefix sum over all component masks. Used by the GS outputs in LDS for NGG GS.
|
||||
* This is set even if components_mask is 0, in which case it's the offset after the last output.
|
||||
*/
|
||||
uint16_t packed_slot_gs_out_offset : 12;
|
||||
/* Prefix sum over all component masks. Used by XFB outputs in LDS for NGG VS and TES.
|
||||
* This is set even if xfb_components_mask is 0, in which case it's the offset after the last output.
|
||||
* For NGG GS, it's equal to packed_slot_gs_out_offset because NGG GS has all outputs in LDS.
|
||||
*/
|
||||
uint16_t packed_slot_xfb_lds_offset : 12;
|
||||
/* Bitmask of components written by XFB: 4 bits per slot, 1 bit per component.
|
||||
* For NGG GS, it's equal to components_mask because NGG GS has all outputs in LDS.
|
||||
*/
|
||||
uint8_t xfb_lds_components_mask : 4;
|
||||
} ac_nir_prerast_per_output_info;
|
||||
|
||||
typedef struct
|
||||
|
|
@ -71,6 +84,10 @@ typedef struct
|
|||
ac_nir_prerast_per_output_info infos[VARYING_SLOT_MAX];
|
||||
ac_nir_prerast_per_output_info infos_16bit_lo[16];
|
||||
ac_nir_prerast_per_output_info infos_16bit_hi[16];
|
||||
|
||||
/* The size of all components, packed. */
|
||||
uint16_t total_packed_gs_out_size;
|
||||
uint16_t total_packed_xfb_lds_size;
|
||||
} ac_nir_prerast_out;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -226,6 +243,9 @@ ac_nir_repack_invocations_in_workgroup(nir_builder *b, nir_def **input_bool,
|
|||
nir_def *lds_addr_base, unsigned max_num_waves,
|
||||
unsigned wave_size);
|
||||
|
||||
void
|
||||
ac_nir_compute_prerast_packed_output_info(ac_nir_prerast_out *pr_out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -168,6 +168,22 @@ void ac_nir_gather_prerast_store_output_info(nir_builder *b, nir_intrinsic_instr
|
|||
type[c] = src_type;
|
||||
}
|
||||
}
|
||||
|
||||
/* GS stores all outputs in LDS, while VS/TES only store XFB outputs in LDS. */
|
||||
if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
|
||||
info->xfb_lds_components_mask |= write_mask << component_offset;
|
||||
} else {
|
||||
info->xfb_lds_components_mask |= nir_instr_xfb_write_mask(intrin) & (write_mask << component_offset);
|
||||
|
||||
/* For VS, we store edge flags in LDS where the LDS space is shared with XFB, so we need
|
||||
* to include edge flags in the XFB LDS size even though XFB doesn't use it.
|
||||
* Only the prim export uses it.
|
||||
*/
|
||||
if (b->shader->info.stage == MESA_SHADER_VERTEX && slot == VARYING_SLOT_EDGE) {
|
||||
assert(write_mask == 0x1);
|
||||
info->xfb_lds_components_mask |= write_mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static nir_intrinsic_instr *
|
||||
|
|
@ -1399,3 +1415,46 @@ ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
|
|||
.access = ACCESS_NON_TEMPORAL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Determine optimal output packing based on component masks, and set packed offsets. */
|
||||
void
|
||||
ac_nir_compute_prerast_packed_output_info(ac_nir_prerast_out *pr_out)
|
||||
{
|
||||
unsigned gs_out_offset = 0;
|
||||
unsigned xfb_lds_offset = 0;
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(pr_out->infos); i++) {
|
||||
assert(gs_out_offset < BITFIELD_BIT(12));
|
||||
assert(xfb_lds_offset < BITFIELD_BIT(12));
|
||||
pr_out->infos[i].packed_slot_gs_out_offset = gs_out_offset;
|
||||
pr_out->infos[i].packed_slot_xfb_lds_offset = xfb_lds_offset;
|
||||
|
||||
if (pr_out->infos[i].components_mask)
|
||||
gs_out_offset += util_bitcount(pr_out->infos[i].components_mask) * 4;
|
||||
if (pr_out->infos[i].xfb_lds_components_mask)
|
||||
xfb_lds_offset += util_bitcount(pr_out->infos[i].xfb_lds_components_mask) * 4;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(pr_out->infos_16bit_lo); i++) {
|
||||
unsigned component_mask = pr_out->infos_16bit_lo[i].components_mask |
|
||||
pr_out->infos_16bit_hi[i].components_mask;
|
||||
unsigned xfb_component_mask = pr_out->infos_16bit_lo[i].xfb_lds_components_mask |
|
||||
pr_out->infos_16bit_hi[i].xfb_lds_components_mask;
|
||||
assert(gs_out_offset < BITFIELD_BIT(12));
|
||||
assert(xfb_lds_offset < BITFIELD_BIT(12));
|
||||
pr_out->infos_16bit_lo[i].packed_slot_gs_out_offset = gs_out_offset;
|
||||
pr_out->infos_16bit_hi[i].packed_slot_gs_out_offset = gs_out_offset;
|
||||
pr_out->infos_16bit_lo[i].packed_slot_xfb_lds_offset = xfb_lds_offset;
|
||||
pr_out->infos_16bit_hi[i].packed_slot_xfb_lds_offset = xfb_lds_offset;
|
||||
|
||||
if (component_mask)
|
||||
gs_out_offset += util_bitcount(component_mask) * 4;
|
||||
if (xfb_component_mask)
|
||||
xfb_lds_offset += util_bitcount(xfb_component_mask) * 4;
|
||||
}
|
||||
|
||||
assert(gs_out_offset < BITFIELD_BIT(16));
|
||||
assert(xfb_lds_offset < BITFIELD_BIT(16));
|
||||
pr_out->total_packed_gs_out_size = gs_out_offset;
|
||||
pr_out->total_packed_xfb_lds_size = xfb_lds_offset;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue