tu: Split program draw state into per-shader states

This completely rewrites how uploading variants and emitting the program
state works, which will make fast linking significantly faster and lays
some of the groundwork for VK_EXT_shader_object. Variants are now
compiled and uploaded as part of creating a tu_shader, and a
per-stage draw state is also created that contains all registers that
are only set based on one stage. The program state is split into
per-stage states, which come from the shaders, and VPC state which is
emitted at linking time. Pipelines now contain shaders, and importing
shaders from libraries when fast-linking just involves taking a
reference on the shader. While the command buffer code still uses the
old pipeline structures, the plan is to switch more and more things to
directly use the shaders or derived state from the shaders that gets set
by the pipeline, so that we can eliminate pipeline usage from
tu_cmd_buffer.cc to enable ESO.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25076>
This commit is contained in:
Connor Abbott 2023-05-17 16:07:45 +02:00 committed by Marge Bot
parent a80f026073
commit a03525d8db
8 changed files with 1611 additions and 1522 deletions

View file

@ -831,8 +831,8 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);
struct tu_pvtmem_config pvtmem = {};
tu6_emit_xs<CHIP>(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
tu6_emit_xs<CHIP>(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());

View file

@ -633,7 +633,9 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
{
uint32_t enable_mask;
switch (id) {
case TU_DRAW_STATE_PROGRAM:
case TU_DRAW_STATE_VS:
case TU_DRAW_STATE_FS:
case TU_DRAW_STATE_VPC:
/* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
* when resources would actually be used in the binning shader.
* Presumably the overhead of prefetching the resources isn't
@ -643,7 +645,8 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
enable_mask = CP_SET_DRAW_STATE__0_GMEM |
CP_SET_DRAW_STATE__0_SYSMEM;
break;
case TU_DRAW_STATE_PROGRAM_BINNING:
case TU_DRAW_STATE_VS_BINNING:
case TU_DRAW_STATE_GS_BINNING:
enable_mask = CP_SET_DRAW_STATE__0_BINNING;
break;
case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
@ -679,11 +682,25 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
* the firmware would ignore and we wouldn't pre-load the new
* descriptors. Set the DIRTY bit to avoid this optimization.
*
* We set the dirty bit for shader draw states because they contain
* CP_LOAD_STATE packets that are invalidated by the PROGRAM_CONFIG draw
* state, so if PROGRAM_CONFIG changes but one of the shaders stays the
* same then we still need to re-emit everything. The GLES blob which
* implements separate shader draw states does the same thing.
*
* We also need to set this bit for draw states which may be patched by the
* GPU, because their underlying memory may change between setting the draw
* state.
*/
if (id == TU_DRAW_STATE_DESC_SETS_LOAD || state.writeable)
if (id == TU_DRAW_STATE_DESC_SETS_LOAD ||
id == TU_DRAW_STATE_VS ||
id == TU_DRAW_STATE_VS_BINNING ||
id == TU_DRAW_STATE_HS ||
id == TU_DRAW_STATE_DS ||
id == TU_DRAW_STATE_GS ||
id == TU_DRAW_STATE_GS_BINNING ||
id == TU_DRAW_STATE_FS ||
state.writeable)
enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;
tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
@ -2940,7 +2957,8 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
cmd->state.compute_pipeline = tu_pipeline_to_compute(pipeline);
tu_cs_emit_state_ib(&cmd->cs, pipeline->program.state);
tu_cs_emit_state_ib(&cmd->cs,
pipeline->shaders[MESA_SHADER_COMPUTE]->state);
return;
}
@ -3002,10 +3020,16 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
uint32_t mask = pipeline->set_state_mask;
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (5 + util_bitcount(mask)));
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (11 + util_bitcount(mask)));
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS, pipeline->program.vs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_BINNING, pipeline->program.vs_binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_HS, pipeline->program.hs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DS, pipeline->program.ds_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS, pipeline->program.gs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS_BINNING, pipeline->program.gs_binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS, pipeline->program.fs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VPC, pipeline->program.vpc_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order.state_sysmem);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem);
@ -4509,7 +4533,7 @@ tu6_emit_fs_params(struct tu_cmd_buffer *cmd)
tu_cs_emit(&cs, 0);
STATIC_ASSERT(IR3_DP_FS_FRAG_INVOCATION_COUNT == IR3_DP_FS_DYNAMIC);
tu_cs_emit(&cs, pipeline->base.program.per_samp ?
tu_cs_emit(&cs, pipeline->base.fs.per_samp ?
cmd->vk.dynamic_graphics_state.ms.rasterization_samples : 1);
tu_cs_emit(&cs, 0);
tu_cs_emit(&cs, 0);
@ -4699,8 +4723,14 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS, pipeline->program.vs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_BINNING, pipeline->program.vs_binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_HS, pipeline->program.hs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DS, pipeline->program.ds_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS, pipeline->program.gs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS_BINNING, pipeline->program.gs_binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS, pipeline->program.fs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VPC, pipeline->program.vpc_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order.state_sysmem);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
@ -5377,7 +5407,8 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
* could be overwritten by reg stomping in a renderpass or blit.
*/
if (cmd->device->dbg_renderpass_stomp_cs) {
tu_cs_emit_state_ib(&cmd->cs, cmd->state.compute_pipeline->base.program.state);
tu_cs_emit_state_ib(&cmd->cs,
cmd->state.compute_pipeline->base.shaders[MESA_SHADER_COMPUTE]->state);
}
/* There appears to be a HW bug where in some rare circumstances it appears

View file

@ -22,8 +22,14 @@
enum tu_draw_state_group_id
{
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_PROGRAM,
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_VS,
TU_DRAW_STATE_VS_BINNING,
TU_DRAW_STATE_HS,
TU_DRAW_STATE_DS,
TU_DRAW_STATE_GS,
TU_DRAW_STATE_GS_BINNING,
TU_DRAW_STATE_VPC,
TU_DRAW_STATE_FS,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_CONST,
TU_DRAW_STATE_DESC_SETS,

View file

@ -579,7 +579,7 @@ tu_get_features(struct tu_physical_device *pdevice,
}
static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {
&tu_shaders_ops,
&tu_shader_ops,
&tu_nir_shaders_ops,
NULL,
};

File diff suppressed because it is too large Load diff

View file

@ -58,18 +58,6 @@ struct tu_bandwidth
bool valid;
};
struct tu_compiled_shaders
{
struct vk_pipeline_cache_object base;
struct tu_const_state const_state[MESA_SHADER_STAGES];
uint8_t active_desc_sets;
const struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
const struct ir3_shader_variant *safe_const_variants[MESA_SHADER_STAGES];
};
struct tu_nir_shaders
{
struct vk_pipeline_cache_object base;
@ -80,7 +68,6 @@ struct tu_nir_shaders
nir_shader *nir[MESA_SHADER_STAGES];
};
extern const struct vk_pipeline_cache_object_ops tu_shaders_ops;
extern const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops;
static bool inline
@ -125,9 +112,6 @@ struct tu_pipeline
struct tu_cs cs;
struct tu_suballoc_bo bo;
/* Separate BO for private memory since it should be GPU writable */
struct tu_bo *pvtmem_bo;
VkShaderStageFlags active_stages;
uint32_t active_desc_sets;
@ -169,21 +153,30 @@ struct tu_pipeline
struct tu_push_constant_range shared_consts;
struct tu_shader *shaders[MESA_SHADER_STAGES];
struct {
bool per_samp;
} fs;
struct
{
struct tu_draw_state config_state;
struct tu_draw_state state;
struct tu_draw_state binning_state;
struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
struct tu_draw_state vs_state, vs_binning_state;
struct tu_draw_state hs_state;
struct tu_draw_state ds_state;
struct tu_draw_state gs_state, gs_binning_state;
struct tu_draw_state vpc_state;
struct tu_draw_state fs_state;
uint32_t vs_param_stride;
uint32_t hs_param_stride;
uint32_t hs_param_dwords;
uint32_t hs_vertices_out;
struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
bool per_view_viewport;
bool per_samp;
enum a6xx_tess_output tess_output_upper_left, tess_output_lower_left;
enum a6xx_tess_spacing tess_spacing;
@ -207,18 +200,10 @@ struct tu_graphics_lib_pipeline {
/* For vk_graphics_pipeline_state */
void *state_data;
/* compiled_shaders only contains variants compiled by this pipeline, and
* it owns them, so when it is freed they disappear. Similarly,
* nir_shaders owns the link-time NIR. shaders points to the shaders from
* this pipeline and all libraries included in it, for convenience.
*/
struct tu_compiled_shaders *compiled_shaders;
struct tu_nir_shaders *nir_shaders;
struct {
nir_shader *nir;
struct tu_shader_key key;
struct tu_const_state const_state;
const struct ir3_shader_variant *variant, *safe_const_variant;
} shaders[MESA_SHADER_FRAGMENT + 1];
struct ir3_shader_key ir3_key;
@ -288,32 +273,7 @@ tu6_emit_xs_config(struct tu_cs *cs,
template <chip CHIP>
void
tu6_emit_xs(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova);
template <chip CHIP>
void
tu6_emit_vs(struct tu_cs *cs, const struct ir3_shader_variant *vs,
uint32_t view_mask);
template <chip CHIP>
void
tu6_emit_hs(struct tu_cs *cs, const struct ir3_shader_variant *hs);
template <chip CHIP>
void
tu6_emit_ds(struct tu_cs *cs, const struct ir3_shader_variant *hs);
template <chip CHIP>
void
tu6_emit_gs(struct tu_cs *cs, const struct ir3_shader_variant *hs);
template <chip CHIP>
void
tu6_emit_fs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
tu6_emit_shared_consts_enable(struct tu_cs *cs, bool shared_consts_enable);
template <chip CHIP>
void

File diff suppressed because it is too large Load diff

View file

@ -11,6 +11,8 @@
#define TU_SHADER_H
#include "tu_common.h"
#include "tu_cs.h"
#include "tu_suballoc.h"
struct tu_inline_ubo
{
@ -44,10 +46,21 @@ struct tu_const_state
struct tu_shader
{
struct ir3_shader *ir3_shader;
struct vk_pipeline_cache_object base;
const struct ir3_shader_variant *variant;
const struct ir3_shader_variant *safe_const_variant;
struct tu_suballoc_bo bo;
struct tu_cs cs;
struct tu_bo *pvtmem_bo;
struct tu_draw_state state;
struct tu_draw_state safe_const_state;
struct tu_draw_state binning_state;
struct tu_const_state const_state;
unsigned reserved_user_consts_vec4;
uint32_t view_mask;
uint8_t active_desc_sets;
};
@ -59,6 +72,7 @@ struct tu_shader_key {
enum ir3_wavesize_option api_wavesize, real_wavesize;
};
extern const struct vk_pipeline_cache_object_ops tu_shader_ops;
bool
tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, struct tu_device *dev);
@ -68,16 +82,52 @@ tu_spirv_to_nir(struct tu_device *dev,
const VkPipelineShaderStageCreateInfo *stage_info,
gl_shader_stage stage);
struct tu_shader *
void
tu6_emit_xs(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova);
template <chip CHIP>
void
tu6_emit_vs(struct tu_cs *cs, const struct ir3_shader_variant *vs,
uint32_t view_mask);
template <chip CHIP>
void
tu6_emit_hs(struct tu_cs *cs, const struct ir3_shader_variant *hs);
template <chip CHIP>
void
tu6_emit_ds(struct tu_cs *cs, const struct ir3_shader_variant *hs);
template <chip CHIP>
void
tu6_emit_gs(struct tu_cs *cs, const struct ir3_shader_variant *hs);
template <chip CHIP>
void
tu6_emit_fs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
VkResult
tu_shader_create(struct tu_device *dev,
struct tu_shader **shader_out,
nir_shader *nir,
const struct tu_shader_key *key,
const struct ir3_shader_key *ir3_key,
const void *key_data,
size_t key_size,
struct tu_pipeline_layout *layout,
const VkAllocationCallbacks *alloc);
bool executable_info);
VkResult
tu_empty_shader_create(struct tu_device *device,
struct tu_shader **shader_out,
gl_shader_stage stage);
void
tu_shader_destroy(struct tu_device *dev,
struct tu_shader *shader,
const VkAllocationCallbacks *alloc);
struct tu_shader *shader);
#endif /* TU_SHADER_H */