tu: Support VK_EXT_attachment_feedback_loop_dynamic_state

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23374>
This commit is contained in:
Connor Abbott 2023-05-24 23:09:25 +02:00 committed by Marge Bot
parent 833a0cf76e
commit 0e220cd45a
5 changed files with 136 additions and 40 deletions

View file

@ -733,13 +733,25 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
enable_mask = CP_SET_DRAW_STATE__0_BINNING;
break;
case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
case TU_DRAW_STATE_PRIM_MODE_GMEM:
enable_mask = CP_SET_DRAW_STATE__0_GMEM;
break;
case TU_DRAW_STATE_PRIM_MODE_GMEM:
/* On a7xx the prim mode is the same for gmem and sysmem, and it no
* longer depends on dynamic state, so we reuse the gmem state for
* everything:
*/
if (cs->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
enable_mask = CP_SET_DRAW_STATE__0_GMEM |
CP_SET_DRAW_STATE__0_SYSMEM |
CP_SET_DRAW_STATE__0_BINNING;
} else {
enable_mask = CP_SET_DRAW_STATE__0_GMEM;
}
break;
case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
break;
case TU_DRAW_STATE_PRIM_MODE_SYSMEM:
case TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM:
/* By also applying the state during binning we ensure that there
* is no rotation applied, by previous A6XX_GRAS_SC_CNTL::rotation.
*/
@ -3418,7 +3430,6 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
cmd->state.program = pipeline->program;
cmd->state.load_state = pipeline->load_state;
cmd->state.prim_order_sysmem = pipeline->prim_order.state_sysmem;
cmd->state.prim_order_gmem = pipeline->prim_order.state_gmem;
cmd->state.pipeline_sysmem_single_prim_mode = pipeline->prim_order.sysmem_single_prim_mode;
cmd->state.pipeline_has_tess = pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
@ -3447,7 +3458,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
uint32_t mask = pipeline->set_state_mask;
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (11 + util_bitcount(mask)));
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (10 + util_bitcount(mask)));
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS, pipeline->program.vs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_BINNING, pipeline->program.vs_binning_state);
@ -3457,7 +3468,6 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS_BINNING, pipeline->program.gs_binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS, pipeline->program.fs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VPC, pipeline->program.vpc_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order.state_sysmem);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem);
u_foreach_bit(i, mask)
@ -3475,7 +3485,19 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
if (gfx_pipeline->feedback_loops != cmd->state.pipeline_feedback_loops) {
cmd->state.pipeline_feedback_loops = gfx_pipeline->feedback_loops;
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
cmd->state.dirty |= TU_CMD_DIRTY_FEEDBACK_LOOPS | TU_CMD_DIRTY_LRZ;
}
bool raster_order_attachment_access =
pipeline->output.raster_order_attachment_access ||
pipeline->ds.raster_order_attachment_access;
if (!cmd->state.raster_order_attachment_access_valid ||
raster_order_attachment_access !=
cmd->state.raster_order_attachment_access) {
cmd->state.raster_order_attachment_access =
raster_order_attachment_access;
cmd->state.dirty |= TU_CMD_DIRTY_RAST_ORDER;
cmd->state.raster_order_attachment_access_valid = true;
}
}
@ -4974,7 +4996,9 @@ tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
const struct tu_subpass *subpass = cmd->state.subpass;
if ((fs->variant->has_kill ||
(cmd->state.pipeline_feedback_loops & VK_IMAGE_ASPECT_DEPTH_BIT)) &&
(cmd->state.pipeline_feedback_loops & VK_IMAGE_ASPECT_DEPTH_BIT) ||
(cmd->vk.dynamic_graphics_state.feedback_loops &
VK_IMAGE_ASPECT_DEPTH_BIT)) &&
(depth_write || stencil_write)) {
zmode = (cmd->state.lrz.valid && cmd->state.lrz.enabled)
? A6XX_EARLY_LRZ_LATE_Z
@ -5230,7 +5254,9 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE);
MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE);
if (dirty_lrz) {
struct tu_cs cs;
@ -5245,6 +5271,17 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tu6_build_depth_plane_z_mode(cmd, &cs);
}
if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) {
if (cmd->vk.dynamic_graphics_state.feedback_loops &&
!cmd->state.rp.disable_gmem) {
perf_debug(
cmd->device,
"Disabling gmem due to VK_EXT_attachment_feedback_loop_layout");
cmd->state.rp.disable_gmem = true;
}
}
if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
cmd->state.vertex_buffers.size =
@ -5307,7 +5344,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS_BINNING, program->gs_binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS, program->fs_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VPC, program->vpc_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, cmd->state.prim_order_sysmem);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, cmd->state.prim_order_gmem);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);

View file

@ -40,7 +40,6 @@ enum tu_draw_state_group_id
TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
TU_DRAW_STATE_PRIM_MODE_GMEM,
TU_DRAW_STATE_PRIM_MODE_SYSMEM,
/* dynamic state related draw states */
TU_DRAW_STATE_DYNAMIC,
@ -71,8 +70,10 @@ enum tu_cmd_dirty_bits
TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
TU_CMD_DIRTY_TES = BIT(10),
TU_CMD_DIRTY_PROGRAM = BIT(11),
TU_CMD_DIRTY_RAST_ORDER = BIT(12),
TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13),
/* all draw states were disabled and need to be re-enabled: */
TU_CMD_DIRTY_DRAW_STATE = BIT(12)
TU_CMD_DIRTY_DRAW_STATE = BIT(14)
};
/* There are only three cache domains we have to care about: the CCU, or
@ -441,7 +442,7 @@ struct tu_cmd_state
struct tu_draw_state desc_sets;
struct tu_draw_state load_state;
struct tu_draw_state compute_load_state;
struct tu_draw_state prim_order_sysmem, prim_order_gmem;
struct tu_draw_state prim_order_gmem;
struct tu_draw_state vs_params;
struct tu_draw_state fs_params;
@ -509,6 +510,8 @@ struct tu_cmd_state
bool pipeline_has_tess;
bool pipeline_has_gs;
bool pipeline_disable_gmem;
bool raster_order_attachment_access;
bool raster_order_attachment_access_valid;
VkImageAspectFlags pipeline_feedback_loops;
bool pipeline_blend_lrz, pipeline_bandwidth;

View file

@ -221,6 +221,7 @@ get_device_extensions(const struct tu_physical_device *device,
.KHR_zero_initialize_workgroup_memory = true,
.EXT_4444_formats = true,
.EXT_attachment_feedback_loop_dynamic_state = true,
.EXT_attachment_feedback_loop_layout = true,
.EXT_border_color_swizzle = true,
.EXT_color_write_enable = true,
@ -484,6 +485,9 @@ tu_get_features(struct tu_physical_device *pdevice,
features->formatA4R4G4B4 = true;
features->formatA4B4G4R4 = true;
/* VK_EXT_attachment_feedback_loop_dynamic_state */
features->attachmentFeedbackLoopDynamicState = true;
/* VK_EXT_attachment_feedback_loop_layout */
features->attachmentFeedbackLoopLayout = true;

View file

@ -3218,6 +3218,54 @@ tu6_emit_rb_depth_cntl(struct tu_cs *cs,
}
}
/* Dynamic graphics state that requires re-emitting the sysmem PRIM_MODE
 * draw state: the per-draw feedback-loop enable introduced by
 * VK_EXT_attachment_feedback_loop_dynamic_state.
 */
static const enum mesa_vk_dynamic_graphics_state tu_prim_mode_sysmem_state[] = {
   MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE,
};
/* Size in dwords of the draw state emitted by tu6_emit_prim_mode_sysmem():
 * a single register write of A6XX_GRAS_SC_CNTL (one packet header dword plus
 * one value dword).  The otherwise-unused parameters mirror
 * tu6_emit_prim_mode_sysmem() so both functions can be driven by the same
 * DRAW_STATE_COND() macro invocation, which forwards identical arguments to
 * the *_size and *_emit callbacks.
 */
template <chip CHIP>
static unsigned
tu6_prim_mode_sysmem_size(struct tu_device *dev,
                          bool raster_order_attachment_access,
                          VkImageAspectFlags feedback_loops,
                          bool *sysmem_single_prim_mode)
{
   return 2;
}
/* Emit the sysmem-mode A6XX_GRAS_SC_CNTL draw state.  SINGLE_PRIM_MODE is
 * set to FLUSH_PER_OVERLAP_AND_OVERWRITE when either rasterization-order
 * attachment access or an attachment feedback loop requires flushing
 * between overlapping primitives, and NO_FLUSH otherwise.
 *
 * *sysmem_single_prim_mode is an out-parameter that is only ever set to
 * true here (never cleared), recording that the flush mode was enabled so
 * the caller can act on it later.
 */
template <chip CHIP>
static void
tu6_emit_prim_mode_sysmem(struct tu_cs *cs,
                          bool raster_order_attachment_access,
                          VkImageAspectFlags feedback_loops,
                          bool *sysmem_single_prim_mode)
{
   /* VK_EXT_rasterization_order_attachment_access:
    *
    * This extension allows access to framebuffer attachments when used as
    * both input and color attachments from one fragment to the next, in
    * rasterization order, without explicit synchronization.
    *
    * The TU_DEBUG(RAST_ORDER) debug flag forces this behavior on for all
    * draws.
    */
   raster_order_attachment_access |= TU_DEBUG(RAST_ORDER);

   /* If there is a feedback loop, then the shader can read the previous value
    * of a pixel being written out. It can also write some components and then
    * read different components without a barrier in between. This is a
    * problem in sysmem mode with UBWC, because the main buffer and flags
    * buffer can get out-of-sync if only one is flushed. We fix this by
    * setting the SINGLE_PRIM_MODE field to the same value that the blob does
    * for advanced_blend in sysmem mode if a feedback loop is detected.
    */
   enum a6xx_single_prim_mode sysmem_prim_mode =
      (raster_order_attachment_access || feedback_loops) ?
      FLUSH_PER_OVERLAP_AND_OVERWRITE : NO_FLUSH;

   if (sysmem_prim_mode == FLUSH_PER_OVERLAP_AND_OVERWRITE)
      *sysmem_single_prim_mode = true;

   tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2,
                                         .single_prim_mode = sysmem_prim_mode));
}
static inline bool
emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove,
BITSET_WORD *pipeline_set,
@ -3380,6 +3428,26 @@ tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder,
pipeline->shaders[MESA_SHADER_TESS_EVAL],
&pipeline->program,
builder->graphics_state.ts->patch_control_points);
bool has_raster_order_state = false;
if (pipeline->type == TU_PIPELINE_GRAPHICS) {
has_raster_order_state = true;
} else {
struct tu_graphics_lib_pipeline *lib =
tu_pipeline_to_graphics_lib(pipeline);
has_raster_order_state =
(lib->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
(lib->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT);
}
if (!builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
DRAW_STATE_COND(prim_mode_sysmem,
TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
has_raster_order_state,
pipeline->output.raster_order_attachment_access ||
pipeline->ds.raster_order_attachment_access,
vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags),
&pipeline->prim_order.sysmem_single_prim_mode);
}
#undef DRAW_STATE
#undef DRAW_STATE_COND
#undef EMIT_STATE
@ -3452,7 +3520,7 @@ tu_emit_draw_state(struct tu_cmd_buffer *cmd)
emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state, \
ARRAY_SIZE(tu_##name##_state))
#define DRAW_STATE_COND(name, id, extra_cond, ...) \
if ((EMIT_STATE(name) || extra_cond) && \
if ((EMIT_STATE(name) || (extra_cond)) && \
!(cmd->state.pipeline_draw_states & (1u << id))) { \
unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__); \
if (size > 0) { \
@ -3569,6 +3637,16 @@ tu_emit_draw_state(struct tu_cmd_buffer *cmd)
cmd->state.shaders[MESA_SHADER_TESS_EVAL],
&cmd->state.program,
cmd->vk.dynamic_graphics_state.ts.patch_control_points);
if (!cmd->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
DRAW_STATE_COND(prim_mode_sysmem,
TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
cmd->state.dirty & (TU_CMD_DIRTY_RAST_ORDER |
TU_CMD_DIRTY_FEEDBACK_LOOPS),
cmd->state.raster_order_attachment_access,
cmd->vk.dynamic_graphics_state.feedback_loops |
cmd->state.pipeline_feedback_loops,
&cmd->state.rp.sysmem_single_prim_mode);
}
#undef DRAW_STATE
#undef DRAW_STATE_COND
#undef EMIT_STATE
@ -3651,7 +3729,6 @@ tu_pipeline_builder_parse_rasterization_order(
* when implemented in the future.
*/
enum a6xx_single_prim_mode sysmem_prim_mode = NO_FLUSH;
enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH;
if (raster_order_attachment_access) {
@ -3661,27 +3738,7 @@ tu_pipeline_builder_parse_rasterization_order(
* both input and color attachments from one fragment to the next,
* in rasterization order, without explicit synchronization.
*/
if (builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches)
sysmem_prim_mode = FLUSH_PER_OVERLAP;
else
sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
gmem_prim_mode = FLUSH_PER_OVERLAP;
pipeline->prim_order.sysmem_single_prim_mode = true;
} else if (!builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
/* If there is a feedback loop, then the shader can read the previous value
* of a pixel being written out. It can also write some components and then
* read different components without a barrier in between. This is a
* problem in sysmem mode with UBWC, because the main buffer and flags
* buffer can get out-of-sync if only one is flushed. We fix this by
* setting the SINGLE_PRIM_MODE field to the same value that the blob does
* for advanced_blend in sysmem mode if a feedback loop is detected.
*/
if (builder->graphics_state.pipeline_flags &
(VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT |
VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)) {
sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
pipeline->prim_order.sysmem_single_prim_mode = true;
}
}
struct tu_cs cs;
@ -3690,11 +3747,6 @@ tu_pipeline_builder_parse_rasterization_order(
tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
pipeline->prim_order.state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode));
}
static void

View file

@ -31,6 +31,7 @@ enum tu_dynamic_state
TU_DYNAMIC_STATE_BLEND,
TU_DYNAMIC_STATE_VERTEX_INPUT,
TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
TU_DYNAMIC_STATE_COUNT,
};
@ -153,7 +154,7 @@ struct tu_pipeline
struct {
/* If the pipeline sets SINGLE_PRIM_MODE for sysmem. */
bool sysmem_single_prim_mode;
struct tu_draw_state state_sysmem, state_gmem;
struct tu_draw_state state_gmem;
} prim_order;
/* draw states for the pipeline */