diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 227175b2ce8..1bf4be979a3 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -733,13 +733,25 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state) enable_mask = CP_SET_DRAW_STATE__0_BINNING; break; case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM: - case TU_DRAW_STATE_PRIM_MODE_GMEM: enable_mask = CP_SET_DRAW_STATE__0_GMEM; break; + case TU_DRAW_STATE_PRIM_MODE_GMEM: + /* On a7xx the prim mode is the same for gmem and sysmem, and it no + * longer depends on dynamic state, so we reuse the gmem state for + * everything: + */ + if (cs->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) { + enable_mask = CP_SET_DRAW_STATE__0_GMEM | + CP_SET_DRAW_STATE__0_SYSMEM | + CP_SET_DRAW_STATE__0_BINNING; + } else { + enable_mask = CP_SET_DRAW_STATE__0_GMEM; + } + break; case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM: enable_mask = CP_SET_DRAW_STATE__0_SYSMEM; break; - case TU_DRAW_STATE_PRIM_MODE_SYSMEM: + case TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM: /* By also applying the state during binning we ensure that there * is no rotation applied, by previous A6XX_GRAS_SC_CNTL::rotation. 
*/ @@ -3418,7 +3430,6 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, cmd->state.program = pipeline->program; cmd->state.load_state = pipeline->load_state; - cmd->state.prim_order_sysmem = pipeline->prim_order.state_sysmem; cmd->state.prim_order_gmem = pipeline->prim_order.state_gmem; cmd->state.pipeline_sysmem_single_prim_mode = pipeline->prim_order.sysmem_single_prim_mode; cmd->state.pipeline_has_tess = pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; @@ -3447,7 +3458,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { uint32_t mask = pipeline->set_state_mask; - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (11 + util_bitcount(mask))); + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (10 + util_bitcount(mask))); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS, pipeline->program.vs_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_BINNING, pipeline->program.vs_binning_state); @@ -3457,7 +3468,6 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS_BINNING, pipeline->program.gs_binning_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS, pipeline->program.fs_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VPC, pipeline->program.vpc_state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order.state_sysmem); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem); u_foreach_bit(i, mask) @@ -3475,7 +3485,19 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, if (gfx_pipeline->feedback_loops != cmd->state.pipeline_feedback_loops) { cmd->state.pipeline_feedback_loops = gfx_pipeline->feedback_loops; - cmd->state.dirty |= TU_CMD_DIRTY_LRZ; + cmd->state.dirty |= TU_CMD_DIRTY_FEEDBACK_LOOPS | TU_CMD_DIRTY_LRZ; + } + + bool raster_order_attachment_access = + pipeline->output.raster_order_attachment_access || + 
pipeline->ds.raster_order_attachment_access; + if (!cmd->state.raster_order_attachment_access_valid || + raster_order_attachment_access != + cmd->state.raster_order_attachment_access) { + cmd->state.raster_order_attachment_access = + raster_order_attachment_access; + cmd->state.dirty |= TU_CMD_DIRTY_RAST_ORDER; + cmd->state.raster_order_attachment_access_valid = true; } } @@ -4974,7 +4996,9 @@ tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd, struct tu_cs *cs) const struct tu_subpass *subpass = cmd->state.subpass; if ((fs->variant->has_kill || - (cmd->state.pipeline_feedback_loops & VK_IMAGE_ASPECT_DEPTH_BIT)) && + (cmd->state.pipeline_feedback_loops & VK_IMAGE_ASPECT_DEPTH_BIT) || + (cmd->vk.dynamic_graphics_state.feedback_loops & + VK_IMAGE_ASPECT_DEPTH_BIT)) && (depth_write || stencil_write)) { zmode = (cmd->state.lrz.valid && cmd->state.lrz.enabled) ? A6XX_EARLY_LRZ_LATE_Z @@ -5230,7 +5254,9 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty, - MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE); + MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) || + BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE); if (dirty_lrz) { struct tu_cs cs; @@ -5245,6 +5271,17 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, tu6_build_depth_plane_z_mode(cmd, &cs); } + if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) { + if (cmd->vk.dynamic_graphics_state.feedback_loops && + !cmd->state.rp.disable_gmem) { + perf_debug( + cmd->device, + "Disabling gmem due to VK_EXT_attachment_feedback_loop_layout"); + cmd->state.rp.disable_gmem = true; + } + } + if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) { cmd->state.vertex_buffers.size = @@ -5307,7 +5344,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, 
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS_BINNING, program->gs_binning_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS, program->fs_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VPC, program->vpc_state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, cmd->state.prim_order_sysmem); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, cmd->state.prim_order_gmem); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 2516704c469..eb76e5b686b 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -40,7 +40,6 @@ enum tu_draw_state_group_id TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, TU_DRAW_STATE_PRIM_MODE_GMEM, - TU_DRAW_STATE_PRIM_MODE_SYSMEM, /* dynamic state related draw states */ TU_DRAW_STATE_DYNAMIC, @@ -71,8 +70,10 @@ enum tu_cmd_dirty_bits TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9), TU_CMD_DIRTY_TES = BIT(10), TU_CMD_DIRTY_PROGRAM = BIT(11), + TU_CMD_DIRTY_RAST_ORDER = BIT(12), + TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13), /* all draw states were disabled and need to be re-enabled: */ - TU_CMD_DIRTY_DRAW_STATE = BIT(12) + TU_CMD_DIRTY_DRAW_STATE = BIT(14) }; /* There are only three cache domains we have to care about: the CCU, or @@ -441,7 +442,7 @@ struct tu_cmd_state struct tu_draw_state desc_sets; struct tu_draw_state load_state; struct tu_draw_state compute_load_state; - struct tu_draw_state prim_order_sysmem, prim_order_gmem; + struct tu_draw_state prim_order_gmem; struct tu_draw_state vs_params; struct tu_draw_state fs_params; @@ -509,6 +510,8 @@ struct tu_cmd_state bool pipeline_has_tess; bool pipeline_has_gs; bool pipeline_disable_gmem; + bool raster_order_attachment_access; + bool raster_order_attachment_access_valid; VkImageAspectFlags pipeline_feedback_loops; bool 
pipeline_blend_lrz, pipeline_bandwidth; diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index fc05aa57ac0..69fd907b528 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -221,6 +221,7 @@ get_device_extensions(const struct tu_physical_device *device, .KHR_zero_initialize_workgroup_memory = true, .EXT_4444_formats = true, + .EXT_attachment_feedback_loop_dynamic_state = true, .EXT_attachment_feedback_loop_layout = true, .EXT_border_color_swizzle = true, .EXT_color_write_enable = true, @@ -484,6 +485,9 @@ tu_get_features(struct tu_physical_device *pdevice, features->formatA4R4G4B4 = true; features->formatA4B4G4R4 = true; + /* VK_EXT_attachment_feedback_loop_dynamic_state */ + features->attachmentFeedbackLoopDynamicState = true; + /* VK_EXT_attachment_feedback_loop_layout */ features->attachmentFeedbackLoopLayout = true; diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc index e39b3af99d2..3048c9ff725 100644 --- a/src/freedreno/vulkan/tu_pipeline.cc +++ b/src/freedreno/vulkan/tu_pipeline.cc @@ -3218,6 +3218,54 @@ tu6_emit_rb_depth_cntl(struct tu_cs *cs, } } +static const enum mesa_vk_dynamic_graphics_state tu_prim_mode_sysmem_state[] = { + MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE, +}; + +template +static unsigned +tu6_prim_mode_sysmem_size(struct tu_device *dev, + bool raster_order_attachment_access, + VkImageAspectFlags feedback_loops, + bool *sysmem_single_prim_mode) +{ + return 2; +} + +template +static void +tu6_emit_prim_mode_sysmem(struct tu_cs *cs, + bool raster_order_attachment_access, + VkImageAspectFlags feedback_loops, + bool *sysmem_single_prim_mode) +{ + /* VK_EXT_rasterization_order_attachment_access: + * + * This extension allows access to framebuffer attachments when used as both + * input and color attachments from one fragment to the next, in + * rasterization order, without explicit synchronization. 
+ */ + raster_order_attachment_access |= TU_DEBUG(RAST_ORDER); + + /* If there is a feedback loop, then the shader can read the previous value + * of a pixel being written out. It can also write some components and then + * read different components without a barrier in between. This is a + * problem in sysmem mode with UBWC, because the main buffer and flags + * buffer can get out-of-sync if only one is flushed. We fix this by + * setting the SINGLE_PRIM_MODE field to the same value that the blob does + * for advanced_blend in sysmem mode if a feedback loop is detected. + */ + enum a6xx_single_prim_mode sysmem_prim_mode = + (raster_order_attachment_access || feedback_loops) ? + FLUSH_PER_OVERLAP_AND_OVERWRITE : NO_FLUSH; + + if (sysmem_prim_mode == FLUSH_PER_OVERLAP_AND_OVERWRITE) + *sysmem_single_prim_mode = true; + + tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2, + .single_prim_mode = sysmem_prim_mode)); +} + static inline bool emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove, BITSET_WORD *pipeline_set, @@ -3380,6 +3428,26 @@ tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder, pipeline->shaders[MESA_SHADER_TESS_EVAL], &pipeline->program, builder->graphics_state.ts->patch_control_points); + bool has_raster_order_state = false; + if (pipeline->type == TU_PIPELINE_GRAPHICS) { + has_raster_order_state = true; + } else { + struct tu_graphics_lib_pipeline *lib = + tu_pipeline_to_graphics_lib(pipeline); + has_raster_order_state = + (lib->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) && + (lib->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT); + } + if (!builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) { + DRAW_STATE_COND(prim_mode_sysmem, + TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM, + has_raster_order_state, + pipeline->output.raster_order_attachment_access || + pipeline->ds.raster_order_attachment_access, + 
vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags), + &pipeline->prim_order.sysmem_single_prim_mode); + } #undef DRAW_STATE #undef DRAW_STATE_COND #undef EMIT_STATE @@ -3452,7 +3520,7 @@ tu_emit_draw_state(struct tu_cmd_buffer *cmd) emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state, \ ARRAY_SIZE(tu_##name##_state)) #define DRAW_STATE_COND(name, id, extra_cond, ...) \ - if ((EMIT_STATE(name) || extra_cond) && \ + if ((EMIT_STATE(name) || (extra_cond)) && \ !(cmd->state.pipeline_draw_states & (1u << id))) { \ unsigned size = tu6_##name##_size(cmd->device, __VA_ARGS__); \ if (size > 0) { \ @@ -3569,6 +3637,16 @@ tu_emit_draw_state(struct tu_cmd_buffer *cmd) cmd->state.shaders[MESA_SHADER_TESS_EVAL], &cmd->state.program, cmd->vk.dynamic_graphics_state.ts.patch_control_points); + if (!cmd->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) { + DRAW_STATE_COND(prim_mode_sysmem, + TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM, + cmd->state.dirty & (TU_CMD_DIRTY_RAST_ORDER | + TU_CMD_DIRTY_FEEDBACK_LOOPS), + cmd->state.raster_order_attachment_access, + cmd->vk.dynamic_graphics_state.feedback_loops | + cmd->state.pipeline_feedback_loops, + &cmd->state.rp.sysmem_single_prim_mode); + } #undef DRAW_STATE #undef DRAW_STATE_COND #undef EMIT_STATE @@ -3651,7 +3729,6 @@ tu_pipeline_builder_parse_rasterization_order( * when implemented in the future. */ - enum a6xx_single_prim_mode sysmem_prim_mode = NO_FLUSH; enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH; if (raster_order_attachment_access) { @@ -3661,27 +3738,7 @@ tu_pipeline_builder_parse_rasterization_order( * both input and color attachments from one fragment to the next, * in rasterization order, without explicit synchronization. 
*/ - if (builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) - sysmem_prim_mode = FLUSH_PER_OVERLAP; - else - sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE; gmem_prim_mode = FLUSH_PER_OVERLAP; - pipeline->prim_order.sysmem_single_prim_mode = true; - } else if (!builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) { - /* If there is a feedback loop, then the shader can read the previous value - * of a pixel being written out. It can also write some components and then - * read different components without a barrier in between. This is a - * problem in sysmem mode with UBWC, because the main buffer and flags - * buffer can get out-of-sync if only one is flushed. We fix this by - * setting the SINGLE_PRIM_MODE field to the same value that the blob does - * for advanced_blend in sysmem mode if a feedback loop is detected. - */ - if (builder->graphics_state.pipeline_flags & - (VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT | - VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)) { - sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE; - pipeline->prim_order.sysmem_single_prim_mode = true; - } } struct tu_cs cs; @@ -3690,11 +3747,6 @@ tu_pipeline_builder_parse_rasterization_order( tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) | A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode)); - - pipeline->prim_order.state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2); - tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL, - A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) | - A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode)); } static void diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h index cf49249acdf..2a79e82e744 100644 --- a/src/freedreno/vulkan/tu_pipeline.h +++ b/src/freedreno/vulkan/tu_pipeline.h @@ -31,6 +31,7 @@ enum tu_dynamic_state TU_DYNAMIC_STATE_BLEND, TU_DYNAMIC_STATE_VERTEX_INPUT, 
TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS, + TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM, TU_DYNAMIC_STATE_COUNT, }; @@ -153,7 +154,7 @@ struct tu_pipeline struct { /* If the pipeline sets SINGLE_PRIM_MODE for sysmem. */ bool sysmem_single_prim_mode; - struct tu_draw_state state_sysmem, state_gmem; + struct tu_draw_state state_gmem; } prim_order; /* draw states for the pipeline */