From fec372dfa502f588e894c19e9c2a8ba2c73d6c9a Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Mon, 21 Nov 2022 14:52:44 +0100
Subject: [PATCH] tu: Implement FDM viewport patching

We scale the actual rendering by patching the viewport state. This is
helped by a HW bit to make the viewport index equal to the view index,
so that we can have a different scaling per view.

Part-of: 
---
 src/freedreno/vulkan/tu_cmd_buffer.cc | 236 +++++++++++++++++++++++++-
 src/freedreno/vulkan/tu_cmd_buffer.h  |   2 +
 src/freedreno/vulkan/tu_pipeline.cc   |  70 +++++++-
 src/freedreno/vulkan/tu_pipeline.h    |   5 +
 4 files changed, 306 insertions(+), 7 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index b4a357e8b3f..adb68195e26 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -859,6 +859,21 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
          frag_areas[i].height--;
       }
 
+      /* If at any point we were forced to use the same scaling for all
+       * viewports, we need to make sure that any users *not* using shared
+       * scaling, including loads/stores, also consistently share the scaling.
+       */
+      if (cmd->state.rp.shared_viewport) {
+         VkExtent2D frag_area = { UINT32_MAX, UINT32_MAX };
+         for (unsigned i = 0; i < views; i++) {
+            frag_area.width = MIN2(frag_area.width, frag_areas[i].width);
+            frag_area.height = MIN2(frag_area.height, frag_areas[i].height);
+         }
+
+         for (unsigned i = 0; i < views; i++)
+            frag_areas[i] = frag_area;
+      }
+
       VkRect2D bin = { { x1, y1 }, { x2 - x1, y2 - y1 } };
       util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
                              struct tu_fdm_bin_patchpoint, patch) {
@@ -1455,6 +1470,15 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
          tu_clear_sysmem_attachment(cmd, cs, i, &clear_values[i]);
 
    tu_cond_exec_end(cs);
+
+   /* We need to re-emit any draw states that are patched in order for them to
+    * be correctly added to the per-renderpass patchpoint list, even if they
+    * are the same as before.
+    */
+   if (cmd->state.pass->has_fdm) {
+      cmd->state.dirty |=
+         TU_CMD_DIRTY_VIEWPORTS | TU_CMD_DIRTY_SCISSORS;
+   }
 }
 
 static void
@@ -1694,6 +1718,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
 
    trace_end_render_pass(&cmd->trace, &cmd->cs);
 
+   /* We have trashed the dynamically-emitted viewport, scissor, and FS params
+    * via the patchpoints, so we need to re-emit them if they are reused for a
+    * later render pass.
+    */
+   if (cmd->state.pass->has_fdm) {
+      cmd->state.dirty |=
+         TU_CMD_DIRTY_VIEWPORTS | TU_CMD_DIRTY_SCISSORS;
+   }
+
    /* tu6_render_tile has cloned these tracepoints for each tile */
    if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end))
       u_trace_disable_event_range(cmd->trace_renderpass_start,
@@ -2911,6 +2944,15 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
       cmd->state.dirty |= TU_CMD_DIRTY_VIEWPORTS;
    }
 
+   if (pipeline->viewport.set_dynamic_scissor_to_static) {
+      memcpy(cmd->state.scissor, pipeline->viewport.scissors,
+             pipeline->viewport.num_scissors *
+             sizeof(pipeline->viewport.scissors[0]));
+
+      cmd->state.scissor_count = pipeline->viewport.num_scissors;
+      cmd->state.dirty |= TU_CMD_DIRTY_SCISSORS;
+   }
+
    if ((pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT)) &&
       !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VIEWPORT_COUNT)) &&
       cmd->state.viewport_count != pipeline->viewport.num_viewports) {
@@ -2925,6 +2967,14 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
       cmd->state.dirty |= TU_CMD_DIRTY_SCISSORS;
    }
 
+   if (pipeline->viewport.per_view_viewport != cmd->state.per_view_viewport) {
+      cmd->state.per_view_viewport = pipeline->viewport.per_view_viewport;
+      if (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT))
+         cmd->state.dirty |= TU_CMD_DIRTY_VIEWPORTS;
+      if (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_SCISSOR))
+         cmd->state.dirty |= TU_CMD_DIRTY_SCISSORS;
+   }
+
    if (!(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT)))
       cmd->state.dirty &= ~TU_CMD_DIRTY_VIEWPORTS;
@@ -4122,6 +4172,7 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst,
    dst->disable_gmem |= src->disable_gmem;
    dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode;
    dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
+   dst->shared_viewport |= src->shared_viewport;
 
    dst->drawcall_count += src->drawcall_count;
    dst->drawcall_bandwidth_per_sample_sum +=
@@ -5060,6 +5111,146 @@ tu6_emit_blend(struct tu_cs *cs, struct tu_cmd_buffer *cmd)
                               ~pipeline->blend.rb_blend_cntl_mask));
 }
 
+struct apply_viewport_state {
+   VkViewport viewports[MAX_VIEWPORTS];
+   unsigned num_viewports;
+   bool z_negative_one_to_one;
+   bool share_scale;
+};
+
+/* It's a hardware restriction that the window offset (i.e. bin.offset) must
+ * be the same for all views. This means that GMEM coordinates cannot be a
+ * simple scaling of framebuffer coordinates, because this would require us to
+ * scale the window offset and the scale may be different per view. Instead we
+ * have to apply a per-bin offset to the GMEM coordinate transform to make
+ * sure that the window offset maps to itself. Specifically we need an offset
+ * o to the transform:
+ *
+ *    x' = s * x + o
+ *
+ * so that when we plug in the bin start b_s:
+ *
+ *    b_s = s * b_s + o
+ *
+ * and we get:
+ *
+ *    o = b_s - s * b_s
+ *
+ * We use this form exactly, because we know the bin offset is a multiple of
+ * the frag area so s * b_s is an integer and we can compute an exact result
+ * easily.
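+ *
+ * As a worked example (with numbers chosen purely for illustration): for a
+ * frag area of 2 and a bin starting at b_s = 96, we have s = 1/2 and
+ * o = 96 - 96 / 2 = 48, so the transform is x' = x / 2 + 48, and the bin
+ * start indeed maps to itself: 96 / 2 + 48 = 96.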
+ */
+
+static VkOffset2D
+fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
+{
+   assert(bin.offset.x % frag_area.width == 0);
+   assert(bin.offset.y % frag_area.height == 0);
+
+   return (VkOffset2D) {
+      bin.offset.x - bin.offset.x / frag_area.width,
+      bin.offset.y - bin.offset.y / frag_area.height
+   };
+}
+
+static void
+fdm_apply_viewports(struct tu_cs *cs, void *data, VkRect2D bin, unsigned views,
+                    VkExtent2D *frag_areas)
+{
+   VkViewport viewports[MAX_VIEWPORTS];
+   const struct apply_viewport_state *state =
+      (const struct apply_viewport_state *)data;
+
+   for (unsigned i = 0; i < state->num_viewports; i++) {
+      /* Note: If we're using shared scaling, the scale should already be the
+       * same across all views, so we can pick any view. However the number
+       * of viewports and the number of views are not guaranteed to be the
+       * same, so we need to pick the 0'th view, which always exists, to be
+       * safe.
+       *
+       * Conversely, if we're not using shared scaling then the rasterizer in
+       * the original pipeline is using only the first viewport, so we need to
+       * replicate it across all viewports.
+       */
+      VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
+      VkViewport viewport =
+         state->share_scale ? state->viewports[i] : state->viewports[0];
+      if (frag_area.width == 1 && frag_area.height == 1) {
+         viewports[i] = viewport;
+         continue;
+      }
+
+      float scale_x = 1.0f / frag_area.width;
+      float scale_y = 1.0f / frag_area.height;
+
+      viewports[i].minDepth = viewport.minDepth;
+      viewports[i].maxDepth = viewport.maxDepth;
+      viewports[i].width = viewport.width * scale_x;
+      viewports[i].height = viewport.height * scale_y;
+
+      VkOffset2D offset = fdm_per_bin_offset(frag_area, bin);
+
+      viewports[i].x = scale_x * viewport.x + offset.x;
+      viewports[i].y = scale_y * viewport.y + offset.y;
+   }
+
+   tu6_emit_viewport(cs, viewports, state->num_viewports,
+                     state->z_negative_one_to_one);
+}
+
+struct apply_scissor_state {
+   VkRect2D scissors[MAX_VIEWPORTS];
+   unsigned num_scissors;
+   bool share_scale;
+};
+
+static void
+fdm_apply_scissors(struct tu_cs *cs, void *data, VkRect2D bin, unsigned views,
+                   VkExtent2D *frag_areas)
+{
+   VkRect2D scissors[MAX_VIEWPORTS];
+   const struct apply_scissor_state *state =
+      (const struct apply_scissor_state *)data;
+
+   for (unsigned i = 0; i < state->num_scissors; i++) {
+      VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
+      VkRect2D scissor =
+         state->share_scale ? state->scissors[i] : state->scissors[0];
+      if (frag_area.width == 1 && frag_area.height == 1) {
+         scissors[i] = scissor;
+         continue;
+      }
+
+      /* Transform the scissor following the viewport. It's unclear how this
+       * is supposed to handle cases where the scissor isn't aligned to the
+       * fragment area, but we round outwards to always render partial
+       * fragments if the scissor size equals the framebuffer size and it
+       * isn't aligned to the fragment area.
+       */
+      VkOffset2D offset = fdm_per_bin_offset(frag_area, bin);
+      VkOffset2D min = {
+         scissor.offset.x / frag_area.width + offset.x,
+         scissor.offset.y / frag_area.height + offset.y,
+      };
+      VkOffset2D max = {
+         DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
+         DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
+      };
+
+      /* Intersect the scissor with the scaled bin; this essentially replaces
+       * the window scissor.
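+       *
+       * Note that the per-bin offset maps bin.offset to itself, so the scaled
+       * bin starts at bin.offset and extends bin.extent / frag_area pixels in
+       * each dimension; the MAX2/MIN2 below clamp against exactly that
+       * region.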
+       */
+      uint32_t scaled_width = bin.extent.width / frag_area.width;
+      uint32_t scaled_height = bin.extent.height / frag_area.height;
+      scissors[i].offset.x = MAX2(min.x, bin.offset.x);
+      scissors[i].offset.y = MAX2(min.y, bin.offset.y);
+      scissors[i].extent.width =
+         MIN2(max.x, bin.offset.x + scaled_width) - scissors[i].offset.x;
+      scissors[i].extent.height =
+         MIN2(max.y, bin.offset.y + scaled_height) - scissors[i].offset.y;
+   }
+
+   tu6_emit_scissor(cs, scissors, state->num_scissors);
+}
+
 static VkResult
 tu6_draw_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
@@ -5190,14 +5381,49 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
       cmd->state.shader_const = tu6_emit_consts(cmd, pipeline, false);
 
    if (dirty & TU_CMD_DIRTY_VIEWPORTS) {
-      struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.viewport_count);
-      tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.viewport_count,
-                        cmd->state.z_negative_one_to_one);
+      if (pipeline->fs.fragment_density_map) {
+         unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
+         unsigned num_viewports = cmd->state.per_view_viewport ?
+            num_views : cmd->state.viewport_count;
+         struct apply_viewport_state state = {
+            .num_viewports = num_viewports,
+            .z_negative_one_to_one = cmd->state.z_negative_one_to_one,
+            .share_scale = !cmd->state.per_view_viewport,
+         };
+         memcpy(&state.viewports, cmd->state.viewport, sizeof(state.viewports));
+         tu_cs_set_writeable(&cmd->sub_cs, true);
+         struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * num_viewports);
+         tu_cs_set_writeable(&cmd->sub_cs, false);
+         tu_create_fdm_bin_patchpoint(cmd, &cs, 8 + 10 * num_viewports,
+                                      fdm_apply_viewports, state);
+         cmd->state.rp.shared_viewport |= !cmd->state.per_view_viewport;
+      } else {
+         struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.viewport_count);
+         tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.viewport_count,
+                           cmd->state.z_negative_one_to_one);
+      }
    }
 
    if (dirty & TU_CMD_DIRTY_SCISSORS) {
-      struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.scissor_count);
-      tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.scissor_count);
+      if (pipeline->fs.fragment_density_map) {
+         unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
+         unsigned num_scissors = cmd->state.per_view_viewport ?
+            num_views : cmd->state.scissor_count;
+         struct apply_scissor_state state = {
+            .num_scissors = num_scissors,
+            .share_scale = !cmd->state.per_view_viewport,
+         };
+         memcpy(&state.scissors, cmd->state.scissor, sizeof(state.scissors));
+         tu_cs_set_writeable(&cmd->sub_cs, true);
+         struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * num_scissors);
+         tu_cs_set_writeable(&cmd->sub_cs, false);
+         tu_create_fdm_bin_patchpoint(cmd, &cs, 1 + 2 * num_scissors,
+                                      fdm_apply_scissors, state);
+         cmd->state.rp.shared_viewport |= !cmd->state.per_view_viewport;
+      } else {
+         struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.scissor_count);
+         tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.scissor_count);
+      }
    }
 
    if (dirty & TU_CMD_DIRTY_BLEND) {
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index 9216d1e4caa..29db0ba9740 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -283,6 +283,7 @@ struct tu_render_pass_state
    bool has_prim_generated_query_in_rp;
    bool disable_gmem;
    bool sysmem_single_prim_mode;
+   bool shared_viewport;
 
    /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
    bool draw_cs_writes_to_cond_pred;
@@ -428,6 +429,7 @@ struct tu_cmd_state
    VkViewport viewport[MAX_VIEWPORTS];
    VkRect2D scissor[MAX_SCISSORS];
    uint32_t viewport_count, scissor_count;
+   bool per_view_viewport;
 
    /* for dynamic states that can't be emitted directly */
    uint32_t dynamic_stencil_mask;
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index df61e70f2f9..bccdff4e8a9 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -4093,6 +4093,8 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
 
    struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
    struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
+   struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
+   struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY];
    if (hs) {
       pipeline->program.vs_param_stride = vs->output_size;
       pipeline->program.hs_param_stride = hs->output_size;
@@ -4117,6 +4119,16 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
                                     pipeline->tess.patch_control_points);
       }
    }
+
+   struct ir3_shader_variant *last_shader;
+   if (gs)
+      last_shader = gs;
+   else if (ds)
+      last_shader = ds;
+   else
+      last_shader = vs;
+
+   pipeline->program.writes_viewport = last_shader->writes_viewport;
 }
 
 static void
@@ -4251,16 +4263,34 @@ tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount)) {
       tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount,
                         pipeline->viewport.z_negative_one_to_one);
-   } else if (pipeline->viewport.set_dynamic_vp_to_static) {
+   }
+
+   /* We have to save the static viewports if set_dynamic_vp_to_static is
+    * set, but it may also be set later, during pipeline linking, when the
+    * viewport state is static, because FDM enables set_dynamic_vp_to_static
+    * in a different pipeline stage. Therefore we also have to save them if
+    * the viewport state is static, even though we emit them above.
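+    * (For example, a pipeline library that provides static viewport state
+    * may later be linked with a fragment stage that uses a fragment density
+    * map.)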
+    */
+   if (pipeline->viewport.set_dynamic_vp_to_static ||
+       !(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT))) {
       memcpy(pipeline->viewport.viewports, vp_info->pViewports,
              vp_info->viewportCount * sizeof(*vp_info->pViewports));
    }
 
    pipeline->viewport.num_viewports = vp_info->viewportCount;
 
-   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
+   assert(!pipeline->viewport.set_dynamic_scissor_to_static);
+   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount)) {
       tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
 
+      /* Similarly to the above, we need to save off the static scissors if
+       * they were originally static, but nothing sets
+       * set_dynamic_scissor_to_static except FDM.
+       */
+      memcpy(pipeline->viewport.scissors, vp_info->pScissors,
+             vp_info->scissorCount * sizeof(*vp_info->pScissors));
+   }
+
    pipeline->viewport.num_scissors = vp_info->scissorCount;
 }
@@ -4519,6 +4549,42 @@ tu_pipeline_builder_parse_rast_ds(struct tu_pipeline_builder *builder,
       else
          tu_cs_emit(&cs, pipeline->rast_ds.rb_depth_cntl);
    }
+
+   /* With FDM we have to overwrite the viewport and scissor, so they have
+    * to be set dynamically. This can only be done once we know the output
+    * state and whether the viewport/scissor is dynamic. We also have to
+    * figure out whether we can use per-view viewports and enable that if
+    * so.
+    */
+   if (pipeline->fs.fragment_density_map) {
+      if (!(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT))) {
+         pipeline->viewport.set_dynamic_vp_to_static = true;
+         pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
+      }
+
+      if (!(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_SCISSOR))) {
+         pipeline->viewport.set_dynamic_scissor_to_static = true;
+         pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
+      }
+
+      /* We can use per-view viewports if the last geometry stage doesn't
+       * write its own viewport.
+       */
+      pipeline->viewport.per_view_viewport =
+         !pipeline->program.writes_viewport &&
+         builder->device->physical_device->info->a6xx.has_per_view_viewport;
+
+      /* Fixup GRAS_SU_CNTL and re-emit rast state if necessary. */
+      if (pipeline->viewport.per_view_viewport) {
+         pipeline->rast.gras_su_cntl |= A6XX_GRAS_SU_CNTL_VIEWPORTINDEXINCR;
+
+         if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RAST,
+                                      tu6_rast_size(builder->device))) {
+            tu6_emit_rast(&cs, pipeline->rast.gras_su_cntl,
+                          pipeline->rast.gras_cl_cntl,
+                          pipeline->rast.polygon_mode);
+         }
+      }
+   }
 }
 
 static void
diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h
index e04ed774f3f..b95ae6589ec 100644
--- a/src/freedreno/vulkan/tu_pipeline.h
+++ b/src/freedreno/vulkan/tu_pipeline.h
@@ -233,6 +233,8 @@ struct tu_pipeline
       uint32_t hs_param_dwords;
       uint32_t hs_vertices_out;
       uint32_t cs_instrlen;
+
+      bool writes_viewport;
    } program;
 
    struct
@@ -258,9 +260,12 @@ struct tu_pipeline
    struct
    {
      VkViewport viewports[MAX_VIEWPORTS];
+     VkRect2D scissors[MAX_SCISSORS];
      unsigned num_viewports, num_scissors;
      bool set_dynamic_vp_to_static;
+     bool set_dynamic_scissor_to_static;
      bool z_negative_one_to_one;
+     bool per_view_viewport;
    } viewport;
 
    /* Used only for libraries. compiled_shaders only contains variants compiled