v3dv: add a heuristic for double-buffer mode

For this we add a scoring system that evaluates various aspects of
the draw calls in a job.

If the cost of the geometry side of the pipeline is too high, we may pay
too high a price in double-buffer mode, because with the smaller tile size
we will probably have more vertex shader invocations in the render and
binning stages.

On the other hand, if the rendering cost is too low, we may not have
enough rendering work to hide the latency of the tile stores in
double-buffer mode.
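
For intuition, here is a hedged, self-contained sketch of the arithmetic
(the instructions-plus-4x-TMU score, the 0.2% pixel estimate and the
2,000,000/100,000 cut-offs are the ones this patch introduces; the shader
sizes and the draw parameters are invented, and in the driver the scores
accumulate over all draws in a job):

  #include <inttypes.h>
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Per-invocation shader cost: QPU instructions plus a 4x weight on
   * TMU (memory access) operations. */
  static uint32_t prog_score(uint32_t inst_count, uint32_t tmu_count)
  {
     return inst_count + 4 * tmu_count;
  }

  int main(void)
  {
     /* Hypothetical draw: 10,000 vertices into a 1920x1080 render area. */
     const uint32_t vertex_count = 10000;
     /* A draw is assumed to touch ~0.2% of the render area's pixels. */
     const uint32_t pixel_count = (uint32_t)(0.002f * 1920 * 1080);

     const uint32_t vs_bin = prog_score(30, 2); /* binning vertex shader */
     const uint32_t vs     = prog_score(40, 3); /* render vertex shader */
     const uint32_t fs     = prog_score(60, 5); /* fragment shader */

     const uint32_t geom   = vertex_count * (vs_bin + vs);
     const uint32_t render = vertex_count * vs + pixel_count * fs;

     /* Same cut-offs as job_should_enable_double_buffer() in the diff. */
     const bool enable = geom <= 2000000 && render >= 100000;
     printf("geom=%" PRIu32 " render=%" PRIu32 " -> double-buffer %s\n",
            geom, render, enable ? "on" : "off");
     return 0;
  }

Here geom is 900,000 and render is 851,760, so double-buffer would be
enabled; bumping the draw to 50,000 vertices pushes geom past the
2,000,000 cut-off and disables it.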

Also, we can only make this decision once we know all the draw calls in
a job, but the double-buffer enable bit lives in the TILE_BINNING_MODE_CFG
packet, which must be emitted in the binning command list before any draw
calls are recorded. So if we decide to enable double-buffer mode we need
to rewrite that packet, and we also need to size the tile state to account
for the extra tiles. For this purpose we delay tile state setup for render
pass jobs until we are finishing the job.
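
The rewrite itself follows a simple record-then-patch pattern. Below is a
minimal, self-contained sketch with a toy packet layout (not v3dv code; in
the driver the location is saved via cl_start() in bcl_tile_binning_mode_ptr
and patched with cl_packet_pack(TILE_BINNING_MODE_CFG), as the diff shows):

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  /* Toy config packet standing in for TILE_BINNING_MODE_CFG. */
  struct config_packet {
     uint8_t opcode;
     uint8_t double_buffer; /* 0 = off, 1 = on */
     uint16_t width, height;
  };

  int main(void)
  {
     uint8_t cl[256]; /* toy binning command list */
     size_t used = 0;

     /* Emit the packet first, with double-buffer disabled, and remember
      * its location before recording any draws. */
     struct config_packet cfg = {
        .opcode = 0x78, .double_buffer = 0, .width = 1920, .height = 1080,
     };
     uint8_t *cfg_ptr = &cl[used];
     memcpy(cfg_ptr, &cfg, sizeof(cfg));
     used += sizeof(cfg);

     /* ... record all draw calls, accumulating heuristic scores ... */

     /* When finishing the job, rewrite the packet in place if the
      * heuristic decided in favor of double-buffer mode. */
     cfg.double_buffer = 1;
     memcpy(cfg_ptr, &cfg, sizeof(cfg));

     struct config_packet check;
     memcpy(&check, cfg_ptr, sizeof(check));
     printf("double_buffer = %u\n", check.double_buffer);
     return 0;
  }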

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17854>
Author: Iago Toral Quiroga (committed by Marge Bot)
Date: 2022-07-27 11:42:01 +02:00
Commit: ad1961a441 (parent: d17c56078a)
5 changed files with 180 additions and 20 deletions


@@ -470,18 +470,13 @@ v3dv_job_start_frame(struct v3dv_job *job,
{
assert(job);
/* FIXME: if we are emitting any tile loads the hardware will serialize
* loads and stores across tiles effectively disabling double buffering,
* so we would want to check for that and not enable it in that case to
* avoid reducing the tile size.
/* Start by computing frame tiling spec for this job assuming that
* double-buffer mode is disabled.
*/
bool double_buffer = unlikely(V3D_DEBUG & V3D_DEBUG_DOUBLE_BUFFER) && !msaa;
/* Start by computing frame tiling spec for this job */
const struct v3dv_frame_tiling *tiling =
job_compute_frame_tiling(job, width, height, layers,
render_target_count, max_internal_bpp,
msaa, double_buffer);
msaa, false);
v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
v3dv_return_if_oom(NULL, job);
@@ -503,6 +498,24 @@ v3dv_job_start_frame(struct v3dv_job *job,
job->first_ez_state = V3D_EZ_UNDECIDED;
}
static bool
job_should_enable_double_buffer(struct v3dv_job *job)
{
/* Incompatibility with double-buffer */
if (!job->can_use_double_buffer)
return false;
/* Too much geometry processing */
if (job->double_buffer_score.geom > 2000000)
return false;
/* Too little rendering to make up for tile store latency */
if (job->double_buffer_score.render < 100000)
return false;
return true;
}
static void
cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -518,6 +531,23 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
* any RCL commands of its own.
*/
if (v3dv_cl_offset(&job->rcl) == 0) {
/* Decide if we want to enable double-buffer for this job. If we do, then
* we need to rewrite the TILE_BINNING_MODE_CFG packet in the BCL.
*/
if (job_should_enable_double_buffer(job)) {
assert(!job->frame_tiling.double_buffer);
job_compute_frame_tiling(job,
job->frame_tiling.width,
job->frame_tiling.height,
job->frame_tiling.layers,
job->frame_tiling.render_target_count,
job->frame_tiling.internal_bpp,
job->frame_tiling.msaa,
true);
v3dv_X(job->device, job_emit_enable_double_buffer)(job);
}
/* At this point we have decided whether we want to use double-buffer or
* not and the job's frame tiling represents that decision so we can
* allocate the tile state, which we need to do before we emit the RCL.
@@ -997,6 +1027,13 @@ cmd_buffer_begin_render_pass_secondary(
cmd_buffer->state.render_area.extent.height =
framebuffer ? framebuffer->height : V3D_MAX_IMAGE_DIMENSION;
/* We only really execute double-buffer mode in primary jobs, so allow this
* mode in render pass secondaries so that we can keep track of the
* double-buffer score in them and update the primary job accordingly when
* the secondaries are executed into it.
*/
job->can_use_double_buffer = true;
return VK_SUCCESS;
}
@@ -2701,9 +2738,64 @@ consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_job *job)
cmd_buffer->state.barrier.bcl_image_access = 0;
}
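/* Estimate the per-invocation cost of a shader variant: one point per QPU
* instruction plus a 4x weight for each TMU (memory access) operation. */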
static inline uint32_t
compute_prog_score(struct v3dv_shader_variant *vs)
{
const uint32_t inst_count = vs->qpu_insts_size / sizeof(uint64_t);
const uint32_t tmu_count = vs->prog_data.base->tmu_count +
vs->prog_data.base->tmu_spills +
vs->prog_data.base->tmu_fills;
return inst_count + 4 * tmu_count;
}
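/* Accumulate this draw's contribution to the job's double-buffer scores:
* geometry cost from the binning and render vertex shaders scaled by the
* vertex count, and rendering cost from the render vertex shader plus an
* estimate of the fragment shading work. */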
static void
job_update_double_buffer_score(struct v3dv_job *job,
struct v3dv_pipeline *pipeline,
uint32_t vertex_count,
VkExtent2D *render_area)
{
/* FIXME: assume anything with GS workloads is too expensive */
struct v3dv_shader_variant *gs_bin =
pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
if (gs_bin) {
job->can_use_double_buffer = false;
return;
}
/* Keep track of vertex processing: too much geometry processing would not
* be good for double-buffer.
*/
struct v3dv_shader_variant *vs_bin =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
assert(vs_bin);
uint32_t geom_score = vertex_count * compute_prog_score(vs_bin);
struct v3dv_shader_variant *vs =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
assert(vs);
uint32_t vs_score = vertex_count * compute_prog_score(vs);
geom_score += vs_score;
job->double_buffer_score.geom += geom_score;
/* Compute pixel rendering cost.
*
* We estimate that on average a draw would render 0.2% of the pixels in
* the render area. That would be a 64x64 region in a 1920x1080 area.
*/
struct v3dv_shader_variant *fs =
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
assert(fs);
uint32_t pixel_count = 0.002f * render_area->width * render_area->height;
uint32_t render_score = vs_score + pixel_count * compute_prog_score(fs);
job->double_buffer_score.render += render_score;
}
void
v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
bool indexed, bool indirect)
bool indexed, bool indirect,
uint32_t vertex_count)
{
assert(cmd_buffer->state.gfx.pipeline);
assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
@@ -2808,6 +2900,16 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE))
v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer);
/* We disable double-buffer mode if indirect draws are used because in that
* case we don't know the vertex count.
*/
if (indirect) {
job->can_use_double_buffer = false;
} else if (job->can_use_double_buffer) {
job_update_double_buffer_score(job, pipeline, vertex_count,
&cmd_buffer->state.render_area.extent);
}
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
}
@@ -2823,10 +2925,12 @@ static void
cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_draw_info *info)
{
uint32_t vertex_count =
info->vertex_count * info->instance_count;
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (likely(!pass->multiview_enabled)) {
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false);
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
return;
}
@@ -2834,7 +2938,7 @@ cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
while (view_mask) {
cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false);
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
}
}
@@ -2872,9 +2976,11 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
uint32_t vertex_count = indexCount * instanceCount;
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (likely(!pass->multiview_enabled)) {
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false);
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
(cmd_buffer, indexCount, instanceCount,
firstIndex, vertexOffset, firstInstance);
@@ -2884,7 +2990,7 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
while (view_mask) {
cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false);
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
(cmd_buffer, indexCount, instanceCount,
firstIndex, vertexOffset, firstInstance);
@@ -2907,7 +3013,7 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (likely(!pass->multiview_enabled)) {
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true);
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
(cmd_buffer, buffer, offset, drawCount, stride);
return;
@@ -2916,7 +3022,7 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
while (view_mask) {
cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true);
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
(cmd_buffer, buffer, offset, drawCount, stride);
}
@@ -2938,7 +3044,7 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
if (likely(!pass->multiview_enabled)) {
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true);
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
(cmd_buffer, buffer, offset, drawCount, stride);
return;
@@ -2947,7 +3053,7 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
while (view_mask) {
cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true);
v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
(cmd_buffer, buffer, offset, drawCount, stride);
}


@@ -341,10 +341,12 @@ subpass_get_granularity(struct v3dv_device *device,
msaa = true;
}
/* Even if requested, double-buffer mode may or may not be enabled depending
* on heuristics, so we choose a conservative granularity here, with it
* disabled.
*/
uint32_t width, height;
bool double_buffer = (V3D_DEBUG & V3D_DEBUG_DOUBLE_BUFFER) && !msaa;
v3d_choose_tile_size(color_attachment_count, max_bpp, msaa,
double_buffer, &width, &height);
false /* double-buffer */, &width, &height);
*granularity = (VkExtent2D) {
.width = width,
.height = height


@@ -1086,6 +1086,16 @@ struct v3dv_job {
*/
bool can_use_double_buffer;
/* This structure keeps track of various scores to inform a heuristic
* for double-buffer mode.
*/
struct {
/* Cost of geometry shading */
uint32_t geom;
/* Cost of shader rendering */
uint32_t render;
} double_buffer_score;
/* We only need to allocate tile state for all layers if the binner
* writes primitives to layers other than the first. This can only be
* done using layered rendering (writing gl_Layer from a geometry shader),
@@ -1095,6 +1105,12 @@ ...
*/
bool allocate_tile_state_for_all_layers;
/* A pointer to the location of the TILE_BINNING_MODE_CFG packet so we can
* rewrite it to enable double-buffer mode once we have enough info about
* the job to make that decision.
*/
struct v3dv_cl_out *bcl_tile_binning_mode_ptr;
enum v3dv_job_type type;
struct v3dv_device *device;
@@ -1227,7 +1243,8 @@ v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
void **ptr);
void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
bool indexed, bool indirect);
bool indexed, bool indirect,
uint32_t vertex_count);
bool v3dv_job_allocate_tile_state(struct v3dv_job *job);


@@ -42,6 +42,29 @@ v3dX(job_emit_binning_flush)(struct v3dv_job *job)
cl_emit(&job->bcl, FLUSH, flush);
}
void
v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
{
assert(job->can_use_double_buffer);
assert(job->frame_tiling.double_buffer);
assert(!job->frame_tiling.msaa);
assert(job->bcl_tile_binning_mode_ptr);
const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
struct cl_packet_struct(TILE_BINNING_MODE_CFG) config = {
cl_packet_header(TILE_BINNING_MODE_CFG),
};
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
}
void
v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
const struct v3dv_frame_tiling *tiling,
@@ -55,6 +78,7 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
}
assert(!tiling->double_buffer || !tiling->msaa);
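/* Remember where TILE_BINNING_MODE_CFG is emitted so we can rewrite it
* later if we decide to enable double-buffer mode. */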
job->bcl_tile_binning_mode_ptr = cl_start(&job->bcl);
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
@@ -1655,6 +1679,14 @@ v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
}
}
if (!secondary_job->can_use_double_buffer) {
primary_job->can_use_double_buffer = false;
} else {
primary_job->double_buffer_score.geom +=
secondary_job->double_buffer_score.geom;
primary_job->double_buffer_score.render +=
secondary_job->double_buffer_score.render;
}
primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl;
} else {
/* This is a regular job (CPU or GPU), so just finish the current


@@ -74,6 +74,9 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
const struct v3dv_frame_tiling *tiling,
uint32_t layers);
void
v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job);
void
v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
uint32_t cmd_buffer_count,