v3dv: implement double-buffer mode

Double buffer mode splits the tile buffer size in half so we can
start processing the next tile while the current one is being
stored to memory. This mode is available only if MSAA is not enabled
and can, in theory, improve performance by reducing tile store
overhead. However, it comes at the cost of reducing the tile size,
which also causes some overhead of its own.

Testing shows that this helps some cases (e.g. the Vulkan Quake
ports) but hurts others (e.g. Unreal Engine 4), so for the time
being we don't enable this by default, but we allow enabling it
selectively by using V3D_DEBUG.

Reviewed-by: Juan A. Suarez <jasuarez@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14551>
This commit is contained in:
Iago Toral Quiroga 2022-01-05 11:07:59 +01:00 committed by Marge Bot
parent 821c66e50c
commit b9f9474577
10 changed files with 61 additions and 17 deletions

View file

@ -88,6 +88,8 @@ static const struct debug_named_value debug_control[] = {
"Force 16-bit precision on all TMU operations" }, "Force 16-bit precision on all TMU operations" },
{ "noloopunroll", V3D_DEBUG_NO_LOOP_UNROLL, { "noloopunroll", V3D_DEBUG_NO_LOOP_UNROLL,
"Disable loop unrolling" }, "Disable loop unrolling" },
{ "db", V3D_DEBUG_DOUBLE_BUFFER,
"Enable double buffer for Tile Buffer when MSAA is disabled" },
{ NULL } { NULL }
}; };

View file

@ -63,6 +63,7 @@ extern uint32_t V3D_DEBUG;
#define V3D_DEBUG_TMU_16BIT (1 << 19) #define V3D_DEBUG_TMU_16BIT (1 << 19)
#define V3D_DEBUG_NO_LOOP_UNROLL (1 << 20) #define V3D_DEBUG_NO_LOOP_UNROLL (1 << 20)
#define V3D_DEBUG_CL_NO_BIN (1 << 21) #define V3D_DEBUG_CL_NO_BIN (1 << 21)
#define V3D_DEBUG_DOUBLE_BUFFER (1 << 22)
#define V3D_DEBUG_SHADERS (V3D_DEBUG_TGSI | V3D_DEBUG_NIR | \ #define V3D_DEBUG_SHADERS (V3D_DEBUG_TGSI | V3D_DEBUG_NIR | \
V3D_DEBUG_VIR | V3D_DEBUG_QPU | \ V3D_DEBUG_VIR | V3D_DEBUG_QPU | \

View file

@ -88,8 +88,8 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
} }
void void
v3d_choose_tile_size(uint32_t color_attachment_count, v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
uint32_t max_color_bpp, bool msaa, bool msaa, bool double_buffer,
uint32_t *width, uint32_t *height) uint32_t *width, uint32_t *height)
{ {
static const uint8_t tile_sizes[] = { static const uint8_t tile_sizes[] = {
@ -108,8 +108,12 @@ v3d_choose_tile_size(uint32_t color_attachment_count,
else if (color_attachment_count > 1) else if (color_attachment_count > 1)
idx += 1; idx += 1;
/* MSAA and double-buffer are mutually exclusive */
assert(!msaa || !double_buffer);
if (msaa) if (msaa)
idx += 2; idx += 2;
else if (double_buffer)
idx += 1;
idx += max_color_bpp; idx += max_color_bpp;

View file

@ -35,8 +35,8 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
uint32_t wg_size); uint32_t wg_size);
void void
v3d_choose_tile_size(uint32_t color_attachment_count, v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
uint32_t max_color_bpp, bool msaa, bool msaa, bool double_buffer,
uint32_t *width, uint32_t *height); uint32_t *width, uint32_t *height);
#endif #endif

View file

@ -441,7 +441,21 @@ job_compute_frame_tiling(struct v3dv_job *job,
tiling->msaa = msaa; tiling->msaa = msaa;
tiling->internal_bpp = max_internal_bpp; tiling->internal_bpp = max_internal_bpp;
v3d_choose_tile_size(render_target_count, max_internal_bpp, msaa, /* We can use double-buffer when MSAA is disabled to reduce tile store
* overhead.
*
* FIXME: if we are emitting any tile loads the hardware will serialize
* loads and stores across tiles effectively disabling double buffering,
* so we would want to check for that and not enable it in that case to
* avoid reducing the tile size.
*/
tiling->double_buffer =
unlikely(V3D_DEBUG & V3D_DEBUG_DOUBLE_BUFFER) && !msaa;
assert(!tiling->msaa || !tiling->double_buffer);
v3d_choose_tile_size(render_target_count, max_internal_bpp,
tiling->msaa, tiling->double_buffer,
&tiling->tile_width, &tiling->tile_height); &tiling->tile_width, &tiling->tile_height);
tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width); tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);

View file

@ -310,7 +310,9 @@ subpass_get_granularity(struct v3dv_device *device,
} }
uint32_t width, height; uint32_t width, height;
v3d_choose_tile_size(color_attachment_count, max_bpp, msaa, &width, &height); bool double_buffer = (V3D_DEBUG & V3D_DEBUG_DOUBLE_BUFFER) && !msaa;
v3d_choose_tile_size(color_attachment_count, max_bpp, msaa,
double_buffer, &width, &height);
*granularity = (VkExtent2D) { *granularity = (VkExtent2D) {
.width = width, .width = width,
.height = height .height = height

View file

@ -745,6 +745,7 @@ struct v3dv_frame_tiling {
uint32_t render_target_count; uint32_t render_target_count;
uint32_t internal_bpp; uint32_t internal_bpp;
bool msaa; bool msaa;
bool double_buffer;
uint32_t tile_width; uint32_t tile_width;
uint32_t tile_height; uint32_t tile_height;
uint32_t draw_tiles_x; uint32_t draw_tiles_x;
@ -765,6 +766,21 @@ bool v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device,
struct v3dv_render_pass *pass, struct v3dv_render_pass *pass,
uint32_t subpass_idx); uint32_t subpass_idx);
/* Tells whether the initial tile buffer clear has to be emitted twice.
 *
 * In double-buffer mode each tile uses a different half of the tile buffer
 * memory, so two tiles can be in flight at once (one being stored to memory
 * while the next one is rendered). A single initial clear would therefore
 * only clear the first half of the tile buffer.
 *
 * Returns true only when double-buffer is enabled for this tiling setup and
 * the frame spans more than one tile column, more than one tile row, or more
 * than one layer. Otherwise one initial clear is enough.
 */
static inline bool
v3dv_do_double_initial_tile_clear(const struct v3dv_frame_tiling *tiling)
{
   /* Without double-buffer there is no second half to clear. */
   if (!tiling->double_buffer)
      return false;

   const bool multiple_tiles =
      tiling->draw_tiles_x > 1 || tiling->draw_tiles_y > 1;

   return multiple_tiles || tiling->layers > 1;
}
struct v3dv_cmd_pool { struct v3dv_cmd_pool {
struct vk_object_base base; struct vk_object_base base;

View file

@ -53,11 +53,13 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
config.number_of_layers = layers; config.number_of_layers = layers;
} }
assert(!tiling->double_buffer || !tiling->msaa);
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = tiling->width; config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height; config.height_in_pixels = tiling->height;
config.number_of_render_targets = MAX2(tiling->render_target_count, 1); config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa; config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
} }
@ -762,11 +764,13 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
*/ */
bool do_early_zs_clear = false; bool do_early_zs_clear = false;
const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
assert(!tiling->msaa || !tiling->double_buffer);
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
config.image_width_pixels = framebuffer->width; config.image_width_pixels = framebuffer->width;
config.image_height_pixels = framebuffer->height; config.image_height_pixels = framebuffer->height;
config.number_of_render_targets = MAX2(subpass->color_count, 1); config.number_of_render_targets = MAX2(subpass->color_count, 1);
config.multisample_mode_4x = tiling->msaa; config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@ -944,12 +948,6 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
tiling->frame_height_in_supertiles; tiling->frame_height_in_supertiles;
} }
/* Start by clearing the tile buffer. */
cl_emit(rcl, TILE_COORDINATES, coords) {
coords.tile_column_number = 0;
coords.tile_row_number = 0;
}
/* Emit an initial clear of the tile buffers. This is necessary /* Emit an initial clear of the tile buffers. This is necessary
* for any buffers that should be cleared (since clearing * for any buffers that should be cleared (since clearing
* normally happens at the *end* of the generic tile list), but * normally happens at the *end* of the generic tile list), but
@ -964,13 +962,13 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
* changes on V3D 3.x, and 2 dummy stores on 4.x. * changes on V3D 3.x, and 2 dummy stores on 4.x.
*/ */
for (int i = 0; i < 2; i++) { for (int i = 0; i < 2; i++) {
if (i > 0)
cl_emit(rcl, TILE_COORDINATES, coords); cl_emit(rcl, TILE_COORDINATES, coords);
cl_emit(rcl, END_OF_LOADS, end); cl_emit(rcl, END_OF_LOADS, end);
cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE; store.buffer_to_store = NONE;
} }
if (i == 0 && cmd_buffer->state.tile_aligned_render_area) { if (cmd_buffer->state.tile_aligned_render_area &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear; clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true; clear.clear_all_render_targets = true;

View file

@ -50,12 +50,14 @@ emit_rcl_prologue(struct v3dv_job *job,
if (job->cmd_buffer->state.oom) if (job->cmd_buffer->state.oom)
return NULL; return NULL;
assert(!tiling->msaa || !tiling->double_buffer);
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
config.early_z_disable = true; config.early_z_disable = true;
config.image_width_pixels = tiling->width; config.image_width_pixels = tiling->width;
config.image_height_pixels = tiling->height; config.image_height_pixels = tiling->height;
config.number_of_render_targets = 1; config.number_of_render_targets = 1;
config.multisample_mode_4x = tiling->msaa; config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
config.internal_depth_type = fb->internal_depth_type; config.internal_depth_type = fb->internal_depth_type;
} }
@ -166,7 +168,11 @@ emit_frame_setup(struct v3dv_job *job,
cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE; store.buffer_to_store = NONE;
} }
if (clear_value && i == 0) { /* When using double-buffering, we need to clear both buffers (unless
* we only have a single tile to render).
*/
if (clear_value &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = true; clear.clear_z_stencil_buffer = true;
clear.clear_all_render_targets = true; clear.clear_all_render_targets = true;

View file

@ -266,6 +266,7 @@ v3d_get_tile_buffer_size(bool is_msaa,
} }
v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp, is_msaa, v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp, is_msaa,
false /* double-buffer */,
tile_width, tile_height); tile_width, tile_height);
} }