tu: Implement bin merging for fragment density map

This will let us merge compatible bins with a larger-than-1 fragment
area, reducing tile load/store overhead.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33230>
This commit is contained in:
Connor Abbott 2025-01-26 15:30:15 -05:00 committed by Marge Bot
parent ab79e0de82
commit 3fdaad0948
6 changed files with 168 additions and 12 deletions

View file

@ -654,8 +654,8 @@ Additionally, not all ``TU_DEBUG`` options can be toggled at runtime, the follow
are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebin``,
``layout``, ``nolrz``, ``nolrzfc``, ``perf``, ``flushall``, ``syncdraw``,
``rast_order``, ``unaligned_store``, ``log_skip_gmem_ops``, ``3d_load``, ``fdm``,
``noconcurrentresolves``, ``noconcurrentunresolves``.
``noconcurrentresolves``, ``noconcurrentunresolves``, ``nobinmerging``.
Some of these options will behave differently when toggled at runtime, for example:
``nolrz`` will still result in LRZ allocation which would not happen if the option
was set in the environment variable.
was set in the environment variable.

View file

@ -234,6 +234,8 @@ struct fd_dev_info {
float line_width_min;
float line_width_max;
bool has_bin_mask;
} a6xx;
struct {

View file

@ -866,6 +866,7 @@ a7xx_base = A6XXProps(
prede_nop_quirk = True,
predtf_nop_quirk = True,
has_sad = True,
has_bin_mask = True,
)
a7xx_gen1 = A7XXProps(

View file

@ -1073,7 +1073,8 @@ tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_tile_config {
VkOffset2D pos;
uint32_t pipe;
uint32_t slot;
uint32_t slot_mask;
VkExtent2D extent;
VkExtent2D frag_areas[MAX_VIEWS];
};
@ -1115,6 +1116,8 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1);
tu6_emit_window_offset<CHIP>(cs, x1, y1);
unsigned slot = ffs(tile->slot_mask) - 1;
if (hw_binning) {
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
@ -1123,13 +1126,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
tu_cs_emit(cs, tiling->pipe_sizes[tile->pipe] |
CP_SET_BIN_DATA5_0_VSC_N(tile->slot));
CP_SET_BIN_DATA5_0_VSC_N(slot) |
CP_SET_BIN_DATA5_0_VSC_MASK(tile->slot_mask >> slot));
tu_cs_emit(cs, tile->pipe * cmd->vsc_draw_strm_pitch);
tu_cs_emit(cs, tile->pipe * 4);
tu_cs_emit(cs, tile->pipe * cmd->vsc_prim_strm_pitch);
}
tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, tile->slot, hw_binning);
if (util_is_power_of_two_nonzero(tile->slot_mask))
tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, slot, hw_binning);
tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
tu_cs_emit(cs, !hw_binning);
@ -1140,7 +1145,10 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
if (fdm) {
unsigned views =
cmd->state.pass->num_views ? cmd->state.pass->num_views : 1;
VkRect2D bin = { { x1, y1 }, { x2 - x1, y2 - y1 } };
VkRect2D bin = {
{ x1, y1 },
{ (x2 - x1) * tile->extent.width, (y2 - y1) * tile->extent.height }
};
util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
struct tu_fdm_bin_patchpoint, patch) {
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
@ -2256,8 +2264,11 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
}
/* Predicate is changed in draw_cs so we have to re-emit it */
if (cmd->state.rp.draw_cs_writes_to_cond_pred)
tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, tile->slot, false);
if (cmd->state.rp.draw_cs_writes_to_cond_pred &&
util_is_power_of_two_nonzero(tile->slot_mask)) {
uint32_t slot = ffs(tile->slot_mask) - 1;
tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, slot, false);
}
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x0);
@ -2367,6 +2378,118 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
}
}
/* Try to fold the tile "src" into the tile "dst", producing one larger bin.
 *
 * "dst" must be directly below or directly to the right of "src"; this is how
 * tu_render_pipe_fdm() calls it. On success "dst" is updated in place to
 * describe the combined rectangle (it takes src's position, the summed extent
 * and the union of both slot masks) and true is returned. On failure "dst" is
 * left untouched and false is returned.
 *
 * "views" is the number of entries of frag_areas[] that are compared.
 */
static bool
try_merge_tiles(struct tu_tile_config *dst, const struct tu_tile_config *src,
                unsigned views)
{
   uint32_t slot_mask = dst->slot_mask | src->slot_mask;

   /* The fragment areas must be the same. */
   for (unsigned i = 0; i < views; i++) {
      if (dst->frag_areas[i].width != src->frag_areas[i].width ||
          dst->frag_areas[i].height != src->frag_areas[i].height)
         return false;
   }

   /* The two rectangles must share a full edge, otherwise their union is not
    * a rectangle and the extent computed below would cover tiles that are not
    * part of slot_mask. Tiles stacked vertically (same x origin) need the
    * same width, tiles placed side by side (same y origin) need the same
    * height.
    */
   if (dst->pos.x == src->pos.x) {
      if (dst->extent.width != src->extent.width)
         return false;
   } else if (dst->pos.y == src->pos.y) {
      if (dst->extent.height != src->extent.height)
         return false;
   } else {
      return false;
   }

   /* The mask of the combined tile, taken relative to its lowest slot, has
    * to fit in 16 bits.
    */
   uint32_t hw_mask = slot_mask >> (ffs(slot_mask) - 1);
   if ((hw_mask & 0xffff) != hw_mask)
      return false;

   /* Note, this assumes that dst is below or to the right of src, which is
    * how we call this function below.
    */
   VkExtent2D extent = {
      dst->extent.width + (dst->pos.x - src->pos.x),
      dst->extent.height + (dst->pos.y - src->pos.y),
   };

   assert(dst->extent.height > 0);

   /* The common fragment areas must not be smaller than the combined bin
    * extent, so that the combined bin is not larger than the original
    * unscaled bin.
    */
   for (unsigned i = 0; i < views; i++) {
      if (dst->frag_areas[i].width < extent.width ||
          dst->frag_areas[i].height < extent.height)
         return false;
   }

   /* Ok, let's combine them. dst is below or to the right of src, so it takes
    * src's position.
    */
   dst->extent = extent;
   dst->pos = src->pos;
   dst->slot_mask = slot_mask;

   return true;
}

/* Render every tile of one visibility-stream pipe when a fragment density map
 * is in use, merging neighbouring tiles into larger bins where possible.
 *
 * The pipe covers the tile rectangle [tx1, tx2) x [ty1, ty2). Each tile gets
 * one bit in a 32-bit slot mask, indexed row-major within the pipe.
 */
template <chip CHIP>
void
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
                   uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
                   const struct tu_image_view *fdm)
{
   uint32_t width = tx2 - tx1;
   uint32_t height = ty2 - ty1;
   unsigned views =
      cmd->state.pass->num_views ? cmd->state.pass->num_views : 1;

   /* slot_mask and merged_tiles hold one bit per tile of the pipe, so a pipe
    * can contain at most 32 tiles.
    */
   assert(width * height <= 32);

   struct tu_tile_config tiles[width * height];

   /* Initialize tiles and sample fragment density map */
   for (uint32_t y = 0; y < height; y++) {
      for (uint32_t x = 0; x < width; x++) {
         struct tu_tile_config *tile = &tiles[width * y + x];
         tile->pos = { x + tx1, y + ty1 };
         tile->extent = { 1, 1 };
         tile->pipe = pipe;
         tile->slot_mask = 1u << (width * y + x);
         tu_calc_frag_area(cmd, tile, fdm);
      }
   }

   /* Bitmask of tiles that have been absorbed into another tile and must not
    * be rendered on their own.
    */
   uint32_t merged_tiles = 0;

   /* Merge tiles: each tile first tries to absorb its left neighbour, then
    * the neighbour above it.
    */
   for (uint32_t y = 0; y < height; y++) {
      for (uint32_t x = 0; x < width; x++) {
         struct tu_tile_config *tile = &tiles[width * y + x];
         if (x > 0) {
            struct tu_tile_config *prev_x_tile = &tiles[width * y + x - 1];
            if (try_merge_tiles(tile, prev_x_tile, views)) {
               merged_tiles |= prev_x_tile->slot_mask;
            }
         }
         if (y > 0) {
            unsigned prev_y_idx = width * (y - 1) + x;
            struct tu_tile_config *prev_y_tile = &tiles[prev_y_idx];

            /* We can't merge prev_y_tile into tile if it has itself already
             * been merged into another tile. Only its own bit matters here:
             * prev_y_tile->slot_mask also contains the tiles it absorbed,
             * which are always marked as merged.
             */
            if (!(merged_tiles & (1u << prev_y_idx)) &&
                try_merge_tiles(tile, prev_y_tile, views)) {
               merged_tiles |= prev_y_tile->slot_mask;
            }
         }
      }
   }

   /* Finally, iterate over tiles and draw them. Odd rows are walked right to
    * left so that consecutive tiles stay next to each other.
    */
   for (uint32_t y = 0; y < height; y++) {
      for (uint32_t x = 0; x < width; x++) {
         uint32_t tx;
         if (y & 1)
            tx = width - 1 - x;
         else
            tx = x;

         unsigned tile_idx = y * width + tx;
         if (merged_tiles & (1u << tile_idx))
            continue;

         tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx], true);
      }
   }
}
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
@ -2380,6 +2503,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
}
bool has_fdm = fdm || (TU_DEBUG(FDM) && cmd->state.pass->has_fdm);
bool merge_tiles = has_fdm && !TU_DEBUG(NO_BIN_MERGING) &&
cmd->device->physical_device->info->a6xx.has_bin_mask;
/* Create gmem stores now (at EndRenderPass time) because they needed to
* know whether to allow their conditional execution, which was tied to a
@ -2410,6 +2535,12 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
uint32_t ty1 = py * tiling->pipe0.height;
uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width);
uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height);
if (merge_tiles) {
tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm);
continue;
}
uint32_t tile_row_stride = tx2 - tx1;
uint32_t slot_row = 0;
for (uint32_t ty = ty1; ty < ty2; ty++) {
@ -2423,7 +2554,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
struct tu_tile_config tile = {
.pos = { tx1 + tx, ty },
.pipe = pipe,
.slot = slot_row + tx,
.slot_mask = 1u << (slot_row + tx),
.extent = { 1, 1 },
};
if (has_fdm)
tu_calc_frag_area(cmd, &tile, fdm);

View file

@ -47,6 +47,7 @@ static const struct debug_control tu_debug_options[] = {
{ "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
{ "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
{ "dumpas", TU_DEBUG_DUMPAS },
{ "nobinmerging", TU_DEBUG_NO_BIN_MERGING },
{ NULL, 0 }
};
@ -62,7 +63,8 @@ const uint32_t tu_runtime_debug_flags =
TU_DEBUG_PERF | TU_DEBUG_FLUSHALL | TU_DEBUG_SYNCDRAW |
TU_DEBUG_RAST_ORDER | TU_DEBUG_UNALIGNED_STORE |
TU_DEBUG_LOG_SKIP_GMEM_OPS | TU_DEBUG_3D_LOAD | TU_DEBUG_FDM |
TU_DEBUG_NO_CONCURRENT_RESOLVES | TU_DEBUG_NO_CONCURRENT_UNRESOLVES;
TU_DEBUG_NO_CONCURRENT_RESOLVES | TU_DEBUG_NO_CONCURRENT_UNRESOLVES |
TU_DEBUG_NO_BIN_MERGING;
os_file_notifier_t tu_debug_notifier;
struct tu_env tu_env;
@ -317,11 +319,29 @@ tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
static void
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
const struct tu_device *dev)
const struct tu_device *dev,
bool fdm)
{
const uint32_t max_pipe_count =
dev->physical_device->info->num_vsc_pipes;
/* If there is a fragment density map and bin merging is enabled, we will
* likely be able to merge some bins. Bins can only be merged if they are
* in the same visibility stream, so making the pipes cover too small an
* area can prevent bin merging from happening. Maximize the size of each
* pipe instead of minimizing it.
*/
if (fdm && dev->physical_device->info->a6xx.has_bin_mask &&
!TU_DEBUG(NO_BIN_MERGING)) {
tiling->pipe0.width = 4;
tiling->pipe0.height = 8;
tiling->pipe_count.width =
DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
tiling->pipe_count.height =
DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
return;
}
/* start from 1 tile per pipe */
tiling->pipe0 = (VkExtent2D) {
.width = 1,
@ -422,7 +442,7 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
if (!tiling->possible)
continue;
tu_tiling_config_update_pipe_layout(tiling, device);
tu_tiling_config_update_pipe_layout(tiling, device, pass->has_fdm);
tu_tiling_config_update_pipes(tiling, device);
tu_tiling_config_update_binning(tiling, device);
}

View file

@ -67,6 +67,7 @@ enum tu_debug_flags
TU_DEBUG_NO_CONCURRENT_RESOLVES = 1 << 27,
TU_DEBUG_NO_CONCURRENT_UNRESOLVES = 1 << 28,
TU_DEBUG_DUMPAS = 1 << 29,
TU_DEBUG_NO_BIN_MERGING = 1 << 30,
};
struct tu_env {