mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-23 00:20:09 +01:00
tu: Implement bin merging for fragment density map
This will let us merge compatible bins with a larger-than-1 fragment area, reducing tile load/store overhead. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33230>
This commit is contained in:
parent
ab79e0de82
commit
3fdaad0948
6 changed files with 168 additions and 12 deletions
|
|
@ -654,8 +654,8 @@ Additionally, not all ``TU_DEBUG`` options can be toggled at runtime, the follow
|
|||
are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebin``,
|
||||
``layout``, ``nolrz``, ``nolrzfc``, ``perf``, ``flushall``, ``syncdraw``,
|
||||
``rast_order``, ``unaligned_store``, ``log_skip_gmem_ops``, ``3d_load``, ``fdm``,
|
||||
``noconcurrentresolves``, ``noconcurrentunresolves``.
|
||||
``noconcurrentresolves``, ``noconcurrentunresolves``, ``nobinmerging``.
|
||||
|
||||
Some of these options will behave differently when toggled at runtime, for example:
|
||||
``nolrz`` will still result in LRZ allocation which would not happen if the option
|
||||
was set in the environment variable.
|
||||
was set in the environment variable.
|
||||
|
|
|
|||
|
|
@ -234,6 +234,8 @@ struct fd_dev_info {
|
|||
|
||||
float line_width_min;
|
||||
float line_width_max;
|
||||
|
||||
bool has_bin_mask;
|
||||
} a6xx;
|
||||
|
||||
struct {
|
||||
|
|
|
|||
|
|
@ -866,6 +866,7 @@ a7xx_base = A6XXProps(
|
|||
prede_nop_quirk = True,
|
||||
predtf_nop_quirk = True,
|
||||
has_sad = True,
|
||||
has_bin_mask = True,
|
||||
)
|
||||
|
||||
a7xx_gen1 = A7XXProps(
|
||||
|
|
|
|||
|
|
@ -1073,7 +1073,8 @@ tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
|||
/* Describes one bin handed to the tile rendering code: where it is, which
 * pipe/slots it takes its visibility data from, and its fragment areas.
 */
struct tu_tile_config {
|
||||
/* Position of the bin's first tile, in tile coordinates. */
VkOffset2D pos;
|
||||
/* Index of the pipe this bin belongs to; used to index tiling->pipe_sizes
 * and to compute the visibility stream offsets.
 */
uint32_t pipe;
|
||||
/* NOTE(review): this listing shows both "slot" and "slot_mask"; "slot"
 * looks like the single-slot field that slot_mask supersedes - confirm
 * which members exist in the final struct.
 */
uint32_t slot;
|
||||
/* One bit per slot of the pipe covered by this bin. An unmerged tile has
 * exactly one bit set; merging ORs the masks of the merged tiles together.
 */
uint32_t slot_mask;
|
||||
/* Size of the bin in tiles; { 1, 1 } for an unmerged tile. */
VkExtent2D extent;
|
||||
/* Per-view fragment area. Tiles are only merged when these are equal for
 * every view.
 */
VkExtent2D frag_areas[MAX_VIEWS];
|
||||
};
|
||||
|
||||
|
|
@ -1115,6 +1116,8 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
|||
tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1);
|
||||
tu6_emit_window_offset<CHIP>(cs, x1, y1);
|
||||
|
||||
unsigned slot = ffs(tile->slot_mask) - 1;
|
||||
|
||||
if (hw_binning) {
|
||||
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
|
||||
|
||||
|
|
@ -1123,13 +1126,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
|||
|
||||
tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
|
||||
tu_cs_emit(cs, tiling->pipe_sizes[tile->pipe] |
|
||||
CP_SET_BIN_DATA5_0_VSC_N(tile->slot));
|
||||
CP_SET_BIN_DATA5_0_VSC_N(slot) |
|
||||
CP_SET_BIN_DATA5_0_VSC_MASK(tile->slot_mask >> slot));
|
||||
tu_cs_emit(cs, tile->pipe * cmd->vsc_draw_strm_pitch);
|
||||
tu_cs_emit(cs, tile->pipe * 4);
|
||||
tu_cs_emit(cs, tile->pipe * cmd->vsc_prim_strm_pitch);
|
||||
}
|
||||
|
||||
tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, tile->slot, hw_binning);
|
||||
if (util_is_power_of_two_nonzero(tile->slot_mask))
|
||||
tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, slot, hw_binning);
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
|
||||
tu_cs_emit(cs, !hw_binning);
|
||||
|
|
@ -1140,7 +1145,10 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
|||
if (fdm) {
|
||||
unsigned views =
|
||||
cmd->state.pass->num_views ? cmd->state.pass->num_views : 1;
|
||||
VkRect2D bin = { { x1, y1 }, { x2 - x1, y2 - y1 } };
|
||||
VkRect2D bin = {
|
||||
{ x1, y1 },
|
||||
{ (x2 - x1) * tile->extent.width, (y2 - y1) * tile->extent.height }
|
||||
};
|
||||
util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
|
||||
struct tu_fdm_bin_patchpoint, patch) {
|
||||
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
|
||||
|
|
@ -2256,8 +2264,11 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
|||
}
|
||||
|
||||
/* Predicate is changed in draw_cs so we have to re-emit it */
|
||||
if (cmd->state.rp.draw_cs_writes_to_cond_pred)
|
||||
tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, tile->slot, false);
|
||||
if (cmd->state.rp.draw_cs_writes_to_cond_pred &&
|
||||
util_is_power_of_two_nonzero(tile->slot_mask)) {
|
||||
uint32_t slot = ffs(tile->slot_mask) - 1;
|
||||
tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, slot, false);
|
||||
}
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
|
||||
tu_cs_emit(cs, 0x0);
|
||||
|
|
@ -2367,6 +2378,118 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
|
|||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
try_merge_tiles(struct tu_tile_config *dst, const struct tu_tile_config *src,
|
||||
unsigned views)
|
||||
{
|
||||
uint32_t slot_mask = dst->slot_mask | src->slot_mask;
|
||||
|
||||
/* The fragment areas must be the same. */
|
||||
for (unsigned i = 0; i < views; i++) {
|
||||
if (dst->frag_areas[i].width != src->frag_areas[i].width ||
|
||||
dst->frag_areas[i].height != src->frag_areas[i].height)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* The mask of the combined tile has to fit in 16 bits */
|
||||
uint32_t hw_mask = slot_mask >> (ffs(slot_mask) - 1);
|
||||
if ((hw_mask & 0xffff) != hw_mask)
|
||||
return false;
|
||||
|
||||
/* Note, this assumes that dst is below or to the right of src, which is
|
||||
* how we call this function below.
|
||||
*/
|
||||
VkExtent2D extent = {
|
||||
dst->extent.width + (dst->pos.x - src->pos.x),
|
||||
dst->extent.height + (dst->pos.y - src->pos.y),
|
||||
};
|
||||
|
||||
assert(dst->extent.height > 0);
|
||||
|
||||
/* The common fragment areas must not be smaller than the combined bin
|
||||
* extent, so that the combined bin is not larger than the original
|
||||
* unscaled bin.
|
||||
*/
|
||||
for (unsigned i = 0; i < views; i++) {
|
||||
if (dst->frag_areas[i].width < extent.width ||
|
||||
dst->frag_areas[i].height < extent.height)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Ok, let's combine them. dst is below or to the right of src, so it takes
|
||||
* src's position.
|
||||
*/
|
||||
dst->extent = extent;
|
||||
dst->pos = src->pos;
|
||||
dst->slot_mask = slot_mask;
|
||||
return true;
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
void
|
||||
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
|
||||
uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
|
||||
const struct tu_image_view *fdm)
|
||||
{
|
||||
uint32_t width = tx2 - tx1;
|
||||
uint32_t height = ty2 - ty1;
|
||||
unsigned views =
|
||||
cmd->state.pass->num_views ? cmd->state.pass->num_views : 1;
|
||||
|
||||
struct tu_tile_config tiles[width * height];
|
||||
|
||||
/* Initialize tiles and sample fragment density map */
|
||||
for (uint32_t y = 0; y < height; y++) {
|
||||
for (uint32_t x = 0; x < width; x++) {
|
||||
struct tu_tile_config *tile = &tiles[width * y + x];
|
||||
tile->pos = { x + tx1, y + ty1 };
|
||||
tile->extent = { 1, 1 };
|
||||
tile->pipe = pipe;
|
||||
tile->slot_mask = 1u << (width * y + x);
|
||||
tu_calc_frag_area(cmd, tile, fdm);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t merged_tiles = 0;
|
||||
|
||||
/* Merge tiles */
|
||||
for (uint32_t y = 0; y < height; y++) {
|
||||
for (uint32_t x = 0; x < width; x++) {
|
||||
struct tu_tile_config *tile = &tiles[width * y + x];
|
||||
if (x > 0) {
|
||||
struct tu_tile_config *prev_x_tile = &tiles[width * y + x - 1];
|
||||
if (try_merge_tiles(tile, prev_x_tile, views)) {
|
||||
merged_tiles |= prev_x_tile->slot_mask;
|
||||
}
|
||||
}
|
||||
if (y > 0) {
|
||||
struct tu_tile_config *prev_y_tile = &tiles[width * (y - 1) + x];
|
||||
if (!(merged_tiles & prev_y_tile->slot_mask) &&
|
||||
try_merge_tiles(tile, prev_y_tile, views)) {
|
||||
merged_tiles |= prev_y_tile->slot_mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Finally, iterate over tiles and draw them */
|
||||
for (uint32_t y = 0; y < height; y++) {
|
||||
for (uint32_t x = 0; x < width; x++) {
|
||||
uint32_t tx;
|
||||
if (y & 1)
|
||||
tx = width - 1 - x;
|
||||
else
|
||||
tx = x;
|
||||
|
||||
unsigned tile_idx = y * width + tx;
|
||||
if (merged_tiles & (1u << tile_idx))
|
||||
continue;
|
||||
|
||||
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx], true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
||||
|
|
@ -2380,6 +2503,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
|||
}
|
||||
|
||||
bool has_fdm = fdm || (TU_DEBUG(FDM) && cmd->state.pass->has_fdm);
|
||||
bool merge_tiles = has_fdm && !TU_DEBUG(NO_BIN_MERGING) &&
|
||||
cmd->device->physical_device->info->a6xx.has_bin_mask;
|
||||
|
||||
/* Create gmem stores now (at EndRenderPass time) because they needed to
|
||||
* know whether to allow their conditional execution, which was tied to a
|
||||
|
|
@ -2410,6 +2535,12 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
|||
uint32_t ty1 = py * tiling->pipe0.height;
|
||||
uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width);
|
||||
uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height);
|
||||
|
||||
if (merge_tiles) {
|
||||
tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm);
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t tile_row_stride = tx2 - tx1;
|
||||
uint32_t slot_row = 0;
|
||||
for (uint32_t ty = ty1; ty < ty2; ty++) {
|
||||
|
|
@ -2423,7 +2554,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
|
|||
struct tu_tile_config tile = {
|
||||
.pos = { tx1 + tx, ty },
|
||||
.pipe = pipe,
|
||||
.slot = slot_row + tx,
|
||||
.slot_mask = 1u << (slot_row + tx),
|
||||
.extent = { 1, 1 },
|
||||
};
|
||||
if (has_fdm)
|
||||
tu_calc_frag_area(cmd, &tile, fdm);
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ static const struct debug_control tu_debug_options[] = {
|
|||
{ "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
|
||||
{ "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
|
||||
{ "dumpas", TU_DEBUG_DUMPAS },
|
||||
{ "nobinmerging", TU_DEBUG_NO_BIN_MERGING },
|
||||
{ NULL, 0 }
|
||||
};
|
||||
|
||||
|
|
@ -62,7 +63,8 @@ const uint32_t tu_runtime_debug_flags =
|
|||
TU_DEBUG_PERF | TU_DEBUG_FLUSHALL | TU_DEBUG_SYNCDRAW |
|
||||
TU_DEBUG_RAST_ORDER | TU_DEBUG_UNALIGNED_STORE |
|
||||
TU_DEBUG_LOG_SKIP_GMEM_OPS | TU_DEBUG_3D_LOAD | TU_DEBUG_FDM |
|
||||
TU_DEBUG_NO_CONCURRENT_RESOLVES | TU_DEBUG_NO_CONCURRENT_UNRESOLVES;
|
||||
TU_DEBUG_NO_CONCURRENT_RESOLVES | TU_DEBUG_NO_CONCURRENT_UNRESOLVES |
|
||||
TU_DEBUG_NO_BIN_MERGING;
|
||||
|
||||
os_file_notifier_t tu_debug_notifier;
|
||||
struct tu_env tu_env;
|
||||
|
|
@ -317,11 +319,29 @@ tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
|
|||
|
||||
static void
|
||||
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
|
||||
const struct tu_device *dev)
|
||||
const struct tu_device *dev,
|
||||
bool fdm)
|
||||
{
|
||||
const uint32_t max_pipe_count =
|
||||
dev->physical_device->info->num_vsc_pipes;
|
||||
|
||||
/* If there is a fragment density map and bin merging is enabled, we will
|
||||
* likely be able to merge some bins. Bins can only be merged if they are
|
||||
* in the same visibility stream, so making the pipes cover too small an
|
||||
* area can prevent bin merging from happening. Maximize the size of each
|
||||
* pipe instead of minimizing it.
|
||||
*/
|
||||
if (fdm && dev->physical_device->info->a6xx.has_bin_mask &&
|
||||
!TU_DEBUG(NO_BIN_MERGING)) {
|
||||
tiling->pipe0.width = 4;
|
||||
tiling->pipe0.height = 8;
|
||||
tiling->pipe_count.width =
|
||||
DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
|
||||
tiling->pipe_count.height =
|
||||
DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
|
||||
return;
|
||||
}
|
||||
|
||||
/* start from 1 tile per pipe */
|
||||
tiling->pipe0 = (VkExtent2D) {
|
||||
.width = 1,
|
||||
|
|
@ -422,7 +442,7 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
|
|||
if (!tiling->possible)
|
||||
continue;
|
||||
|
||||
tu_tiling_config_update_pipe_layout(tiling, device);
|
||||
tu_tiling_config_update_pipe_layout(tiling, device, pass->has_fdm);
|
||||
tu_tiling_config_update_pipes(tiling, device);
|
||||
tu_tiling_config_update_binning(tiling, device);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ enum tu_debug_flags
|
|||
TU_DEBUG_NO_CONCURRENT_RESOLVES = 1 << 27,
|
||||
TU_DEBUG_NO_CONCURRENT_UNRESOLVES = 1 << 28,
|
||||
TU_DEBUG_DUMPAS = 1 << 29,
|
||||
TU_DEBUG_NO_BIN_MERGING = 1 << 30,
|
||||
};
|
||||
|
||||
struct tu_env {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue