diff --git a/docs/drivers/freedreno.rst b/docs/drivers/freedreno.rst index 032dd6e798c..52a7472427e 100644 --- a/docs/drivers/freedreno.rst +++ b/docs/drivers/freedreno.rst @@ -654,8 +654,8 @@ Additionally, not all ``TU_DEBUG`` options can be toggled at runtime, the follow are supported at the moment: ``nir``, ``nobin``, ``sysmem``, ``gmem``, ``forcebin``, ``layout``, ``nolrz``, ``nolrzfc``, ``perf``, ``flushall``, ``syncdraw``, ``rast_order``, ``unaligned_store``, ``log_skip_gmem_ops``, ``3d_load``, ``fdm``, -``noconcurrentresolves``, ``noconcurrentunresolves``. +``noconcurrentresolves``, ``noconcurrentunresolves``, ``nobinmerging``. Some of these options will behave differently when toggled at runtime, for example: ``nolrz`` will still result in LRZ allocation which would not happen if the option -was set in the environment variable. \ No newline at end of file +was set in the environment variable. diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index eb09db1241a..0338091b981 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -234,6 +234,8 @@ struct fd_dev_info { float line_width_min; float line_width_max; + + bool has_bin_mask; } a6xx; struct { diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index 0e7e74c6a14..7268de338b5 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -866,6 +866,7 @@ a7xx_base = A6XXProps( prede_nop_quirk = True, predtf_nop_quirk = True, has_sad = True, + has_bin_mask = True, ) a7xx_gen1 = A7XXProps( diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index b017f6cfc05..c08b1068ddb 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -1073,7 +1073,8 @@ tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_tile_config { 
VkOffset2D pos; uint32_t pipe; - uint32_t slot; + uint32_t slot_mask; /* one bit per slot of the pipe that this tile covers */ + VkExtent2D extent; /* size in tiles; { 1, 1 } until tiles get merged */ VkExtent2D frag_areas[MAX_VIEWS]; }; @@ -1115,6 +1116,8 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1); tu6_emit_window_offset(cs, x1, y1); + unsigned slot = ffs(tile->slot_mask) - 1; /* lowest slot covered by the tile */ + if (hw_binning) { tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); @@ -1123,13 +1126,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4); tu_cs_emit(cs, tiling->pipe_sizes[tile->pipe] | - CP_SET_BIN_DATA5_0_VSC_N(tile->slot)); + CP_SET_BIN_DATA5_0_VSC_N(slot) | + CP_SET_BIN_DATA5_0_VSC_MASK(tile->slot_mask >> slot)); tu_cs_emit(cs, tile->pipe * cmd->vsc_draw_strm_pitch); tu_cs_emit(cs, tile->pipe * 4); tu_cs_emit(cs, tile->pipe * cmd->vsc_prim_strm_pitch); } - tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, tile->slot, hw_binning); + if (util_is_power_of_two_nonzero(tile->slot_mask)) /* exactly one slot, i.e. the tile is not a merged one */ + tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, slot, hw_binning); tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); tu_cs_emit(cs, !hw_binning); @@ -1140,7 +1145,10 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, if (fdm) { unsigned views = cmd->state.pass->num_views ? 
cmd->state.pass->num_views : 1; - VkRect2D bin = { { x1, y1 }, { x2 - x1, y2 - y1 } }; + VkRect2D bin = { + { x1, y1 }, + { (x2 - x1) * tile->extent.width, (y2 - y1) * tile->extent.height } + }; util_dynarray_foreach (&cmd->fdm_bin_patchpoints, struct tu_fdm_bin_patchpoint, patch) { tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size); @@ -2256,8 +2264,11 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, } /* Predicate is changed in draw_cs so we have to re-emit it */ - if (cmd->state.rp.draw_cs_writes_to_cond_pred) - tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, tile->slot, false); + if (cmd->state.rp.draw_cs_writes_to_cond_pred && + util_is_power_of_two_nonzero(tile->slot_mask)) { + uint32_t slot = ffs(tile->slot_mask) - 1; + tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, slot, false); + } tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x0); @@ -2367,6 +2378,153 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd, } } +/* Try to fold src into dst so that both are rendered as one larger bin. + * + * dst must be directly below or directly to the right of src. On success + * dst takes over the position of src, the combined extent and the union of + * both slot masks, and true is returned; the caller then has to skip src. + * On failure dst is left untouched and false is returned. + */ +static bool +try_merge_tiles(struct tu_tile_config *dst, const struct tu_tile_config *src, + unsigned views) +{ + uint32_t slot_mask = dst->slot_mask | src->slot_mask; + + /* The fragment areas must be the same. */ + for (unsigned i = 0; i < views; i++) { + if (dst->frag_areas[i].width != src->frag_areas[i].width || + dst->frag_areas[i].height != src->frag_areas[i].height) + return false; + } + + /* The union of both tiles has to be a rectangle: they must share an + * edge of equal length and src has to end exactly where dst begins. + * Otherwise the combined extent computed below would not match the set + * of tiles selected by the combined slot mask. + */ + if (dst->pos.y == src->pos.y) { + if (dst->extent.height != src->extent.height || + src->pos.x + (int32_t)src->extent.width != dst->pos.x) + return false; + } else if (dst->pos.x == src->pos.x) { + if (dst->extent.width != src->extent.width || + src->pos.y + (int32_t)src->extent.height != dst->pos.y) + return false; + } else { + return false; + } + + /* The mask of the combined tile has to fit in 16 bits */ + uint32_t hw_mask = slot_mask >> (ffs(slot_mask) - 1); + if ((hw_mask & 0xffff) != hw_mask) + return false; + + /* Note, this assumes that dst is below or to the right of src, which is + * how we call this function below. + */ + VkExtent2D extent = { + dst->extent.width + (dst->pos.x - src->pos.x), + dst->extent.height + (dst->pos.y - src->pos.y), + }; + + assert(dst->extent.height > 0); + + /* The common fragment areas must not be smaller than the combined bin + * extent, so that the combined bin is not larger than the original + * unscaled bin. 
+ */ + for (unsigned i = 0; i < views; i++) { + if (dst->frag_areas[i].width < extent.width || + dst->frag_areas[i].height < extent.height) + return false; + } + + /* Ok, let's combine them. dst is below or to the right of src, so it takes + * src's position. + */ + dst->extent = extent; + dst->pos = src->pos; + dst->slot_mask = slot_mask; + return true; +} + +/* Render all tiles of one VSC pipe for a pass with a fragment density map, + * merging neighboring tiles that share the same fragment areas into one + * larger bin where possible. + * + * tx1/ty1 (inclusive) and tx2/ty2 (exclusive) give the range of tiles that + * belongs to the pipe. Every tile is tracked as one bit of a 32-bit mask, so + * a pipe must not contain more than 32 tiles. + */ +template +void +tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, + uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2, + const struct tu_image_view *fdm) +{ + uint32_t width = tx2 - tx1; + uint32_t height = ty2 - ty1; + unsigned views = + cmd->state.pass->num_views ? cmd->state.pass->num_views : 1; + + /* Each tile needs its own bit in slot_mask and merged_tiles. */ + assert(width * height <= 32); + + struct tu_tile_config tiles[width * height]; + + /* Initialize tiles and sample fragment density map */ + for (uint32_t y = 0; y < height; y++) { + for (uint32_t x = 0; x < width; x++) { + struct tu_tile_config *tile = &tiles[width * y + x]; + tile->pos = { x + tx1, y + ty1 }; + tile->extent = { 1, 1 }; + tile->pipe = pipe; + tile->slot_mask = 1u << (width * y + x); + tu_calc_frag_area(cmd, tile, fdm); + } + } + + uint32_t merged_tiles = 0; + + /* Merge tiles */ + for (uint32_t y = 0; y < height; y++) { + for (uint32_t x = 0; x < width; x++) { + struct tu_tile_config *tile = &tiles[width * y + x]; + if (x > 0) { + struct tu_tile_config *prev_x_tile = &tiles[width * y + x - 1]; + if (try_merge_tiles(tile, prev_x_tile, views)) { + merged_tiles |= prev_x_tile->slot_mask; + } + } + if (y > 0) { + struct tu_tile_config *prev_y_tile = &tiles[width * (y - 1) + x]; + if (!(merged_tiles & prev_y_tile->slot_mask) && + try_merge_tiles(tile, prev_y_tile, views)) { + merged_tiles |= prev_y_tile->slot_mask; + } + } + } + } + + /* Finally, iterate over tiles and draw them */ + for (uint32_t y = 0; y < height; y++) { + for (uint32_t x = 0; x < width; x++) { + uint32_t tx; + if (y & 1) + tx = width - 1 - x; + else + tx = x; + + unsigned tile_idx = y * width + tx; + if (merged_tiles & (1u << tile_idx)) + 
continue; + + tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx], true); + } + } +} + template static void tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, @@ -2380,6 +2538,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, } bool has_fdm = fdm || (TU_DEBUG(FDM) && cmd->state.pass->has_fdm); + bool merge_tiles = has_fdm && !TU_DEBUG(NO_BIN_MERGING) && + cmd->device->physical_device->info->a6xx.has_bin_mask; /* Create gmem stores now (at EndRenderPass time)) because they needed to * know whether to allow their conditional execution, which was tied to a @@ -2410,6 +2570,12 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, uint32_t ty1 = py * tiling->pipe0.height; uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width); uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height); + + if (merge_tiles) { + tu_render_pipe_fdm(cmd, pipe, tx1, ty1, tx2, ty2, fdm); + continue; + } + uint32_t tile_row_stride = tx2 - tx1; uint32_t slot_row = 0; for (uint32_t ty = ty1; ty < ty2; ty++) { @@ -2423,7 +2589,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, struct tu_tile_config tile = { .pos = { tx1 + tx, ty }, .pipe = pipe, - .slot = slot_row + tx, + .slot_mask = 1u << (slot_row + tx), + .extent = { 1, 1 }, }; if (has_fdm) tu_calc_frag_area(cmd, &tile, fdm); diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc index a45cdce3ba8..a699c71778f 100644 --- a/src/freedreno/vulkan/tu_util.cc +++ b/src/freedreno/vulkan/tu_util.cc @@ -47,6 +47,7 @@ static const struct debug_control tu_debug_options[] = { { "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES }, { "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES }, { "dumpas", TU_DEBUG_DUMPAS }, + { "nobinmerging", TU_DEBUG_NO_BIN_MERGING }, { NULL, 0 } }; @@ -62,7 +63,8 @@ const uint32_t tu_runtime_debug_flags = TU_DEBUG_PERF | TU_DEBUG_FLUSHALL | TU_DEBUG_SYNCDRAW | TU_DEBUG_RAST_ORDER | TU_DEBUG_UNALIGNED_STORE | TU_DEBUG_LOG_SKIP_GMEM_OPS | TU_DEBUG_3D_LOAD | 
TU_DEBUG_FDM | - TU_DEBUG_NO_CONCURRENT_RESOLVES | TU_DEBUG_NO_CONCURRENT_UNRESOLVES; + TU_DEBUG_NO_CONCURRENT_RESOLVES | TU_DEBUG_NO_CONCURRENT_UNRESOLVES | + TU_DEBUG_NO_BIN_MERGING; os_file_notifier_t tu_debug_notifier; struct tu_env tu_env; @@ -317,11 +319,29 @@ tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb, static void tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling, - const struct tu_device *dev) + const struct tu_device *dev, + bool fdm) { const uint32_t max_pipe_count = dev->physical_device->info->num_vsc_pipes; + /* If there is a fragment density map and bin merging is enabled, we will + * likely be able to merge some bins. Bins can only be merged if they are + * in the same visibility stream, so making the pipes cover too small an + * area can prevent bin merging from happening. Maximize the size of each + * pipe instead of minimizing it. + */ + if (fdm && dev->physical_device->info->a6xx.has_bin_mask && + !TU_DEBUG(NO_BIN_MERGING)) { + tiling->pipe0.width = 4; + tiling->pipe0.height = 8; + tiling->pipe_count.width = + DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width); + tiling->pipe_count.height = + DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height); + return; + } + /* start from 1 tile per pipe */ tiling->pipe0 = (VkExtent2D) { .width = 1, @@ -422,7 +442,7 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb, if (!tiling->possible) continue; - tu_tiling_config_update_pipe_layout(tiling, device); + tu_tiling_config_update_pipe_layout(tiling, device, pass->has_fdm); tu_tiling_config_update_pipes(tiling, device); tu_tiling_config_update_binning(tiling, device); } diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h index 1a5e09c0660..ca133d5215b 100644 --- a/src/freedreno/vulkan/tu_util.h +++ b/src/freedreno/vulkan/tu_util.h @@ -67,6 +67,7 @@ enum tu_debug_flags TU_DEBUG_NO_CONCURRENT_RESOLVES = 1 << 27, TU_DEBUG_NO_CONCURRENT_UNRESOLVES = 1 << 28, TU_DEBUG_DUMPAS = 1 << 
29, + TU_DEBUG_NO_BIN_MERGING = 1 << 30, }; struct tu_env {