diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build index b529488c09a..4d0856ed6ac 100644 --- a/src/freedreno/vulkan/meson.build +++ b/src/freedreno/vulkan/meson.build @@ -48,6 +48,7 @@ libtu_files = files( 'tu_rmv.cc', 'tu_shader.cc', 'tu_suballoc.cc', + 'tu_tile_config.cc', 'tu_util.cc', ) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 544cbc07375..8e9c21a82ef 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -20,6 +20,7 @@ #include "tu_event.h" #include "tu_image.h" #include "tu_knl.h" +#include "tu_tile_config.h" #include "tu_tracepoints.h" #include "common/freedreno_gpu_event.h" @@ -1310,13 +1311,6 @@ use_hw_binning(struct tu_cmd_buffer *cmd) return vsc->binning; } -static uint32_t -tu_fdm_num_layers(const struct tu_cmd_buffer *cmd) -{ - return cmd->state.pass->num_views ? cmd->state.pass->num_views : - (cmd->state.fdm_per_layer ? cmd->state.framebuffer->layers : 1); -} - static bool use_sysmem_rendering(struct tu_cmd_buffer *cmd, struct tu_renderpass_result **autotune_result) @@ -1424,59 +1418,6 @@ tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs, } } -struct tu_tile_config { - VkOffset2D pos; - uint32_t pipe; - uint32_t slot_mask; - uint32_t visible_views; - - /* The tile this tile was merged with. */ - struct tu_tile_config *merged_tile; - - /* For merged tiles, the extent in tiles when resolved to system memory. - */ - VkExtent2D sysmem_extent; - - /* For merged tiles, the extent in tiles in GMEM. This can only be more - * than 1 if there is extra free space from an unused view. - */ - VkExtent2D gmem_extent; - - VkExtent2D frag_areas[MAX_VIEWS]; -}; - -/* For bin offsetting we want to do "Euclidean division," where the remainder - * (i.e. the offset of the bin) is always positive. Unfortunately C/C++ - * remainder and division don't do this, so we have to implement it ourselves. 
- * - * For example, we should have: - * - * euclid_rem(-3, 4) = 1 - * euclid_rem(-4, 4) = 0 - * euclid_rem(-4, 4) = 3 - */ - -static int32_t -euclid_rem(int32_t divisor, int32_t divisend) -{ - if (divisor >= 0) - return divisor % divisend; - int32_t tmp = divisend - (-divisor % divisend); - return tmp == divisend ? 0 : tmp; -} - -/* Calculate how much the bins for a given view should be shifted to the left - * and upwards, given the application-provided FDM offset. - */ -static VkOffset2D -tu_bin_offset(VkOffset2D fdm_offset, const struct tu_tiling_config *tiling) -{ - return (VkOffset2D) { - euclid_rem(-fdm_offset.x, tiling->tile0.width), - euclid_rem(-fdm_offset.y, tiling->tile0.height), - }; -} - template static void tu6_emit_bin_size_gmem(struct tu_cmd_buffer *cmd, @@ -3673,184 +3614,6 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_sanity_check(cs); } -static void -tu_calc_frag_area(struct tu_cmd_buffer *cmd, - struct tu_tile_config *tile, - const struct tu_image_view *fdm, - const VkOffset2D *fdm_offsets) -{ - const struct tu_tiling_config *tiling = cmd->state.tiling; - const uint32_t x1 = tiling->tile0.width * tile->pos.x; - const uint32_t y1 = tiling->tile0.height * tile->pos.y; - const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE); - const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE); - - unsigned views = tu_fdm_num_layers(cmd); - const struct tu_framebuffer *fb = cmd->state.framebuffer; - struct tu_frag_area raw_areas[views]; - if (fdm) { - for (unsigned i = 0; i < views; i++) { - VkOffset2D sample_pos = { 0, 0 }; - - /* Offsets less than a tile size are accomplished by sliding the - * tiles. 
However once we shift a whole tile size then we reset the - * tiles back to where they were at the beginning and we need to - * adjust where each bin is sampling from: - * - * x offset = 0: - * - * ------------------------------------ - * | * | * | * | (unused) | - * ------------------------------------ - * - * x offset = 4: - * - * ------------------------- - * | * | * | * | * | - * ------------------------- - * - * x offset = 8: - * - * ------------------------------------ - * | * | * | * | (unused) | - * ------------------------------------ - * - * As the user's offset increases we slide the tiles to the right, - * until we reach the whole tile size and reset the tile positions. - * tu_bin_offset() returns an amount to shift to the left, negating - * the offset. - * - * If we were forced to use a shared viewport, then we must not shift - * over the tiles and instead must only shift when sampling because - * we cannot shift the tiles differently per view. This disables - * smooth transitions of the fragment density map and effectively - * negates the extension. - * - * Note that we cannot clamp x2/y2 to the framebuffer size, as we - * normally would do, because then tiles along the edge would - * incorrectly nudge the sample_pos towards the center of the - * framebuffer. If we shift one complete tile over towards the - * center and reset the tiles as above, the sample_pos would - * then shift back towards the edge and we could get a "pop" from - * suddenly changing density due to the slight shift. 
- */ - if (fdm_offsets) { - VkOffset2D offset = fdm_offsets[i]; - if (!cmd->state.rp.shared_viewport) { - VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling); - offset.x += bin_offset.x; - offset.y += bin_offset.y; - } - sample_pos.x = (x1 + x2) / 2 - offset.x; - sample_pos.y = (y1 + y2) / 2 - offset.y; - } else { - sample_pos.x = (x1 + MIN2(x2, fb->width)) / 2; - sample_pos.y = (y1 + MIN2(y2, fb->height)) / 2; - } - - tu_fragment_density_map_sample(fdm, - sample_pos.x, - sample_pos.y, - fb->width, fb->height, i, - &raw_areas[i]); - } - } else { - for (unsigned i = 0; i < views; i++) - raw_areas[i].width = raw_areas[i].height = 1.0f; - } - - for (unsigned i = 0; i < views; i++) { - float floor_x, floor_y; - float area = raw_areas[i].width * raw_areas[i].height; - float frac_x = modff(raw_areas[i].width, &floor_x); - float frac_y = modff(raw_areas[i].height, &floor_y); - - /* The Vulkan spec says that a density of 0 results in an undefined - * fragment area. However the blob driver skips rendering tiles with 0 - * density, and apps rely on that behavior. Replicate that here. - */ - if (!isfinite(area)) { - tile->frag_areas[i].width = UINT32_MAX; - tile->frag_areas[i].height = UINT32_MAX; - tile->visible_views &= ~(1u << i); - continue; - } - - /* The spec allows rounding up one of the axes as long as the total - * area is less than or equal to the original area. Take advantage of - * this to try rounding up the number with the largest fraction. - */ - if ((frac_x > frac_y ? (floor_x + 1.f) * floor_y : - floor_x * (floor_y + 1.f)) <= area) { - if (frac_x > frac_y) - floor_x += 1.f; - else - floor_y += 1.f; - } - uint32_t width = floor_x; - uint32_t height = floor_y; - - /* Areas that aren't a power of two, especially large areas, can create - * in floating-point rounding errors when dividing by the area in the - * viewport that result in under-rendering. Round down to a power of two - * to make sure all operations are exact. 
- */ - width = 1u << util_logbase2(width); - height = 1u << util_logbase2(height); - - /* When FDM offset is enabled, the fragment area has to divide the - * offset to make sure that we don't have tiles with partial fragments. - * It would be bad to have the fragment area change as a function of the - * offset, because we'd get "popping" as the resolution changes with the - * offset, so just make sure it divides the offset granularity. This - * should mean it always divides the offset for any possible offset. - */ - if (fdm_offsets) { - width = MIN2(width, TU_FDM_OFFSET_GRANULARITY); - height = MIN2(height, TU_FDM_OFFSET_GRANULARITY); - } - - /* HW viewport scaling supports a maximum fragment width/height of 4. - */ - if (views <= MAX_HW_SCALED_VIEWS) { - width = MIN2(width, 4); - height = MIN2(height, 4); - } - - /* Make sure that the width/height divides the tile width/height so - * we don't have to do extra awkward clamping of the edges of each - * bin when resolving. It also has to divide the fdm offset, if any. - * Note that because the tile width is rounded to a multiple of 32 any - * power of two 32 or less will work, and if there is an offset then it - * must be a multiple of 4 so 2 or 4 will definitely work. - * - * TODO: Try to take advantage of the total area allowance here, too. - */ - while (tiling->tile0.width % width != 0) - width /= 2; - while (tiling->tile0.height % height != 0) - height /= 2; - - tile->frag_areas[i].width = width; - tile->frag_areas[i].height = height; - } - - /* If at any point we were forced to use the same scaling for all - * viewports, we need to make sure that any users *not* using shared - * scaling, including loads/stores, also consistently share the scaling. 
- */ - if (cmd->state.rp.shared_viewport) { - VkExtent2D frag_area = { UINT32_MAX, UINT32_MAX }; - for (unsigned i = 0; i < views; i++) { - frag_area.width = MIN2(frag_area.width, tile->frag_areas[i].width); - frag_area.height = MIN2(frag_area.height, tile->frag_areas[i].height); - } - - for (unsigned i = 0; i < views; i++) - tile->frag_areas[i] = frag_area; - } -} - static void tu_identity_frag_area(struct tu_cmd_buffer *cmd, struct tu_tile_config *tile) @@ -3859,239 +3622,6 @@ tu_identity_frag_area(struct tu_cmd_buffer *cmd, tile->frag_areas[i] = (VkExtent2D) { 1, 1 }; } -static bool -rects_intersect(VkRect2D a, VkRect2D b) -{ - return a.offset.x < b.offset.x + (int32_t)b.extent.width && - b.offset.x < a.offset.x + (int32_t)a.extent.width && - a.offset.y < b.offset.y + (int32_t)b.extent.height && - b.offset.y < a.offset.y + (int32_t)a.extent.height; -} - -/* Use the render area(s) to figure out which views of the bin are visible. - */ -static void -tu_calc_bin_visibility(struct tu_cmd_buffer *cmd, - struct tu_tile_config *tile, - const VkOffset2D *offsets) -{ - const struct tu_tiling_config *tiling = cmd->state.tiling; - uint32_t views = tu_fdm_num_layers(cmd); - VkRect2D bin = { - { - tile->pos.x * tiling->tile0.width, - tile->pos.y * tiling->tile0.height - }, - tiling->tile0 - }; - - tile->visible_views = 0; - for (unsigned i = 0; i < views; i++) { - VkRect2D offsetted_bin = bin; - if (offsets && !cmd->state.rp.shared_viewport) { - VkOffset2D bin_offset = tu_bin_offset(offsets[i], tiling); - offsetted_bin.offset.x -= bin_offset.x; - offsetted_bin.offset.y -= bin_offset.y; - } - - if (rects_intersect(offsetted_bin, - cmd->state.per_layer_render_area ? 
- cmd->state.render_areas[i] : - cmd->state.render_areas[0])) { - tile->visible_views |= (1u << i); - } - } -} - -static bool -try_merge_tiles(struct tu_tile_config *dst, struct tu_tile_config *src, - unsigned views, bool has_abs_bin_mask, bool shared_viewport) -{ - uint32_t slot_mask = dst->slot_mask | src->slot_mask; - uint32_t visible_views = dst->visible_views | src->visible_views; - - /* The fragment areas must be the same for views where both bins are - * visible. - */ - for (unsigned i = 0; i < views; i++) { - if ((dst->visible_views & src->visible_views & (1u << i)) && - (dst->frag_areas[i].width != src->frag_areas[i].width || - dst->frag_areas[i].height != src->frag_areas[i].height)) - return false; - } - - /* The tiles must be vertically or horizontally adjacent and have the - * compatible width/height. - */ - if (dst->pos.x == src->pos.x) { - if (dst->sysmem_extent.height != src->sysmem_extent.height) - return false; - } else if (dst->pos.y == src->pos.y) { - if (dst->sysmem_extent.width != src->sysmem_extent.width) - return false; - } else { - return false; - } - - if (dst->gmem_extent.width != src->gmem_extent.width || - dst->gmem_extent.height != src->gmem_extent.height) - return false; - - if (!has_abs_bin_mask) { - /* The mask of the combined tile has to fit in 16 bits */ - uint32_t hw_mask = slot_mask >> (ffs(slot_mask) - 1); - if ((hw_mask & 0xffff) != hw_mask) - return false; - } - - /* Note, this assumes that dst is below or to the right of src, which is - * how we call this function below. - */ - VkExtent2D extent = { - dst->sysmem_extent.width + (dst->pos.x - src->pos.x), - dst->sysmem_extent.height + (dst->pos.y - src->pos.y), - }; - - assert(dst->sysmem_extent.height > 0); - - /* If only the first view is visible in both tiles, we can reuse the GMEM - * space meant for the rest of the views to multiply the height of the - * tile. We can't do this if we can't override the scissor for different - * views though. 
- */ - unsigned height_multiplier = 1; - if (visible_views == 1 && views > 1 && dst->gmem_extent.height == 1 && - !shared_viewport) - height_multiplier = views; - else - height_multiplier = dst->gmem_extent.height; - - /* The combined fragment areas must not be smaller than the combined bin - * extent, so that the combined bin is not larger than the original - * unscaled bin. - */ - for (unsigned i = 0; i < views; i++) { - if ((dst->visible_views & (1u << i)) && - (dst->frag_areas[i].width < extent.width || - dst->frag_areas[i].height * height_multiplier < extent.height)) - return false; - if ((src->visible_views & (1u << i)) && - (src->frag_areas[i].width < extent.width || - src->frag_areas[i].height * height_multiplier < extent.height)) - return false; - } - - /* Ok, let's combine them. dst is below or to the right of src, so it takes - * src's position. - */ - for (unsigned i = 0; i < views; i++) { - if (src->visible_views & ~dst->visible_views & (1u << i)) - dst->frag_areas[i] = src->frag_areas[i]; - if (((src->visible_views | dst->visible_views) & (1u << i)) && - dst->frag_areas[i].height < extent.height) - dst->gmem_extent.height = height_multiplier; - } - dst->sysmem_extent = extent; - dst->visible_views = visible_views; - dst->pos = src->pos; - dst->slot_mask = slot_mask; - - src->merged_tile = dst; - - return true; -} - -static void -tu_merge_tiles(struct tu_cmd_buffer *cmd, const struct tu_vsc_config *vsc, - struct tu_tile_config *tiles, - uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2) -{ - bool has_abs_mask = - cmd->device->physical_device->info->props.has_abs_bin_mask; - unsigned views = tu_fdm_num_layers(cmd); - bool shared_viewport = cmd->state.rp.shared_viewport; - uint32_t width = vsc->tile_count.width; - - for (uint32_t y = ty1; y < ty2; y++) { - for (uint32_t x = tx1; x < tx2; x++) { - struct tu_tile_config *tile = - &tiles[width * y + x]; - if (tile->visible_views == 0) - continue; - if (x > tx1) { - struct tu_tile_config *prev_x_tile = 
&tiles[width * y + x - 1]; - try_merge_tiles(tile, prev_x_tile, views, has_abs_mask, - shared_viewport); - } - if (y > ty1) { - unsigned prev_y_idx = width * (y - 1) + x; - struct tu_tile_config *prev_y_tile = &tiles[prev_y_idx]; - - /* We can't merge prev_y_tile into tile if it's already been - * merged horizontally into its neighbor in the previous row. - */ - if (!prev_y_tile->merged_tile) { - try_merge_tiles(tile, prev_y_tile, views, has_abs_mask, - shared_viewport); - } - } - } - } -} - -static struct tu_tile_config * -tu_calc_tile_config(struct tu_cmd_buffer *cmd, const struct tu_vsc_config *vsc, - const struct tu_image_view *fdm, const VkOffset2D *fdm_offsets) -{ - struct tu_tile_config *tiles = (struct tu_tile_config *) - calloc(vsc->tile_count.width * vsc->tile_count.height, - sizeof(struct tu_tile_config)); - - for (uint32_t py = 0; py < vsc->pipe_count.height; py++) { - uint32_t ty1 = py * vsc->pipe0.height; - uint32_t ty2 = MIN2(ty1 + vsc->pipe0.height, vsc->tile_count.height); - for (uint32_t px = 0; px < vsc->pipe_count.width; px++) { - uint32_t tx1 = px * vsc->pipe0.width; - uint32_t tx2 = MIN2(tx1 + vsc->pipe0.width, vsc->tile_count.width); - uint32_t pipe_width = tx2 - tx1; - uint32_t pipe = py * vsc->pipe_count.width + px; - - /* Initialize tiles and sample fragment density map */ - for (uint32_t y = ty1; y < ty2; y++) { - for (uint32_t x = tx1; x < tx2; x++) { - uint32_t tx = x - tx1; - uint32_t ty = y - ty1; - struct tu_tile_config *tile = &tiles[vsc->tile_count.width * y + x]; - - tile->pos = { x, y }; - tile->sysmem_extent = { 1, 1 }; - tile->gmem_extent = { 1, 1 }; - tile->pipe = pipe; - tile->slot_mask = 1u << (pipe_width * ty + tx); - tile->merged_tile = NULL; - tu_calc_bin_visibility(cmd, tile, fdm_offsets); - tu_calc_frag_area(cmd, tile, fdm, fdm_offsets); - } - } - - /* Merge tiles */ - /* TODO: we should also be able to merge tiles when only - * per_view_render_areas is used without FDM. 
That requires using - * another method to force disable draws since we don't want to force - * the viewport to be re-emitted, like overriding the view mask. It - * would also require disabling stores, and adding patchpoints for - * CmdClearAttachments in secondaries or making it use the view mask. - */ - if (!TU_DEBUG(NO_BIN_MERGING) && - cmd->device->physical_device->info->props.has_bin_mask) { - tu_merge_tiles(cmd, vsc, tiles, tx1, ty1, tx2, ty2); - } - } - } - - return tiles; -} - static VkResult tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem) { diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index f2943f30acf..1014132d483 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -954,4 +954,43 @@ tu7_set_thread_br_patchpoint(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool force_disable_cb); +/* For bin offsetting we want to do "Euclidean division," where the remainder + * (i.e. the offset of the bin) is always positive. Unfortunately C/C++ + * remainder and division don't do this, so we have to implement it ourselves. + * + * For example, we should have: + * + * euclid_rem(-3, 4) = 1 + * euclid_rem(-4, 4) = 0 + * euclid_rem(-5, 4) = 3 + */ + +static inline int32_t +euclid_rem(int32_t divisor, int32_t divisend) +{ + if (divisor >= 0) + return divisor % divisend; + int32_t tmp = divisend - (-divisor % divisend); + return tmp == divisend ? 0 : tmp; +} + +/* Calculate how much the bins for a given view should be shifted to the left + * and upwards, given the application-provided FDM offset. + */ +static inline VkOffset2D +tu_bin_offset(VkOffset2D fdm_offset, const struct tu_tiling_config *tiling) +{ + return (VkOffset2D) { + euclid_rem(-fdm_offset.x, tiling->tile0.width), + euclid_rem(-fdm_offset.y, tiling->tile0.height), + }; +} + +static inline uint32_t +tu_fdm_num_layers(const struct tu_cmd_buffer *cmd) +{ + return cmd->state.pass->num_views ? 
cmd->state.pass->num_views : + (cmd->state.fdm_per_layer ? cmd->state.framebuffer->layers : 1); +} + #endif /* TU_CMD_BUFFER_H */ diff --git a/src/freedreno/vulkan/tu_tile_config.cc b/src/freedreno/vulkan/tu_tile_config.cc new file mode 100644 index 00000000000..87f9031eee7 --- /dev/null +++ b/src/freedreno/vulkan/tu_tile_config.cc @@ -0,0 +1,425 @@ +/* + * Copyright © 2026 Valve Corporation. + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * SPDX-License-Identifier: MIT + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + */ + +#include "tu_cmd_buffer.h" +#include "tu_tile_config.h" + +static void +tu_calc_frag_area(struct tu_cmd_buffer *cmd, + struct tu_tile_config *tile, + const struct tu_image_view *fdm, + const VkOffset2D *fdm_offsets) +{ + const struct tu_tiling_config *tiling = cmd->state.tiling; + const uint32_t x1 = tiling->tile0.width * tile->pos.x; + const uint32_t y1 = tiling->tile0.height * tile->pos.y; + const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE); + const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE); + + unsigned views = tu_fdm_num_layers(cmd); + const struct tu_framebuffer *fb = cmd->state.framebuffer; + struct tu_frag_area raw_areas[views]; + if (fdm) { + for (unsigned i = 0; i < views; i++) { + VkOffset2D sample_pos = { 0, 0 }; + + /* Offsets less than a tile size are accomplished by sliding the + * tiles. 
However once we shift a whole tile size then we reset the + * tiles back to where they were at the beginning and we need to + * adjust where each bin is sampling from: + * + * x offset = 0: + * + * ------------------------------------ + * | * | * | * | (unused) | + * ------------------------------------ + * + * x offset = 4: + * + * ------------------------- + * | * | * | * | * | + * ------------------------- + * + * x offset = 8: + * + * ------------------------------------ + * | * | * | * | (unused) | + * ------------------------------------ + * + * As the user's offset increases we slide the tiles to the right, + * until we reach the whole tile size and reset the tile positions. + * tu_bin_offset() returns an amount to shift to the left, negating + * the offset. + * + * If we were forced to use a shared viewport, then we must not shift + * over the tiles and instead must only shift when sampling because + * we cannot shift the tiles differently per view. This disables + * smooth transitions of the fragment density map and effectively + * negates the extension. + * + * Note that we cannot clamp x2/y2 to the framebuffer size, as we + * normally would do, because then tiles along the edge would + * incorrectly nudge the sample_pos towards the center of the + * framebuffer. If we shift one complete tile over towards the + * center and reset the tiles as above, the sample_pos would + * then shift back towards the edge and we could get a "pop" from + * suddenly changing density due to the slight shift. 
+ */ + if (fdm_offsets) { + VkOffset2D offset = fdm_offsets[i]; + if (!cmd->state.rp.shared_viewport) { + VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling); + offset.x += bin_offset.x; + offset.y += bin_offset.y; + } + sample_pos.x = (x1 + x2) / 2 - offset.x; + sample_pos.y = (y1 + y2) / 2 - offset.y; + } else { + sample_pos.x = (x1 + MIN2(x2, fb->width)) / 2; + sample_pos.y = (y1 + MIN2(y2, fb->height)) / 2; + } + + tu_fragment_density_map_sample(fdm, + sample_pos.x, + sample_pos.y, + fb->width, fb->height, i, + &raw_areas[i]); + } + } else { + for (unsigned i = 0; i < views; i++) + raw_areas[i].width = raw_areas[i].height = 1.0f; + } + + for (unsigned i = 0; i < views; i++) { + float floor_x, floor_y; + float area = raw_areas[i].width * raw_areas[i].height; + float frac_x = modff(raw_areas[i].width, &floor_x); + float frac_y = modff(raw_areas[i].height, &floor_y); + + /* The Vulkan spec says that a density of 0 results in an undefined + * fragment area. However the blob driver skips rendering tiles with 0 + * density, and apps rely on that behavior. Replicate that here. + */ + if (!isfinite(area)) { + tile->frag_areas[i].width = UINT32_MAX; + tile->frag_areas[i].height = UINT32_MAX; + tile->visible_views &= ~(1u << i); + continue; + } + + /* The spec allows rounding up one of the axes as long as the total + * area is less than or equal to the original area. Take advantage of + * this to try rounding up the number with the largest fraction. + */ + if ((frac_x > frac_y ? (floor_x + 1.f) * floor_y : + floor_x * (floor_y + 1.f)) <= area) { + if (frac_x > frac_y) + floor_x += 1.f; + else + floor_y += 1.f; + } + uint32_t width = floor_x; + uint32_t height = floor_y; + + /* Areas that aren't a power of two, especially large areas, can create + * floating-point rounding errors when dividing by the area in the + * viewport that result in under-rendering. Round down to a power of two + * to make sure all operations are exact. 
+ */ + width = 1u << util_logbase2(width); + height = 1u << util_logbase2(height); + + /* When FDM offset is enabled, the fragment area has to divide the + * offset to make sure that we don't have tiles with partial fragments. + * It would be bad to have the fragment area change as a function of the + * offset, because we'd get "popping" as the resolution changes with the + * offset, so just make sure it divides the offset granularity. This + * should mean it always divides the offset for any possible offset. + */ + if (fdm_offsets) { + width = MIN2(width, TU_FDM_OFFSET_GRANULARITY); + height = MIN2(height, TU_FDM_OFFSET_GRANULARITY); + } + + /* HW viewport scaling supports a maximum fragment width/height of 4. + */ + if (views <= MAX_HW_SCALED_VIEWS) { + width = MIN2(width, 4); + height = MIN2(height, 4); + } + + /* Make sure that the width/height divides the tile width/height so + * we don't have to do extra awkward clamping of the edges of each + * bin when resolving. It also has to divide the fdm offset, if any. + * Note that because the tile width is rounded to a multiple of 32 any + * power of two 32 or less will work, and if there is an offset then it + * must be a multiple of 4 so 2 or 4 will definitely work. + * + * TODO: Try to take advantage of the total area allowance here, too. + */ + while (tiling->tile0.width % width != 0) + width /= 2; + while (tiling->tile0.height % height != 0) + height /= 2; + + tile->frag_areas[i].width = width; + tile->frag_areas[i].height = height; + } + + /* If at any point we were forced to use the same scaling for all + * viewports, we need to make sure that any users *not* using shared + * scaling, including loads/stores, also consistently share the scaling. 
+ */ + if (cmd->state.rp.shared_viewport) { + VkExtent2D frag_area = { UINT32_MAX, UINT32_MAX }; + for (unsigned i = 0; i < views; i++) { + frag_area.width = MIN2(frag_area.width, tile->frag_areas[i].width); + frag_area.height = MIN2(frag_area.height, tile->frag_areas[i].height); + } + + for (unsigned i = 0; i < views; i++) + tile->frag_areas[i] = frag_area; + } +} + +static bool +rects_intersect(VkRect2D a, VkRect2D b) +{ + return a.offset.x < b.offset.x + (int32_t)b.extent.width && + b.offset.x < a.offset.x + (int32_t)a.extent.width && + a.offset.y < b.offset.y + (int32_t)b.extent.height && + b.offset.y < a.offset.y + (int32_t)a.extent.height; +} + +/* Use the render area(s) to figure out which views of the bin are visible. + */ +void +tu_calc_bin_visibility(struct tu_cmd_buffer *cmd, + struct tu_tile_config *tile, + const VkOffset2D *offsets) +{ + const struct tu_tiling_config *tiling = cmd->state.tiling; + uint32_t views = tu_fdm_num_layers(cmd); + VkRect2D bin = { + { + tile->pos.x * tiling->tile0.width, + tile->pos.y * tiling->tile0.height + }, + tiling->tile0 + }; + + tile->visible_views = 0; + for (unsigned i = 0; i < views; i++) { + VkRect2D offsetted_bin = bin; + if (offsets && !cmd->state.rp.shared_viewport) { + VkOffset2D bin_offset = tu_bin_offset(offsets[i], tiling); + offsetted_bin.offset.x -= bin_offset.x; + offsetted_bin.offset.y -= bin_offset.y; + } + + if (rects_intersect(offsetted_bin, + cmd->state.per_layer_render_area ? + cmd->state.render_areas[i] : + cmd->state.render_areas[0])) { + tile->visible_views |= (1u << i); + } + } +} + +static bool +try_merge_tiles(struct tu_tile_config *dst, struct tu_tile_config *src, + unsigned views, bool has_abs_bin_mask, bool shared_viewport) +{ + uint32_t slot_mask = dst->slot_mask | src->slot_mask; + uint32_t visible_views = dst->visible_views | src->visible_views; + + /* The fragment areas must be the same for views where both bins are + * visible. 
+ */ + for (unsigned i = 0; i < views; i++) { + if ((dst->visible_views & src->visible_views & (1u << i)) && + (dst->frag_areas[i].width != src->frag_areas[i].width || + dst->frag_areas[i].height != src->frag_areas[i].height)) + return false; + } + + /* The tiles must be vertically or horizontally adjacent and have the + * compatible width/height. + */ + if (dst->pos.x == src->pos.x) { + if (dst->sysmem_extent.height != src->sysmem_extent.height) + return false; + } else if (dst->pos.y == src->pos.y) { + if (dst->sysmem_extent.width != src->sysmem_extent.width) + return false; + } else { + return false; + } + + if (dst->gmem_extent.width != src->gmem_extent.width || + dst->gmem_extent.height != src->gmem_extent.height) + return false; + + if (!has_abs_bin_mask) { + /* The mask of the combined tile has to fit in 16 bits */ + uint32_t hw_mask = slot_mask >> (ffs(slot_mask) - 1); + if ((hw_mask & 0xffff) != hw_mask) + return false; + } + + /* Note, this assumes that dst is below or to the right of src, which is + * how we call this function below. + */ + VkExtent2D extent = { + dst->sysmem_extent.width + (dst->pos.x - src->pos.x), + dst->sysmem_extent.height + (dst->pos.y - src->pos.y), + }; + + assert(dst->sysmem_extent.height > 0); + + /* If only the first view is visible in both tiles, we can reuse the GMEM + * space meant for the rest of the views to multiply the height of the + * tile. We can't do this if we can't override the scissor for different + * views though. + */ + unsigned height_multiplier = 1; + if (visible_views == 1 && views > 1 && dst->gmem_extent.height == 1 && + !shared_viewport) + height_multiplier = views; + else + height_multiplier = dst->gmem_extent.height; + + /* The combined fragment areas must not be smaller than the combined bin + * extent, so that the combined bin is not larger than the original + * unscaled bin. 
+ */ + for (unsigned i = 0; i < views; i++) { + if ((dst->visible_views & (1u << i)) && + (dst->frag_areas[i].width < extent.width || + dst->frag_areas[i].height * height_multiplier < extent.height)) + return false; + if ((src->visible_views & (1u << i)) && + (src->frag_areas[i].width < extent.width || + src->frag_areas[i].height * height_multiplier < extent.height)) + return false; + } + + /* Ok, let's combine them. dst is below or to the right of src, so it takes + * src's position. + */ + for (unsigned i = 0; i < views; i++) { + if (src->visible_views & ~dst->visible_views & (1u << i)) + dst->frag_areas[i] = src->frag_areas[i]; + if (((src->visible_views | dst->visible_views) & (1u << i)) && + dst->frag_areas[i].height < extent.height) + dst->gmem_extent.height = height_multiplier; + } + dst->sysmem_extent = extent; + dst->visible_views = visible_views; + dst->pos = src->pos; + dst->slot_mask = slot_mask; + + src->merged_tile = dst; + + return true; +} + +static void +tu_merge_tiles(struct tu_cmd_buffer *cmd, const struct tu_vsc_config *vsc, + struct tu_tile_config *tiles, + uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2) +{ + bool has_abs_mask = + cmd->device->physical_device->info->props.has_abs_bin_mask; + unsigned views = tu_fdm_num_layers(cmd); + bool shared_viewport = cmd->state.rp.shared_viewport; + uint32_t width = vsc->tile_count.width; + + for (uint32_t y = ty1; y < ty2; y++) { + for (uint32_t x = tx1; x < tx2; x++) { + struct tu_tile_config *tile = + &tiles[width * y + x]; + if (tile->visible_views == 0) + continue; + if (x > tx1) { + struct tu_tile_config *prev_x_tile = &tiles[width * y + x - 1]; + try_merge_tiles(tile, prev_x_tile, views, has_abs_mask, + shared_viewport); + } + if (y > ty1) { + unsigned prev_y_idx = width * (y - 1) + x; + struct tu_tile_config *prev_y_tile = &tiles[prev_y_idx]; + + /* We can't merge prev_y_tile into tile if it's already been + * merged horizontally into its neighbor in the previous row. 
+ */ + if (!prev_y_tile->merged_tile) { + try_merge_tiles(tile, prev_y_tile, views, has_abs_mask, + shared_viewport); + } + } + } + } +} + + +struct tu_tile_config * +tu_calc_tile_config(struct tu_cmd_buffer *cmd, const struct tu_vsc_config *vsc, + const struct tu_image_view *fdm, const VkOffset2D *fdm_offsets) +{ + struct tu_tile_config *tiles = (struct tu_tile_config *) + calloc(vsc->tile_count.width * vsc->tile_count.height, + sizeof(struct tu_tile_config)); + + for (uint32_t py = 0; py < vsc->pipe_count.height; py++) { + uint32_t ty1 = py * vsc->pipe0.height; + uint32_t ty2 = MIN2(ty1 + vsc->pipe0.height, vsc->tile_count.height); + for (uint32_t px = 0; px < vsc->pipe_count.width; px++) { + uint32_t tx1 = px * vsc->pipe0.width; + uint32_t tx2 = MIN2(tx1 + vsc->pipe0.width, vsc->tile_count.width); + uint32_t pipe_width = tx2 - tx1; + uint32_t pipe = py * vsc->pipe_count.width + px; + + /* Initialize tiles and sample fragment density map */ + for (uint32_t y = ty1; y < ty2; y++) { + for (uint32_t x = tx1; x < tx2; x++) { + uint32_t tx = x - tx1; + uint32_t ty = y - ty1; + struct tu_tile_config *tile = &tiles[vsc->tile_count.width * y + x]; + + tile->pos = { x, y }; + tile->sysmem_extent = { 1, 1 }; + tile->gmem_extent = { 1, 1 }; + tile->pipe = pipe; + tile->slot_mask = 1u << (pipe_width * ty + tx); + tile->merged_tile = NULL; + tu_calc_bin_visibility(cmd, tile, fdm_offsets); + tu_calc_frag_area(cmd, tile, fdm, fdm_offsets); + } + } + + /* Merge tiles */ + /* TODO: we should also be able to merge tiles when only + * per_view_render_areas is used without FDM. That requires using + * another method to force disable draws since we don't want to force + * the viewport to be re-emitted, like overriding the view mask. It + * would also require disabling stores, and adding patchpoints for + * CmdClearAttachments in secondaries or making it use the view mask. 
+ */ + if (!TU_DEBUG(NO_BIN_MERGING) && + cmd->device->physical_device->info->props.has_bin_mask) { + tu_merge_tiles(cmd, vsc, tiles, tx1, ty1, tx2, ty2); + } + } + } + + return tiles; +} + diff --git a/src/freedreno/vulkan/tu_tile_config.h b/src/freedreno/vulkan/tu_tile_config.h new file mode 100644 index 00000000000..920732320b1 --- /dev/null +++ b/src/freedreno/vulkan/tu_tile_config.h @@ -0,0 +1,46 @@ +/* + * Copyright © 2026 Valve Corporation. + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * SPDX-License-Identifier: MIT + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + */ + +#include "tu_common.h" + +#ifndef TU_TILE_CONFIG_H +#define TU_TILE_CONFIG_H + +struct tu_tile_config { + VkOffset2D pos; + uint32_t pipe; + uint32_t slot_mask; + uint32_t visible_views; + + /* The tile this tile was merged with. */ + struct tu_tile_config *merged_tile; + + /* For merged tiles, the extent in tiles when resolved to system memory. + */ + VkExtent2D sysmem_extent; + + /* For merged tiles, the extent in tiles in GMEM. This can only be more + * than 1 if there is extra free space from an unused view. + */ + VkExtent2D gmem_extent; + + VkExtent2D frag_areas[MAX_VIEWS]; +}; + +struct tu_tile_config * +tu_calc_tile_config(struct tu_cmd_buffer *cmd, const struct tu_vsc_config *vsc, + const struct tu_image_view *fdm, const VkOffset2D *fdm_offsets); + +void +tu_calc_bin_visibility(struct tu_cmd_buffer *cmd, + struct tu_tile_config *tile, + const VkOffset2D *offsets); + +#endif