tu: Implement VK_QCOM_fragment_density_map_offset

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33500>
Connor Abbott 2025-02-11 12:40:59 -05:00 committed by Marge Bot
parent 7351f8d587
commit 75178c4655
15 changed files with 490 additions and 132 deletions

View file

@ -694,6 +694,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_EXT_map_memory_placed DONE (anv, nvk, radv, tu)
VK_MESA_image_alignment_control DONE (anv, nvk, radv)
VK_EXT_legacy_dithering DONE (anv, tu, vn)
VK_QCOM_fragment_density_map_offset DONE (tu)
Clover OpenCL 1.0 -- all DONE:

View file

@ -1363,6 +1363,22 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
if (!iview->view.is_mutable)
desc[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
/* If FDM offset is used, the last row and column extend beyond the
* framebuffer but are shifted over when storing. Expand the width and
* height to account for that.
*/
if (tu_enable_fdm_offset(cmd)) {
uint32_t width = desc[1] & A6XX_TEX_CONST_1_WIDTH__MASK;
uint32_t height = (desc[1] & A6XX_TEX_CONST_1_HEIGHT__MASK) >>
A6XX_TEX_CONST_1_HEIGHT__SHIFT;
width += cmd->state.tiling->tile0.width;
height += cmd->state.tiling->tile0.height;
desc[1] = (desc[1] & ~(A6XX_TEX_CONST_1_WIDTH__MASK |
A6XX_TEX_CONST_1_HEIGHT__MASK)) |
A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
}
desc[2] =
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
@ -3910,17 +3926,19 @@ static void
fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_sysmem_clear_coords_state *state =
(const struct apply_sysmem_clear_coords_state *)data;
assert(state->view < views);
VkExtent2D frag_area = frag_areas[state->view];
VkRect2D bin = bins[state->view];
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
@ -4182,17 +4200,19 @@ static void
fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_gmem_clear_coords_state *state =
(const struct apply_gmem_clear_coords_state *)data;
assert(state->view < views);
VkExtent2D frag_area = frag_areas[state->view];
VkRect2D bin = bins[state->view];
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
@ -4816,14 +4836,16 @@ static void
fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_load_coords_state *state =
(const struct apply_load_coords_state *)data;
assert(state->view < views);
VkExtent2D frag_area = frag_areas[state->view];
VkRect2D bin = bins[state->view];
assert(bin.extent.width % frag_area.width == 0);
assert(bin.extent.height % frag_area.height == 0);
@ -4831,10 +4853,10 @@ fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
uint32_t scaled_height = bin.extent.height / frag_area.height;
const float coords[] = {
bin.offset.x, bin.offset.y,
bin.offset.x, bin.offset.y,
bin.offset.x + scaled_width, bin.offset.y + scaled_height,
bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
common_bin_offset.x, common_bin_offset.y,
bin.offset.x, bin.offset.y,
common_bin_offset.x + scaled_width, common_bin_offset.y + scaled_height,
bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
};
r3d_coords_raw(cmd, cs, coords);
}
@ -5050,6 +5072,19 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
enum a6xx_format format = fmt.fmt;
fixup_src_format(&src_format, dst_format, &format);
uint32_t src_width = dst_iview->vk.extent.width;
uint32_t src_height = dst_iview->vk.extent.height;
/* With FDM offset, we may blit from an extra row/column of tiles whose
* source coordinates are outside of the attachment. Add an extra tile
* width/height to the size to avoid clipping the source.
*/
if (tu_enable_fdm_offset(cmd)) {
const struct tu_tiling_config *tiling = cmd->state.tiling;
src_width += tiling->tile0.width;
src_height += tiling->tile0.height;
}
tu_cs_emit_regs(cs,
SP_PS_2D_SRC_INFO(CHIP,
.color_format = format,
@ -5063,8 +5098,8 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
.unk22 = 1,
.mutableen = src_iview->view.is_mutable),
SP_PS_2D_SRC_SIZE(CHIP,
.width = dst_iview->vk.extent.width,
.height = dst_iview->vk.extent.height),
.width = src_width,
.height = src_height),
SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
@ -5274,14 +5309,16 @@ static void
fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_store_coords_state *state =
(const struct apply_store_coords_state *)data;
assert(state->view < views);
VkExtent2D frag_area = frag_areas[state->view];
VkRect2D bin = bins[state->view];
/* The bin width/height must be a multiple of the frag_area to make sure
* that the scaling happens correctly. This means there may be some
@ -5299,10 +5336,10 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
.y = bin.offset.y + bin.extent.height - 1));
tu_cs_emit_regs(cs,
A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
A6XX_GRAS_2D_SRC_TL_X(common_bin_offset.x),
A6XX_GRAS_2D_SRC_BR_X(common_bin_offset.x + scaled_width - 1),
A6XX_GRAS_2D_SRC_TL_Y(common_bin_offset.y),
A6XX_GRAS_2D_SRC_BR_Y(common_bin_offset.y + scaled_height - 1));
}
template <chip CHIP>

View file

@ -700,7 +700,8 @@ tu6_emit_render_cntl<A7XX>(struct tu_cmd_buffer *cmd,
}
static void
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align,
bool used_by_sysmem)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const VkRect2D *render_area = &cmd->state.render_area;
@ -727,9 +728,42 @@ tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
}
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
/* With FDM offset, bins are shifted to the right in GMEM space compared to
* framebuffer space. We do not use RB_BLIT_SCISSOR_* for loads and stores
* because those do not use the fast path, but we do use it for
* LOAD_OP_CLEAR. Expand the render area so that GMEM clears work
* correctly. We may over-clear but that's ok because the store is clipped
* to the render area.
*/
if (tu_enable_fdm_offset(cmd)) {
const struct tu_tiling_config *tiling = cmd->state.tiling;
/* If this is a generic clear that's also used in sysmem mode then we
* need to emit the unmodified render area in sysmem mode because
* over-clearing is not allowed.
*/
if (used_by_sysmem) {
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
CP_COND_REG_EXEC_0_GMEM);
}
x2 += tiling->tile0.width;
y2 += tiling->tile0.height;
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
if (used_by_sysmem) {
tu_cond_exec_end(cs);
}
} else {
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
}
}
void
@ -950,12 +984,20 @@ tu6_update_msaa_disable(struct tu_cmd_buffer *cmd)
}
}
static const struct tu_vsc_config *
tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
{
if (tu_enable_fdm_offset(cmd))
return &tiling->fdm_offset_vsc;
return &tiling->vsc;
}
static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
/* XFB commands are emitted for BINNING || SYSMEM, which makes it
* incompatible with non-hw binning GMEM rendering. this is required because
@ -1014,7 +1056,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
return true;
}
const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
/* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
if (cmd->state.rp.xfb_used && !vsc->binning_possible) {
@ -1059,7 +1101,7 @@ static void
tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
uint32_t pipe, uint32_t slot, bool skip_wfm)
{
const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
if (vsc->binning_possible &&
cmd->state.pass->has_cond_load_store) {
@ -1080,16 +1122,48 @@ struct tu_tile_config {
VkExtent2D frag_areas[MAX_VIEWS];
};
/* For bin offsetting we want to do "Euclidean division," where the remainder
* (i.e. the offset of the bin) is always positive. Unfortunately C/C++
* remainder and division don't do this, so we have to implement it ourselves.
*
* For example, we should have:
*
* euclid_rem(-3, 4) = 1
* euclid_rem(-4, 4) = 0
* euclid_rem(-5, 4) = 3
*/
static int32_t
euclid_rem(int32_t divisor, int32_t divisend)
{
if (divisor >= 0)
return divisor % divisend;
int32_t tmp = divisend - (-divisor % divisend);
return tmp == divisend ? 0 : tmp;
}
/* Calculate how much the bins for a given view should be shifted to the left
* and upwards, given the application-provided FDM offset.
*/
static VkOffset2D
tu_bin_offset(VkOffset2D fdm_offset, const struct tu_tiling_config *tiling)
{
return (VkOffset2D) {
euclid_rem(-fdm_offset.x, tiling->tile0.width),
euclid_rem(-fdm_offset.y, tiling->tile0.height),
};
}
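
For illustration, a minimal standalone sketch of the bin-offset arithmetic, assuming a 32x32 tile size and application offsets that are multiples of the offset granularity (the function name and values here are hypothetical, not taken from the change):

#include <assert.h>
#include <stdint.h>

/* Same logic as euclid_rem() above, repeated standalone. */
static int32_t
example_euclid_rem(int32_t divisor, int32_t divisend)
{
   if (divisor >= 0)
      return divisor % divisend;
   int32_t tmp = divisend - (-divisor % divisend);
   return tmp == divisend ? 0 : tmp;
}

int main(void)
{
   /* With 32x32 tiles, an FDM offset of (40, 16) slides the bin grid
    * left/up by (euclid_rem(-40, 32), euclid_rem(-16, 32)): only the
    * sub-tile part of the offset is handled by sliding the bins; the
    * whole-tile part is handled later when sampling the density map.
    */
   assert(example_euclid_rem(-40, 32) == 24);
   assert(example_euclid_rem(-16, 32) == 16);
   /* Whole-tile offsets leave the bin grid where it started. */
   assert(example_euclid_rem(-32, 32) == 0);
   assert(example_euclid_rem(-64, 32) == 0);
   return 0;
}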
template <chip CHIP>
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_tile_config *tile,
bool fdm)
bool fdm, const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
bool hw_binning = use_hw_binning(cmd);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
@ -1118,6 +1192,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
const uint32_t x1 = tiling->tile0.width * tile->pos.x;
const uint32_t y1 = tiling->tile0.height * tile->pos.y;
const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE);
const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE);
tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1);
@ -1161,11 +1236,29 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
{ x1, y1 },
{ (x2 - x1) * tile->extent.width, (y2 - y1) * tile->extent.height }
};
VkRect2D bins[views];
for (unsigned i = 0; i < views; i++) {
if (!fdm_offsets || cmd->state.rp.shared_viewport) {
bins[i] = bin;
continue;
}
VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
bins[i].offset.x = MAX2(0, (int32_t)x1 - bin_offset.x);
bins[i].offset.y = MAX2(0, (int32_t)y1 - bin_offset.y);
bins[i].extent.width =
MAX2(MIN2((int32_t)x1 + bin.extent.width - bin_offset.x, MAX_VIEWPORT_SIZE) - bins[i].offset.x, 0);
bins[i].extent.height =
MAX2(MIN2((int32_t)y1 + bin.extent.height - bin_offset.y, MAX_VIEWPORT_SIZE) - bins[i].offset.y, 0);
}
util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
struct tu_fdm_bin_patchpoint, patch) {
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
tu_cs_emit_qw(cs, patch->iova);
patch->apply(cmd, cs, patch->data, bin, views, tile->frag_areas);
patch->apply(cmd, cs, patch->data, (VkOffset2D) { x1, y1 }, views,
tile->frag_areas, bins);
}
/* Make the CP wait until the CP_MEM_WRITE's to the command buffers
@ -1252,7 +1345,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
const struct tu_render_pass *pass = cmd->state.pass;
const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
if (pass->has_fdm)
tu_cs_set_writeable(cs, true);
@ -1261,7 +1354,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
A6XX_CP_SET_MARKER_0_USES_GMEM);
tu6_emit_blit_scissor(cmd, cs, true);
tu6_emit_blit_scissor(cmd, cs, true, false);
struct tu_resolve_group resolve_group = {};
@ -1646,13 +1739,31 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_sanity_check(cs);
}
bool
tu_enable_fdm_offset(struct tu_cmd_buffer *cmd)
{
if (!cmd->state.pass)
return false;
if (!cmd->state.pass->has_fdm)
return false;
unsigned fdm_a = cmd->state.pass->fragment_density_map.attachment;
if (fdm_a == VK_ATTACHMENT_UNUSED)
return TU_DEBUG(FDM_OFFSET);
const struct tu_image_view *fdm = cmd->state.attachments[fdm_a];
return fdm->image->vk.create_flags &
VK_IMAGE_CREATE_FRAGMENT_DENSITY_MAP_OFFSET_BIT_QCOM;
}
static void
update_vsc_pipe(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t num_vsc_pipes)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
tu_cs_emit_regs(cs,
A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width,
@ -1680,7 +1791,7 @@ static void
emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
const uint32_t used_pipe_count =
vsc->pipe_count.width * vsc->pipe_count.height;
@ -1711,36 +1822,70 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
template <chip CHIP>
static void
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = cmd->state.tiling;
/* If this command buffer may be executed multiple times, then
* viewports/scissor states may have been changed by previous executions
* and we need to reset them before executing the binning IB.
* and we need to reset them before executing the binning IB. With FDM
* offset the viewport also needs to be transformed during the binning
* phase.
*/
if (!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
cmd->fdm_bin_patchpoints.size != 0) {
if ((!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) ||
fdm_offsets) && cmd->fdm_bin_patchpoints.size != 0) {
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
VkExtent2D unscaled_frag_areas[num_views];
for (unsigned i = 0; i < num_views; i++)
VkRect2D bins[num_views];
for (unsigned i = 0; i < num_views; i++) {
unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
VkRect2D bin = { { 0, 0 }, { fb->width, fb->height } };
if (fdm_offsets && !cmd->state.rp.shared_viewport) {
/* We need to shift over the viewport and scissor during the
* binning pass to match the shift applied when rendering. The way
* to do this is to make the per-view bin start negative. In the
* actual rendering pass the per-view bin start is also shifted in
* the negative direction, but there the first bin is clipped so its
* start never actually goes negative; here we keep it negative to
* avoid clipping the user scissor to a non-zero common bin start. We
* skip patching load/store below in order to avoid patching loads
* and stores to a crazy negative-offset bin. The parts of the
* framebuffer left or above the origin correspond to the
* non-visible parts of the left or top bins that will be
* discarded. The framebuffer still needs to extend to the
* original bottom and right, to avoid incorrectly clipping the
* user scissor, so we need to add to the width and height to
* compensate.
*/
VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
bins[i] = {
{ -bin_offset.x, -bin_offset.y },
{ fb->width + bin_offset.x, fb->height + bin_offset.y },
};
} else {
bins[i] = { { 0, 0 }, { fb->width, fb->height } };
}
}
util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
struct tu_fdm_bin_patchpoint, patch) {
if (patch->flags & TU_FDM_SKIP_BINNING)
continue;
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
tu_cs_emit_qw(cs, patch->iova);
patch->apply(cmd, cs, patch->data, bin, num_views, unscaled_frag_areas);
patch->apply(cmd, cs, patch->data, (VkOffset2D) {0, 0}, num_views,
unscaled_frag_areas, bins);
}
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}
tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
uint32_t width = fb->width + (fdm_offsets ? tiling->tile0.width : 0);
uint32_t height = fb->height + (fdm_offsets ? tiling->tile0.height : 0);
tu6_emit_window_scissor(cs, 0, 0, width - 1, height - 1);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
@ -1929,6 +2074,22 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
if (!iview->view.is_mutable)
dst[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
/* If FDM offset is used, the last row and column extend beyond the
* framebuffer but are shifted over when storing. Expand the width and
* height to account for that.
*/
if (tu_enable_fdm_offset(cmd)) {
uint32_t width = dst[1] & A6XX_TEX_CONST_1_WIDTH__MASK;
uint32_t height = (dst[1] & A6XX_TEX_CONST_1_HEIGHT__MASK) >>
A6XX_TEX_CONST_1_HEIGHT__SHIFT;
width += cmd->state.tiling->tile0.width;
height += cmd->state.tiling->tile0.height;
dst[1] = (dst[1] & ~(A6XX_TEX_CONST_1_WIDTH__MASK |
A6XX_TEX_CONST_1_HEIGHT__MASK)) |
A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
}
dst[2] =
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
@ -2177,11 +2338,12 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
struct tu_renderpass_result *autotune_result,
const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
tu_lrz_tiling_begin<CHIP>(cmd, cs);
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
@ -2225,7 +2387,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, cs, true);
tu6_emit_binning_pass<CHIP>(cmd, cs);
tu6_emit_binning_pass<CHIP>(cmd, cs, fdm_offsets);
if (CHIP == A6XX) {
tu_cs_emit_regs(cs,
@ -2270,9 +2432,9 @@ template <chip CHIP>
static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_tile_config *tile,
bool fdm)
bool fdm, const VkOffset2D *fdm_offsets)
{
tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm);
tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm, fdm_offsets);
tu_lrz_before_tile<CHIP>(cmd, &cmd->cs);
trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
@ -2338,7 +2500,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
static void
tu_calc_frag_area(struct tu_cmd_buffer *cmd,
struct tu_tile_config *tile,
const struct tu_image_view *fdm)
const struct tu_image_view *fdm,
const VkOffset2D *fdm_offsets)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
const uint32_t x1 = tiling->tile0.width * tile->pos.x;
@ -2351,11 +2514,71 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
const struct tu_framebuffer *fb = cmd->state.framebuffer;
struct tu_frag_area raw_areas[views];
if (fdm) {
tu_fragment_density_map_sample(fdm,
(x1 + MIN2(x2, fb->width)) / 2,
(y1 + MIN2(y2, fb->height)) / 2,
fb->width, fb->height, views,
raw_areas);
for (unsigned i = 0; i < views; i++) {
VkOffset2D sample_pos = { 0, 0 };
/* Offsets less than a tile size are accomplished by sliding the
* tiles. However once we shift a whole tile size then we reset the
* tiles back to where they were at the beginning and we need to
* adjust where each bin is sampling from:
*
* x offset = 0:
*
* ------------------------------------
* | * | * | * | (unused) |
* ------------------------------------
*
* x offset = 4:
*
* -------------------------
* | * | * | * | * |
* -------------------------
*
* x offset = 8:
*
* ------------------------------------
* | * | * | * | (unused) |
* ------------------------------------
*
* As the user's offset increases we slide the tiles to the right,
* until we reach the whole tile size and reset the tile positions.
* tu_bin_offset() returns an amount to shift to the left, negating
* the offset.
*
* If we were forced to use a shared viewport, then we must not shift
* over the tiles and instead must only shift when sampling because
* we cannot shift the tiles differently per view. This disables
* smooth transitions of the fragment density map and effectively
* negates the extension.
*
* Note that we cannot clamp x2/y2 to the framebuffer size, as we
* normally would do, because then tiles along the edge would
* incorrectly nudge the sample_pos towards the center of the
* framebuffer. If we shift one complete tile over towards the
* center and reset the tiles as above, the sample_pos would
* then shift back towards the edge and we could get a "pop" from
* suddenly changing density due to the slight shift.
*/
if (fdm_offsets) {
VkOffset2D offset = fdm_offsets[i];
if (!cmd->state.rp.shared_viewport) {
VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
offset.x += bin_offset.x;
offset.y += bin_offset.y;
}
sample_pos.x = (x1 + x2) / 2 - offset.x;
sample_pos.y = (y1 + y2) / 2 - offset.y;
} else {
sample_pos.x = (x1 + MIN2(x2, fb->width)) / 2;
sample_pos.y = (y1 + MIN2(y2, fb->height)) / 2;
}
tu_fragment_density_map_sample(fdm,
sample_pos.x,
sample_pos.y,
fb->width, fb->height, i,
&raw_areas[i]);
}
} else {
for (unsigned i = 0; i < views; i++)
raw_areas[i].width = raw_areas[i].height = 1.0f;
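
As a worked example with illustrative numbers: with 32-wide tiles and a per-view x offset of 8, tu_bin_offset() gives a bin_offset.x of 24, so a bin whose unshifted range is [64, 96) is shifted to cover [40, 72); its center sits at 56, and subtracting the application offset of 8 places the sample at 48, which matches what the code computes as (x1 + x2) / 2 - (8 + 24) = 80 - 32 = 48.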
@ -2388,10 +2611,24 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
width = 1u << util_logbase2(width);
height = 1u << util_logbase2(height);
/* When FDM offset is enabled, the fragment area has to divide the
* offset to make sure that we don't have tiles with partial fragments.
* It would be bad to have the fragment area change as a function of the
* offset, because we'd get "popping" as the resolution changes with the
* offset, so just make sure it divides the offset granularity. This
* should mean it always divides the offset for any possible offset.
*/
if (fdm_offsets) {
width = MIN2(width, TU_FDM_OFFSET_GRANULARITY);
height = MIN2(height, TU_FDM_OFFSET_GRANULARITY);
}
/* Make sure that the width/height divides the tile width/height so
* we don't have to do extra awkward clamping of the edges of each
* bin when resolving. Note that because the tile width is rounded to
* a multiple of 32 any power of two 32 or less will work.
* bin when resolving. It also has to divide the fdm offset, if any.
* Note that because the tile width is rounded to a multiple of 32 any
* power of two 32 or less will work, and if there is an offset then it
* must be a multiple of 4 so 2 or 4 will definitely work.
*
* TODO: Try to take advantage of the total area allowance here, too.
*/
@ -2486,7 +2723,8 @@ template <chip CHIP>
void
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
const struct tu_image_view *fdm)
const struct tu_image_view *fdm,
const VkOffset2D *fdm_offsets)
{
uint32_t width = tx2 - tx1;
uint32_t height = ty2 - ty1;
@ -2505,7 +2743,7 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
tile->extent = { 1, 1 };
tile->pipe = pipe;
tile->slot_mask = 1u << (width * y + x);
tu_calc_frag_area(cmd, tile, fdm);
tu_calc_frag_area(cmd, tile, fdm, fdm_offsets);
}
}
@ -2549,7 +2787,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
if (merged_tiles & (1u << tile_idx))
continue;
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx], true);
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx],
true, fdm_offsets);
}
}
}
@ -2557,10 +2796,11 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result *autotune_result)
struct tu_renderpass_result *autotune_result,
const VkOffset2D *fdm_offsets)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
const struct tu_image_view *fdm = NULL;
if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
@ -2571,6 +2811,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
bool merge_tiles = has_fdm && !TU_DEBUG(NO_BIN_MERGING) &&
cmd->device->physical_device->info->a6xx.has_bin_mask;
/* If not using FDM make sure not to accidentally apply the offsets */
if (!has_fdm)
fdm_offsets = NULL;
/* Create gmem stores now (at EndRenderPass time) because they needed to
* know whether to allow their conditional execution, which was tied to a
* state that was known only at the end of the renderpass. They will be
@ -2582,7 +2826,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
/* Note: we reverse the order of walking the pipes and tiles on every
* other row, to improve texture cache locality compared to raster order.
@ -2602,7 +2846,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
uint32_t ty2 = MIN2(ty1 + vsc->pipe0.height, vsc->tile_count.height);
if (merge_tiles) {
tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm);
tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
fdm_offsets);
continue;
}
@ -2623,9 +2868,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
.extent = { 1, 1 },
};
if (has_fdm)
tu_calc_frag_area(cmd, &tile, fdm);
tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm);
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm,
fdm_offsets);
}
slot_row += tile_row_stride;
}
@ -2676,7 +2922,8 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
template <chip CHIP>
void
tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
const VkOffset2D *fdm_offsets)
{
if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);
@ -2685,7 +2932,7 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
if (use_sysmem_rendering(cmd_buffer, &autotune_result))
tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
else
tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result);
tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
/* Outside of renderpasses we assume all draw states are disabled. We do
* this outside the draw CS for the normal case where 3d gmem stores aren't
@ -4771,7 +5018,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
*/
tu_restore_suspended_pass(cmd, cmd);
TU_CALLX(cmd->device, tu_cmd_render)(cmd);
TU_CALLX(cmd->device, tu_cmd_render)(cmd, NULL);
if (cmd->state.suspend_resume == SR_IN_CHAIN)
cmd->state.suspend_resume = SR_NONE;
else
@ -4877,7 +5124,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
{
struct tu_cs *cs = &cmd->draw_cs;
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
/* If we might choose to bin, then put the loads under a check for geometry
* having been binned to this tile. If we don't choose to bin in the end,
@ -4902,7 +5149,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
if ((att->load || att->load_stencil) && att->first_subpass_idx == subpass_idx) {
if (!emitted_scissor) {
tu6_emit_blit_scissor(cmd, cs, true);
tu6_emit_blit_scissor(cmd, cs, true, false);
emitted_scissor = true;
}
tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, i,
@ -4918,7 +5165,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
&cmd->state.pass->attachments[i];
if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
if (!emitted_scissor) {
tu6_emit_blit_scissor(cmd, cs, false);
tu6_emit_blit_scissor(cmd, cs, false, false);
emitted_scissor = true;
}
tu_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, i);
@ -4969,7 +5216,7 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resol
&cmd->state.pass->attachments[i];
if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
if (!emitted_scissor) {
tu6_emit_blit_scissor(cmd, cs, false);
tu6_emit_blit_scissor(cmd, cs, false, true);
emitted_scissor = true;
}
tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
@ -5432,7 +5679,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
if (subpass->resolve_attachments) {
tu6_emit_blit_scissor(cmd, cs, true);
tu6_emit_blit_scissor(cmd, cs, true, false);
struct tu_resolve_group resolve_group = {};
@ -5908,9 +6155,10 @@ static void
fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_fs_params_state *state =
(const struct apply_fs_params_state *)data;
@ -5919,7 +6167,8 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
for (unsigned i = 0; i < num_consts; i++) {
assert(i < views);
VkExtent2D area = frag_areas[i];
VkOffset2D offset = tu_fdm_per_bin_offset(area, bin);
VkRect2D bin = bins[i];
VkOffset2D offset = tu_fdm_per_bin_offset(area, bin, common_bin_offset);
tu_cs_emit(cs, area.width);
tu_cs_emit(cs, area.height);
@ -7443,9 +7692,25 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
return;
}
const VkSubpassFragmentDensityMapOffsetEndInfoQCOM *fdm_offset_info =
vk_find_struct_const(pSubpassEndInfo->pNext,
SUBPASS_FRAGMENT_DENSITY_MAP_OFFSET_END_INFO_QCOM);
const VkOffset2D *fdm_offsets =
(fdm_offset_info && fdm_offset_info->fragmentDensityOffsetCount > 0) ?
fdm_offset_info->pFragmentDensityOffsets : NULL;
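
For reference, a minimal sketch of how an application could chain these offsets into vkCmdEndRenderPass2 (cmd_buf and the (64, 64) offset are illustrative; offsets must be multiples of the reported fragmentDensityOffsetGranularity, and the FDM image must have been created with VK_IMAGE_CREATE_FRAGMENT_DENSITY_MAP_OFFSET_BIT_QCOM):

#include <vulkan/vulkan.h>

void end_pass_with_fdm_offset(VkCommandBuffer cmd_buf)
{
   /* One offset per layer/view; (64, 64) is a multiple of the (8, 8)
    * granularity reported by the driver. */
   static const VkOffset2D offsets[] = { { 64, 64 } };

   const VkSubpassFragmentDensityMapOffsetEndInfoQCOM offset_info = {
      .sType = VK_STRUCTURE_TYPE_SUBPASS_FRAGMENT_DENSITY_MAP_OFFSET_END_INFO_QCOM,
      .fragmentDensityOffsetCount = 1,
      .pFragmentDensityOffsets = offsets,
   };
   const VkSubpassEndInfo end_info = {
      .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
      .pNext = &offset_info,
   };

   vkCmdEndRenderPass2(cmd_buf, &end_info);
}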
VkOffset2D test_offsets[MAX_VIEWS];
if (TU_DEBUG(FDM) && TU_DEBUG(FDM_OFFSET)) {
for (unsigned i = 0;
i < MAX2(cmd_buffer->state.pass->num_views, 1); i++) {
test_offsets[i] = { 64, 64 };
}
fdm_offsets = test_offsets;
}
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
cmd_buffer->state.cache.pending_flush_bits |=
cmd_buffer->state.renderpass_cache.pending_flush_bits;
@ -7483,7 +7748,16 @@ tu_CmdEndRendering(VkCommandBuffer commandBuffer)
*/
tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
} else {
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
VkOffset2D test_offsets[MAX_VIEWS];
const VkOffset2D *fdm_offsets = NULL;
if (TU_DEBUG(FDM) && TU_DEBUG(FDM_OFFSET)) {
for (unsigned i = 0;
i < MAX2(cmd_buffer->state.pass->num_views, 1); i++) {
test_offsets[i] = { 64, 64 };
}
fdm_offsets = test_offsets;
}
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
}
tu_reset_render_pass(cmd_buffer);

View file

@ -695,7 +695,7 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void tu_cmd_render(struct tu_cmd_buffer *cmd, const VkOffset2D *fdm_offsets);
void tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
uint32_t x, uint32_t y, uint32_t z);
@ -748,12 +748,15 @@ void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void tu6_apply_depth_bounds_workaround(struct tu_device *device,
uint32_t *rb_depth_cntl);
bool tu_enable_fdm_offset(struct tu_cmd_buffer *cmd);
typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas);
const VkExtent2D *frag_areas,
const VkRect2D *bins);
enum tu_fdm_flags {
TU_FDM_NONE = 0,
@ -807,13 +810,15 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
*/
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
VkExtent2D unscaled_frag_areas[num_views];
VkRect2D bins[num_views];
for (unsigned i = 0; i < num_views; i++) {
unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
}
apply(cmd, cs, state, (VkRect2D) {
bins[i] = (VkRect2D) {
{ 0, 0 },
{ MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
}, num_views, unscaled_frag_areas);
};
}
apply(cmd, cs, state, (VkOffset2D) {0, 0}, num_views, unscaled_frag_areas, bins);
assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));
util_dynarray_append(&cmd->fdm_bin_patchpoints,

View file

@ -138,6 +138,18 @@
#define MAX_FDM_TEXEL_SIZE_LOG2 10
#define MAX_FDM_TEXEL_SIZE (1u << MAX_FDM_TEXEL_SIZE_LOG2)
/* This granularity is arbitrary, but there are two competing concerns here:
*
* - The fragment area has to always divide the offset, and we don't want the
* fragment area changing with the offset, so we have to clamp the fragment
* area to this granularity. Therefore larger granularities lead to lower
* minimum resolution.
* - The larger the offset granularity, the choppier the motion is.
*
* Choose 8 as a compromise between the two.
*/
#define TU_FDM_OFFSET_GRANULARITY 8
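
With this granularity, any per-view offset component is a multiple of 8, so clamping the per-bin fragment area to at most 8x8 (as done in tu_calc_frag_area()) is enough to guarantee the area always divides the offset; the tradeoff is that the fragment size cannot grow past 8x8 pixels while an offset may be in use.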
#define TU_GENX(FUNC_NAME) FD_GENX(FUNC_NAME)
#define TU_CALLX(device, thing) FD_CALLX((device)->physical_device->info, thing)

View file

@ -333,6 +333,7 @@ get_device_extensions(const struct tu_physical_device *device,
.GOOGLE_user_type = true,
.IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
.NV_compute_shader_derivatives = device->info->chip >= 7,
.QCOM_fragment_density_map_offset = true,
.VALVE_mutable_descriptor_type = true,
} };
@ -747,6 +748,9 @@ tu_get_features(struct tu_physical_device *pdevice,
/* VK_KHR_subgroup_rotate */
features->shaderSubgroupRotate = true;
features->shaderSubgroupRotateClustered = true;
/* VK_QCOM_fragment_density_map_offset */
features->fragmentDensityMapOffset = true;
}
static void
@ -1385,6 +1389,11 @@ tu_get_properties(struct tu_physical_device *pdevice,
props->degenerateLinesRasterized = false;
props->fullyCoveredFragmentShaderInputVariable = false;
props->conservativeRasterizationPostDepthCoverage = false;
/* VK_QCOM_fragment_density_map_offset */
props->fragmentDensityOffsetGranularity = (VkExtent2D) {
TU_FDM_OFFSET_GRANULARITY, TU_FDM_OFFSET_GRANULARITY
};
}
static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {

View file

@ -488,7 +488,7 @@ struct tu_tiling_config {
/* Whether using GMEM is even possible with this configuration */
bool possible;
struct tu_vsc_config vsc;
struct tu_vsc_config vsc, fdm_offset_vsc;
};
struct tu_framebuffer

View file

@ -152,7 +152,7 @@ tu_insert_dynamic_cmdbufs(struct tu_device *dev,
old_cmds[i]->pre_chain.trace_renderpass_end);
}
TU_CALLX(dev, tu_cmd_render)(cmd_buffer);
TU_CALLX(dev, tu_cmd_render)(cmd_buffer, NULL);
tu_cs_emit_pkt7(&cmd_buffer->cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(&cmd_buffer->cs,

View file

@ -1163,10 +1163,10 @@ tu_DestroyImageView(VkDevice _device,
*/
void
tu_fragment_density_map_sample(const struct tu_image_view *fdm,
uint32_t x, uint32_t y,
int32_t x, int32_t y,
uint32_t width, uint32_t height,
uint32_t layers,
struct tu_frag_area *areas)
uint32_t layer,
struct tu_frag_area *area)
{
assert(fdm->image->layout[0].tile_mode == TILE6_LINEAR);
@ -1176,20 +1176,19 @@ tu_fragment_density_map_sample(const struct tu_image_view *fdm,
fdm_shift_x = CLAMP(fdm_shift_x, MIN_FDM_TEXEL_SIZE_LOG2, MAX_FDM_TEXEL_SIZE_LOG2);
fdm_shift_y = CLAMP(fdm_shift_y, MIN_FDM_TEXEL_SIZE_LOG2, MAX_FDM_TEXEL_SIZE_LOG2);
uint32_t i = x >> fdm_shift_x;
uint32_t j = y >> fdm_shift_y;
int32_t i = x >> fdm_shift_x;
int32_t j = y >> fdm_shift_y;
i = CLAMP(i, 0, fdm->vk.extent.width - 1);
j = CLAMP(j, 0, fdm->vk.extent.height - 1);
unsigned cpp = fdm->image->layout[0].cpp;
unsigned pitch = fdm->view.pitch;
void *pixel = (char *)fdm->image->map + fdm->view.offset + cpp * i + pitch * j;
for (unsigned i = 0; i < layers; i++) {
float density_src[4], density[4];
util_format_unpack_rgba(fdm->view.format, density_src, pixel, 1);
pipe_swizzle_4f(density, density_src, fdm->swizzle);
areas[i].width = 1.0f / density[0];
areas[i].height = 1.0f / density[1];
pixel = (char *)pixel + fdm->view.layer_size;
}
void *pixel = (char *)fdm->image->map + fdm->view.offset + fdm->view.layer_size * layer + cpp * i + pitch * j;
float density_src[4], density[4];
util_format_unpack_rgba(fdm->view.format, density_src, pixel, 1);
pipe_swizzle_4f(density, density_src, fdm->swizzle);
area->width = 1.0f / density[0];
area->height = 1.0f / density[1];
}

View file

@ -129,9 +129,9 @@ struct tu_frag_area {
void
tu_fragment_density_map_sample(const struct tu_image_view *fdm,
uint32_t x, uint32_t y,
int32_t x, int32_t y,
uint32_t width, uint32_t height,
uint32_t layers, struct tu_frag_area *areas);
uint32_t layer, struct tu_frag_area *area);
VkResult
tu_image_update_layout(struct tu_device *device, struct tu_image *image,

View file

@ -157,7 +157,7 @@ tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct A6XX_GRAS_LRZ_CNTL cntl)
{
if (CHIP >= A7XX) {
// A7XX split LRZ_CNTL into two seperate registers.
/* A7XX split LRZ_CNTL into two separate registers. */
struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
.disable_on_wrong_dir = cntl.disable_on_wrong_dir,
.fc_enable = cntl.fc_enable,

View file

@ -2548,44 +2548,49 @@ struct apply_viewport_state {
bool share_scale;
};
/* It's a hardware restriction that the window offset (i.e. bin.offset) must
* be the same for all views. This means that GMEM coordinates cannot be a
* simple scaling of framebuffer coordinates, because this would require us to
* scale the window offset and the scale may be different per view. Instead we
* have to apply a per-bin offset to the GMEM coordinate transform to make
* sure that the window offset maps to itself. Specifically we need an offset
* o to the transform:
/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
* must be the same for all views. This means that GMEM coordinates cannot be
* a simple scaling of framebuffer coordinates, because this would require us
* to scale the window offset and the scale may be different per view. Instead
* we have to apply a per-bin offset to the GMEM coordinate transform to make
* sure that the window offset maps to the per-view bin coordinate, which will
* be the same if there is no offset. Specifically we need an offset o to the
* transform:
*
* x' = s * x + o
*
* so that when we plug in the bin start b_s:
* so that when we plug in the per-view bin start b_s and the common window
* offset b_cs:
*
* b_s = s * b_s + o
* b_cs = s * b_s + o
*
* and we get:
*
* o = b_s - s * b_s
* o = b_cs - s * b_s
*
* We use this form exactly, because we know the bin offset is a multiple of
* We use this form exactly, because we know the bin start is a multiple of
* the frag area so s * b_s is an integer and we can compute an exact result
* easily.
* easily. We also have to make sure that the bin offset is a multiple of the
* frag area by restricting the frag area.
*/
VkOffset2D
tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
VkOffset2D common_bin_offset)
{
assert(bin.offset.x % frag_area.width == 0);
assert(bin.offset.y % frag_area.height == 0);
return (VkOffset2D) {
bin.offset.x - bin.offset.x / frag_area.width,
bin.offset.y - bin.offset.y / frag_area.height
common_bin_offset.x - bin.offset.x / frag_area.width,
common_bin_offset.y - bin.offset.y / frag_area.height
};
}
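
A worked example with illustrative numbers: with 32x32 tiles, a per-view frag_area of 2x2, and an application x offset of 8, tu_bin_offset() gives 24, so a bin whose common window offset is b_cs = 64 has a per-view start of b_s = 64 - 24 = 40. The per-bin offset is then o = 64 - 40 / 2 = 44, and the transform maps the per-view bin start back onto the window offset: 40 / 2 + 44 = 64.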
static void
fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkRect2D bin, unsigned views, const VkExtent2D *frag_areas)
VkOffset2D common_bin_offset, unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
@ -2603,9 +2608,12 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
* replicate it across all viewports.
*/
VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
VkRect2D bin = state->share_scale ? bins[0] : bins[i];
VkViewport viewport =
state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
if (frag_area.width == 1 && frag_area.height == 1) {
if (frag_area.width == 1 && frag_area.height == 1 &&
common_bin_offset.x == bin.offset.x &&
common_bin_offset.y == bin.offset.y) {
vp.viewports[i] = viewport;
continue;
}
@ -2618,7 +2626,8 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
vp.viewports[i].width = viewport.width * scale_x;
vp.viewports[i].height = viewport.height * scale_y;
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
common_bin_offset);
vp.viewports[i].x = scale_x * viewport.x + offset.x;
vp.viewports[i].y = scale_y * viewport.y + offset.y;
@ -2694,7 +2703,8 @@ tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
static void
fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkRect2D bin, unsigned views, const VkExtent2D *frag_areas)
VkOffset2D common_bin_offset, unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
@ -2703,12 +2713,9 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
for (unsigned i = 0; i < vp.scissor_count; i++) {
VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
VkRect2D bin = state->share_scale ? bins[0] : bins[i];
VkRect2D scissor =
state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
if (frag_area.width == 1 && frag_area.height == 1) {
vp.scissors[i] = scissor;
continue;
}
/* Transform the scissor following the viewport. It's unclear how this
* is supposed to handle cases where the scissor isn't aligned to the
@ -2716,7 +2723,8 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
* fragments if the scissor size equals the framebuffer size and it
* isn't aligned to the fragment area.
*/
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
common_bin_offset);
VkOffset2D min = {
scissor.offset.x / frag_area.width + offset.x,
scissor.offset.y / frag_area.height + offset.y,
@ -2731,12 +2739,12 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
*/
uint32_t scaled_width = bin.extent.width / frag_area.width;
uint32_t scaled_height = bin.extent.height / frag_area.height;
vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
vp.scissors[i].offset.x = MAX2(min.x, common_bin_offset.x);
vp.scissors[i].offset.y = MAX2(min.y, common_bin_offset.y);
vp.scissors[i].extent.width =
MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
MIN2(max.x, common_bin_offset.x + scaled_width) - vp.scissors[i].offset.x;
vp.scissors[i].extent.height =
MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
MIN2(max.y, common_bin_offset.y + scaled_height) - vp.scissors[i].offset.y;
}
TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);

View file

@ -243,7 +243,8 @@ TU_DECL_PIPELINE_DOWNCAST(graphics, TU_PIPELINE_GRAPHICS)
TU_DECL_PIPELINE_DOWNCAST(graphics_lib, TU_PIPELINE_GRAPHICS_LIB)
TU_DECL_PIPELINE_DOWNCAST(compute, TU_PIPELINE_COMPUTE)
VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin);
VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
VkOffset2D common_bin_offset);
template <chip CHIP>
uint32_t tu_emit_draw_state(struct tu_cmd_buffer *cmd);

View file

@ -49,6 +49,7 @@ static const struct debug_control tu_debug_options[] = {
{ "dumpas", TU_DEBUG_DUMPAS },
{ "nobinmerging", TU_DEBUG_NO_BIN_MERGING },
{ "perfcraw", TU_DEBUG_PERFCRAW },
{ "fdmoffset", TU_DEBUG_FDM_OFFSET },
{ NULL, 0 }
};
@ -454,6 +455,16 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
tu_tiling_config_update_pipe_layout(vsc, device, pass->has_fdm);
tu_tiling_config_update_pipes(vsc, device);
tu_tiling_config_update_binning(vsc, device);
if (pass->has_fdm) {
struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
fdm_offset_vsc->tile_count = (VkExtent2D) {
vsc->tile_count.width + 1, vsc->tile_count.height + 1
};
tu_tiling_config_update_pipe_layout(fdm_offset_vsc, device, true);
tu_tiling_config_update_pipes(fdm_offset_vsc, device);
tu_tiling_config_update_binning(fdm_offset_vsc, device);
}
}
}
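
The separate fdm_offset_vsc configuration reserves one extra row and column of tiles: when the bin grid slides by a sub-tile amount, the last row and column extend past the framebuffer edge, so binning has to account for the additional partially covered tiles.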

View file

@ -69,6 +69,7 @@ enum tu_debug_flags : uint64_t
TU_DEBUG_DUMPAS = BITFIELD64_BIT(28),
TU_DEBUG_NO_BIN_MERGING = BITFIELD64_BIT(29),
TU_DEBUG_PERFCRAW = BITFIELD64_BIT(30),
TU_DEBUG_FDM_OFFSET = BITFIELD64_BIT(31),
};
struct tu_env {