turnip: input attachment descriptor set rework

Implement GMEM input attachments by using non-bindless texture state which
is emitted at the start of every subpass.

This achieves two things:
* A more Vulkan-like CmdBindDescriptorSets
* Fixes input attachments in secondary command buffers with GMEM

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5446>
Jonathan Marek 2020-06-14 23:10:01 -04:00 committed by Marge Bot
parent 233610f8cf
commit 159a1300ce
6 changed files with 211 additions and 383 deletions
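
The core of the rework, condensed: input attachment descriptors no longer live in a driver-managed bindless descriptor set that gets patched at draw time. Instead, plain (non-bindless) texture descriptors are written into the FS texture state at the start of every subpass, once for each render path. A rough sketch of the recording flow follows; the wrapper name emit_subpass_input_attachments is invented for illustration (in the actual patch these calls sit inline in tu_emit_renderpass_begin() and tu_CmdNextSubpass()), and the snippet is a condensation of the diff below rather than compilable on its own:

static void
emit_subpass_input_attachments(struct tu_cmd_buffer *cmd,
                               const struct tu_subpass *subpass)
{
   struct tu_cs *cs = &cmd->draw_cs;

   /* Both variants are recorded up front; at execute time CP_COND_EXEC
    * skips whichever one doesn't match the chosen render mode. */
   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
   tu_emit_input_attachments(cmd, subpass, true);   /* descriptors patched to GMEM */
   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
   tu_emit_input_attachments(cmd, subpass, false);  /* plain image-view descriptors */
   tu_cond_exec_end(cs);
}

Because everything needed is recorded into draw_cs when the subpass begins, the same stream also works when subpass contents execute from a secondary command buffer, which is what the old draw-time patching got wrong.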

src/freedreno/vulkan/tu_cmd_buffer.c

@@ -707,13 +707,6 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
    case TU_DRAW_STATE_VI_BINNING:
       enable_mask = CP_SET_DRAW_STATE__0_BINNING;
       break;
-   case TU_DRAW_STATE_DESC_SETS_GMEM:
-      enable_mask = CP_SET_DRAW_STATE__0_GMEM;
-      break;
-   case TU_DRAW_STATE_DESC_SETS_SYSMEM:
-      enable_mask = CP_SET_DRAW_STATE__0_BINNING |
-                    CP_SET_DRAW_STATE__0_SYSMEM;
-      break;
    default:
       enable_mask = CP_SET_DRAW_STATE__0_GMEM |
                     CP_SET_DRAW_STATE__0_SYSMEM |
@@ -1263,8 +1256,91 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 }
 
 static void
-tu_emit_load_clear(struct tu_cmd_buffer *cmd,
-                   const VkRenderPassBeginInfo *info)
+tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
+                          const struct tu_subpass *subpass,
+                          bool gmem)
+{
+   /* note: we can probably emit input attachments just once for the whole
+    * renderpass, this would avoid emitting both sysmem/gmem versions
+    *
+    * emit two texture descriptors for each input, as a workaround for
+    * d24s8, which can be sampled as both float (depth) and integer (stencil)
+    * tu_shader lowers uint input attachment loads to use the 2nd descriptor
+    * in the pair
+    * TODO: a smarter workaround
+    */
+
+   if (!subpass->input_count)
+      return;
+
+   struct ts_cs_memory texture;
+   VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
+                                 A6XX_TEX_CONST_DWORDS, &texture);
+   assert(result == VK_SUCCESS);
+
+   for (unsigned i = 0; i < subpass->input_count * 2; i++) {
+      uint32_t a = subpass->input_attachments[i / 2].attachment;
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      struct tu_image_view *iview =
+         cmd->state.framebuffer->attachments[a].attachment;
+      const struct tu_render_pass_attachment *att =
+         &cmd->state.pass->attachments[a];
+      uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
+
+      memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);
+
+      if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
+         /* note this works because spec says fb and input attachments
+          * must use identity swizzle
+          */
+         dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
+            A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
+            A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
+         dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_S8Z24_UINT) |
+            A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
+            A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
+            A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
+            A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
+      }
+
+      if (!gmem)
+         continue;
+
+      /* patched for gmem */
+      dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
+      dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+      dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
+      dst[2] |=
+         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
+         A6XX_TEX_CONST_2_PITCH(cmd->state.tiling_config.tile0.extent.width * att->cpp);
+      dst[3] = 0;
+      dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
+      dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
+      for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
+         dst[i] = 0;
+   }
+
+   struct tu_cs *cs = &cmd->draw_cs;
+
+   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
+   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
+                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
+                  CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
+   tu_cs_emit_qw(cs, texture.iova);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
+   tu_cs_emit_qw(cs, texture.iova);
+
+   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));
+}
+
+static void
+tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
+                         const VkRenderPassBeginInfo *info)
 {
    struct tu_cs *cs = &cmd->draw_cs;
@@ -1280,6 +1356,8 @@ tu_emit_load_clear(struct tu_cmd_buffer *cmd,
    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
       tu_clear_gmem_attachment(cmd, cs, i, info);
 
+   tu_emit_input_attachments(cmd, cmd->state.subpass, true);
+
    tu_cond_exec_end(cs);
 
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
@@ -1287,6 +1365,8 @@ tu_emit_load_clear(struct tu_cmd_buffer *cmd,
    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
       tu_clear_sysmem_attachment(cmd, cs, i, info);
 
+   tu_emit_input_attachments(cmd, cmd->state.subpass, false);
+
    tu_cond_exec_end(cs);
 }
@@ -1343,7 +1423,6 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_sanity_check(cs);
 }
 
-
 static void
 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
@@ -1575,9 +1654,6 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
 
    list_del(&cmd_buffer->pool_link);
 
-   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
-      free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
-
    tu_cs_finish(&cmd_buffer->cs);
    tu_cs_finish(&cmd_buffer->draw_cs);
    tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
@@ -1598,10 +1674,8 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
    tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
    tu_cs_reset(&cmd_buffer->sub_cs);
 
-   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
-      cmd_buffer->descriptors[i].valid = 0;
-      cmd_buffer->descriptors[i].push_dirty = false;
-   }
+   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
+      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
 
    cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
@@ -1829,31 +1903,10 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
       TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
 
       descriptors_state->sets[idx] = set;
-      descriptors_state->valid |= (1u << idx);
-
-      /* Note: the actual input attachment indices come from the shader
-       * itself, so we can't generate the patched versions of these until
-       * draw time when both the pipeline and descriptors are bound and
-       * we're inside the render pass.
-       */
-      unsigned dst_idx = layout->set[idx].input_attachment_start;
-      memcpy(&descriptors_state->input_attachments[dst_idx * A6XX_TEX_CONST_DWORDS],
-             set->dynamic_descriptors,
-             set->layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
 
       for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
-         /* Dynamic buffers come after input attachments in the descriptor set
-          * itself, but due to how the Vulkan descriptor set binding works, we
-          * have to put input attachments and dynamic buffers in separate
-          * buffers in the descriptor_state and then combine them at draw
-          * time. Binding a descriptor set only invalidates the descriptor
-          * sets after it, but if we try to tightly pack the descriptors after
-          * the input attachments then we could corrupt dynamic buffers in the
-          * descriptor set before it, or we'd have to move all the dynamic
-          * buffers over. We just put them into separate buffers to make
-          * binding as well as the later patching of input attachments easy.
-          */
-         unsigned src_idx = j + set->layout->input_attachment_count;
+         /* update the contents of the dynamic descriptor set */
+         unsigned src_idx = j;
          unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
          assert(dyn_idx < dynamicOffsetCount);
@@ -1894,11 +1947,65 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                         MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
       }
    }
 
+   assert(dyn_idx == dynamicOffsetCount);
+
+   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_update_value;
+   uint64_t addr[MAX_SETS + 1] = {};
+   struct tu_cs cs;
+
+   for (uint32_t i = 0; i < MAX_SETS; i++) {
+      struct tu_descriptor_set *set = descriptors_state->sets[i];
+      if (set)
+         addr[i] = set->va | 3;
+   }
+
+   if (layout->dynamic_offset_count) {
+      /* allocate and fill out dynamic descriptor set */
+      struct ts_cs_memory dynamic_desc_set;
+      VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
+                                    A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
+      assert(result == VK_SUCCESS);
+
+      memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
+             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
+      addr[MAX_SETS] = dynamic_desc_set.iova | 3;
+   }
+
+   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
+      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
+      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
+      hlsq_update_value = 0x7c000;
+
-   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE)
-      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
-   else
-      cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_SHADER_CONSTS;
+      cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_SHADER_CONSTS;
+   } else {
+      assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);
+
+      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
+      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
+      hlsq_update_value = 0x3e00;
+
+      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
+   }
+
+   tu_cs_begin_sub_stream(&cmd->sub_cs, 24, &cs);
+
+   tu_cs_emit_pkt4(&cs, sp_bindless_base_reg, 10);
+   tu_cs_emit_array(&cs, (const uint32_t*) addr, 10);
+   tu_cs_emit_pkt4(&cs, hlsq_bindless_base_reg, 10);
+   tu_cs_emit_array(&cs, (const uint32_t*) addr, 10);
+   tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(.dword = hlsq_update_value));
+
+   struct tu_cs_entry ib = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
+   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
+      tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
+      tu_cs_emit_sds_ib(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, ib);
+      cmd->state.desc_sets_ib = ib;
+   } else {
+      /* note: for compute we could emit directly, instead of a CP_INDIRECT
+       * however, the blob uses draw states for compute
+       */
+      tu_cs_emit_ib(&cmd->cs, &ib);
+   }
 }
 
 void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
@@ -2111,7 +2218,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
       cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
 
    /* If the pipeline needs a dynamic descriptor, re-emit descriptor sets */
-   if (pipeline->layout->dynamic_offset_count + pipeline->layout->input_attachment_count)
+   if (pipeline->layout->dynamic_offset_count)
       cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
 
    /* dynamic linewidth state depends pipeline state's gras_su_cntl
@@ -2666,7 +2773,7 @@ tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
       cmd->state.cache.pending_flush_bits;
    cmd->state.renderpass_cache.flush_bits = 0;
 
-   tu_emit_load_clear(cmd, pRenderPassBegin);
+   tu_emit_renderpass_begin(cmd, pRenderPassBegin);
 
    tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
    tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
@@ -2729,12 +2836,16 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
       }
    }
 
+   tu_emit_input_attachments(cmd, cmd->state.subpass, true);
+
    tu_cond_exec_end(cs);
 
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
 
    tu6_emit_sysmem_resolves(cmd, cs, subpass);
 
+   tu_emit_input_attachments(cmd, cmd->state.subpass, false);
+
    tu_cond_exec_end(cs);
 
    /* Handle dependencies for the next subpass */
@@ -2857,14 +2968,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
             descriptors_state->dynamic_descriptors :
             descriptors_state->sets[state->range[i].bindless_base]->mapped_ptr;
          unsigned block = state->range[i].block;
-         /* If the block in the shader here is in the dynamic descriptor set, it
-          * is an index into the dynamic descriptor set which is combined from
-          * dynamic descriptors and input attachments on-the-fly, and we don't
-          * have access to it here. Instead we work backwards to get the index
-          * into dynamic_descriptors.
-          */
-         if (state->range[i].bindless_base == MAX_SETS)
-            block -= pipeline->layout->input_attachment_count;
          uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
          uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
          assert(va);
@@ -2957,143 +3060,6 @@ tu6_emit_vertex_buffers(struct tu_cmd_buffer *cmd,
    return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
 }
 
-static VkResult
-tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
-                         const struct tu_pipeline *pipeline,
-                         VkPipelineBindPoint bind_point,
-                         struct tu_cs_entry *entry,
-                         bool gmem)
-{
-   struct tu_cs *draw_state = &cmd->sub_cs;
-   struct tu_pipeline_layout *layout = pipeline->layout;
-   struct tu_descriptor_state *descriptors_state =
-      tu_get_descriptors_state(cmd, bind_point);
-   const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
-   const uint32_t *input_attachment_idx =
-      pipeline->program.input_attachment_idx;
-   uint32_t num_dynamic_descs = layout->dynamic_offset_count +
-      layout->input_attachment_count;
-   struct ts_cs_memory dynamic_desc_set;
-   VkResult result;
-
-   if (num_dynamic_descs > 0) {
-      /* allocate and fill out dynamic descriptor set */
-      result = tu_cs_alloc(draw_state, num_dynamic_descs,
-                           A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
-      if (result != VK_SUCCESS)
-         return result;
-
-      memcpy(dynamic_desc_set.map, descriptors_state->input_attachments,
-             layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
-
-      if (gmem) {
-         /* Patch input attachments to refer to GMEM instead */
-         for (unsigned i = 0; i < layout->input_attachment_count; i++) {
-            uint32_t *dst =
-               &dynamic_desc_set.map[A6XX_TEX_CONST_DWORDS * i];
-
-            /* The compiler has already laid out input_attachment_idx in the
-             * final order of input attachments, so there's no need to go
-             * through the pipeline layout finding input attachments.
-             */
-            unsigned attachment_idx = input_attachment_idx[i];
-
-            /* It's possible for the pipeline layout to include an input
-             * attachment which doesn't actually exist for the current
-             * subpass. Of course, this is only valid so long as the pipeline
-             * doesn't try to actually load that attachment. Just skip
-             * patching in that scenario to avoid out-of-bounds accesses.
-             */
-            if (attachment_idx >= cmd->state.subpass->input_count)
-               continue;
-
-            uint32_t a = cmd->state.subpass->input_attachments[attachment_idx].attachment;
-            const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
-
-            assert(att->gmem_offset >= 0);
-
-            dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
-            dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
-            dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
-            dst[2] |=
-               A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
-               A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
-            dst[3] = 0;
-            dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
-            dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
-            for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
-               dst[i] = 0;
-
-            if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
-               tu_finishme("patch input attachment pitch for secondary cmd buffer");
-         }
-      }
-
-      memcpy(dynamic_desc_set.map + layout->input_attachment_count * A6XX_TEX_CONST_DWORDS,
-             descriptors_state->dynamic_descriptors,
-             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
-   }
-
-   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg;
-   uint32_t hlsq_update_value;
-   switch (bind_point) {
-   case VK_PIPELINE_BIND_POINT_GRAPHICS:
-      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
-      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
-      hlsq_update_value = 0x7c000;
-      break;
-   case VK_PIPELINE_BIND_POINT_COMPUTE:
-      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
-      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
-      hlsq_update_value = 0x3e00;
-      break;
-   default:
-      unreachable("bad bind point");
-   }
-
-   /* Be careful here to *not* refer to the pipeline, so that if only the
-    * pipeline changes we don't have to emit this again (except if there are
-    * dynamic descriptors in the pipeline layout). This means always emitting
-    * all the valid descriptors, which means that we always have to put the
-    * dynamic descriptor in the driver-only slot at the end
-    */
-   uint32_t num_user_sets = util_last_bit(descriptors_state->valid);
-   uint32_t num_sets = num_user_sets;
-   if (num_dynamic_descs > 0) {
-      num_user_sets = MAX_SETS;
-      num_sets = num_user_sets + 1;
-   }
-
-   unsigned regs[2] = { sp_bindless_base_reg, hlsq_bindless_base_reg };
-
-   struct tu_cs cs;
-   result = tu_cs_begin_sub_stream(draw_state, ARRAY_SIZE(regs) * (1 + num_sets * 2) + 2, &cs);
-   if (result != VK_SUCCESS)
-      return result;
-
-   if (num_sets > 0) {
-      for (unsigned i = 0; i < ARRAY_SIZE(regs); i++) {
-         tu_cs_emit_pkt4(&cs, regs[i], num_sets * 2);
-         for (unsigned j = 0; j < num_user_sets; j++) {
-            if (descriptors_state->valid & (1 << j)) {
-               /* magic | 3 copied from the blob */
-               tu_cs_emit_qw(&cs, descriptors_state->sets[j]->va | 3);
-            } else {
-               tu_cs_emit_qw(&cs, 0 | 3);
-            }
-         }
-         if (num_dynamic_descs > 0) {
-            tu_cs_emit_qw(&cs, dynamic_desc_set.iova | 3);
-         }
-      }
-
-      tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(hlsq_update_value));
-   }
-
-   *entry = tu_cs_end_sub_stream(draw_state, &cs);
-   return VK_SUCCESS;
-}
-
 static void
 tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
@@ -3184,41 +3150,7 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
    if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
       tu6_emit_streamout(cmd, cs);
 
-   /* If there are any any dynamic descriptors, then we may need to re-emit
-    * them after every pipeline change in case the number of input attachments
-    * changes. We also always need to re-emit after a pipeline change if there
-    * are any input attachments, because the input attachment index comes from
-    * the pipeline. Finally, it can also happen that the subpass changes
-    * without the pipeline changing, in which case the GMEM descriptors need
-    * to be patched differently.
-    *
-    * TODO: We could probably be clever and avoid re-emitting state on
-    * pipeline changes if the number of input attachments is always 0. We
-    * could also only re-emit dynamic state.
-    */
    if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
-      bool need_gmem_desc_set = pipeline->layout->input_attachment_count > 0;
-
-      result = tu6_emit_descriptor_sets(cmd, pipeline,
-                                        VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                        &cmd->state.desc_sets_ib, false);
-      if (result != VK_SUCCESS)
-         return result;
-
-      if (need_gmem_desc_set) {
-         cmd->state.desc_sets_sysmem_ib = cmd->state.desc_sets_ib;
-         cmd->state.desc_sets_ib.size = 0;
-
-         result = tu6_emit_descriptor_sets(cmd, pipeline,
-                                           VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                           &cmd->state.desc_sets_gmem_ib, true);
-         if (result != VK_SUCCESS)
-            return result;
-      } else {
-         cmd->state.desc_sets_gmem_ib.size = 0;
-         cmd->state.desc_sets_sysmem_ib.size = 0;
-      }
-
       /* We need to reload the descriptors every time the descriptor sets
        * change. However, the commands we send only depend on the pipeline
        * because the whole point is to cache descriptors which are used by the
@@ -3274,8 +3206,6 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT]);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets_ib);
-      tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_GMEM, cmd->state.desc_sets_gmem_ib);
-      tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_SYSMEM, cmd->state.desc_sets_sysmem_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_PARAMS, vs_params);
@@ -3293,7 +3223,7 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
        */
       uint32_t draw_state_count =
          ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 3 : 0) +
-         ((cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) ? 4 : 0) +
+         ((cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) ? 1 : 0) +
         ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
         1; /* vs_params */
@@ -3304,12 +3234,8 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT]);
      }
-      if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
-         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets_ib);
-         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_GMEM, cmd->state.desc_sets_gmem_ib);
-         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_SYSMEM, cmd->state.desc_sets_sysmem_ib);
+      if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS)
         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
-      }
      if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
      tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_PARAMS, vs_params);
@@ -3641,7 +3567,6 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
    struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
    struct tu_descriptor_state *descriptors_state =
       &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
-   VkResult result;
 
    /* TODO: We could probably flush less if we add a compute_flush_bits
     * bitfield.
@@ -3659,19 +3584,6 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
 
    tu_emit_compute_driver_params(cs, pipeline, info);
 
-   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) {
-      result = tu6_emit_descriptor_sets(cmd, pipeline,
-                                        VK_PIPELINE_BIND_POINT_COMPUTE, &ib,
-                                        false);
-      if (result != VK_SUCCESS) {
-         cmd->record_result = result;
-         return;
-      }
-   }
-
-   if (ib.size)
-      tu_cs_emit_ib(cs, &ib);
-
    if ((cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) &&
        pipeline->load_state.state_ib.size > 0) {
       tu_cs_emit_ib(cs, &pipeline->load_state.state_ib);

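One detail of the tu_CmdBindDescriptorSets() rewrite above worth spelling out: the ten dwords written to each bindless base register block are five 64-bit addresses, so the hardware sees MAX_SETS (four) application sets plus one driver-only slot. A minimal, self-contained sketch of that address-array layout, with names and the "| 3" low-bits convention taken from the diff (the blob driver programs the same value); the helper itself is hypothetical:

#include <stdint.h>

#define MAX_SETS 4   /* follows from the 10-dword packets emitted above */

static void
fill_bindless_bases(uint64_t addr[MAX_SETS + 1],
                    const uint64_t set_va[MAX_SETS],
                    uint64_t dynamic_set_iova)
{
   /* Slots 0..MAX_SETS-1: the application's bound descriptor sets.
    * The low bits carry a size/flags field copied from the blob ("| 3");
    * unbound sets stay 0. */
   for (unsigned i = 0; i < MAX_SETS; i++)
      addr[i] = set_va[i] ? (set_va[i] | 3) : 0;

   /* Final driver-only slot: the dynamic descriptor set built at bind
    * time from dynamic UBO/SSBO descriptors. Input attachments no longer
    * live here. */
   addr[MAX_SETS] = dynamic_set_iova ? (dynamic_set_iova | 3) : 0;
}

Building this array at bind time, rather than at draw time, is what makes the new CmdBindDescriptorSets "more Vulkan-like": the bound sets are snapshotted immediately instead of being re-derived from pipeline state on every draw.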
src/freedreno/vulkan/tu_descriptor_set.c

@@ -84,6 +84,7 @@ descriptor_size(VkDescriptorType type)
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
       /* These are remapped to the special driver-managed descriptor set,
        * hence they don't take up any space in the original descriptor set:
+       * Input attachment doesn't use descriptor sets at all
        */
       return 0;
    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
@@ -175,7 +176,6 @@ tu_CreateDescriptorSetLayout(
           size - sizeof(struct tu_descriptor_set_layout));
 
    uint32_t dynamic_offset_count = 0;
-   uint32_t input_attachment_count = 0;
    uint32_t buffer_count = 0;
 
    for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
@@ -187,7 +187,6 @@ tu_CreateDescriptorSetLayout(
       set_layout->binding[b].offset = set_layout->size;
       set_layout->binding[b].buffer_offset = buffer_count;
       set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count;
-      set_layout->binding[b].input_attachment_offset = input_attachment_count;
       set_layout->binding[b].size = descriptor_size(binding->descriptorType);
       set_layout->binding[b].shader_stages = binding->stageFlags;
@@ -250,15 +249,13 @@ tu_CreateDescriptorSetLayout(
          dynamic_offset_count += binding->descriptorCount;
       }
 
-      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT)
-         input_attachment_count += binding->descriptorCount;
-
       set_layout->shader_stages |= binding->stageFlags;
    }
 
    free(bindings);
 
    set_layout->dynamic_offset_count = dynamic_offset_count;
-   set_layout->input_attachment_count = input_attachment_count;
    set_layout->buffer_count = buffer_count;
 
    *pSetLayout = tu_descriptor_set_layout_to_handle(set_layout);
@@ -364,10 +361,9 @@ tu_CreatePipelineLayout(VkDevice _device,
       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    layout->num_sets = pCreateInfo->setLayoutCount;
-   layout->input_attachment_count = 0;
    layout->dynamic_offset_count = 0;
 
-   unsigned dynamic_offset_count = 0, input_attachment_count = 0;
+   unsigned dynamic_offset_count = 0;
 
    _mesa_sha1_init(&ctx);
    for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
@@ -375,9 +371,7 @@ tu_CreatePipelineLayout(VkDevice _device,
                       pCreateInfo->pSetLayouts[set]);
       layout->set[set].layout = set_layout;
       layout->set[set].dynamic_offset_start = dynamic_offset_count;
-      layout->set[set].input_attachment_start = input_attachment_count;
       dynamic_offset_count += set_layout->dynamic_offset_count;
-      input_attachment_count += set_layout->input_attachment_count;
 
       for (uint32_t b = 0; b < set_layout->binding_count; b++) {
          if (set_layout->binding[b].immutable_samplers_offset)
@@ -392,7 +386,6 @@ tu_CreatePipelineLayout(VkDevice _device,
    }
 
    layout->dynamic_offset_count = dynamic_offset_count;
-   layout->input_attachment_count = input_attachment_count;
    layout->push_constant_size = 0;
 
    for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) {
@@ -445,8 +438,7 @@ tu_descriptor_set_create(struct tu_device *device,
    unsigned dynamic_offset = sizeof(struct tu_descriptor_set) +
       sizeof(struct tu_bo *) * buffer_count;
-   unsigned mem_size = dynamic_offset +
-      A6XX_TEX_CONST_DWORDS * 4 * (layout->dynamic_offset_count +
-      layout->input_attachment_count);;
+   unsigned mem_size = dynamic_offset +
+      A6XX_TEX_CONST_DWORDS * 4 * layout->dynamic_offset_count;
 
    if (pool->host_memory_base) {
       if (pool->host_memory_end - pool->host_memory_ptr < mem_size)
@@ -464,7 +456,7 @@ tu_descriptor_set_create(struct tu_device *device,
 
    memset(set, 0, mem_size);
 
-   if (layout->dynamic_offset_count + layout->input_attachment_count > 0) {
+   if (layout->dynamic_offset_count) {
       set->dynamic_descriptors = (uint32_t *)((uint8_t*)set + dynamic_offset);
    }
@@ -590,7 +582,6 @@ tu_CreateDescriptorPool(VkDevice _device,
          switch(pCreateInfo->pPoolSizes[i].type) {
          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
            dynamic_count += pCreateInfo->pPoolSizes[i].descriptorCount;
         default:
            break;
@@ -903,7 +894,7 @@ tu_update_descriptor_sets(struct tu_device *device,
      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: {
         assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
         unsigned idx = writeset->dstArrayElement + j;
-        idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset;
+        idx += binding_layout->dynamic_offset_offset;
         write_ubo_descriptor(device, cmd_buffer,
                              set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
                              buffer_list, writeset->pBufferInfo + j);
@@ -916,7 +907,7 @@ tu_update_descriptor_sets(struct tu_device *device,
      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
         assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
         unsigned idx = writeset->dstArrayElement + j;
-        idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset;
+        idx += binding_layout->dynamic_offset_offset;
         write_buffer_descriptor(device, cmd_buffer,
                                 set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
                                 buffer_list, writeset->pBufferInfo + j);
@@ -937,15 +928,6 @@ tu_update_descriptor_sets(struct tu_device *device,
                               writeset->descriptorType,
                               writeset->pImageInfo + j);
        break;
-      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
-         unsigned idx = writeset->dstArrayElement + j;
-         idx += binding_layout->input_attachment_offset;
-         write_image_descriptor(device, cmd_buffer,
-                                set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
-                                buffer_list, writeset->descriptorType,
-                                writeset->pImageInfo + j);
-         break;
-      }
      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
        write_combined_image_sampler_descriptor(device, cmd_buffer,
                                                A6XX_TEX_CONST_DWORDS * 4,
@@ -957,6 +939,9 @@ tu_update_descriptor_sets(struct tu_device *device,
      case VK_DESCRIPTOR_TYPE_SAMPLER:
        write_sampler_descriptor(device, ptr, writeset->pImageInfo + j);
        break;
+      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+         /* nothing in descriptor set - framebuffer state is used instead */
+         break;
      default:
        unreachable("unimplemented descriptor type");
        break;
@@ -999,8 +984,6 @@ tu_update_descriptor_sets(struct tu_device *device,
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
            unsigned src_idx = copyset->srcArrayElement + j;
            unsigned dst_idx = copyset->dstArrayElement + j;
-            src_idx += src_set->layout->input_attachment_count;
-            dst_idx += dst_set->layout->input_attachment_count;
            src_idx += src_binding_layout->dynamic_offset_offset;
            dst_idx += dst_binding_layout->dynamic_offset_offset;
@@ -1010,18 +993,6 @@ tu_update_descriptor_sets(struct tu_device *device,
            memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4);
            break;
         }
-         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
-            unsigned src_idx = copyset->srcArrayElement + j;
-            unsigned dst_idx = copyset->dstArrayElement + j;
-            src_idx += src_binding_layout->input_attachment_offset;
-            dst_idx += dst_binding_layout->input_attachment_offset;
-
-            uint32_t *src_dynamic, *dst_dynamic;
-            src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS;
-            dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS;
-            memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4);
-            break;
-         }
         default:
            memcpy(dst_ptr, src_ptr, src_binding_layout->size);
         }
@@ -1099,13 +1070,7 @@ tu_CreateDescriptorUpdateTemplate(
      switch (entry->descriptorType) {
      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-        dst_offset = (set_layout->input_attachment_count +
-                      binding_layout->dynamic_offset_offset +
-                      entry->dstArrayElement) * A6XX_TEX_CONST_DWORDS;
-        dst_stride = A6XX_TEX_CONST_DWORDS;
-        break;
-      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
-        dst_offset = (binding_layout->input_attachment_offset +
+        dst_offset = (binding_layout->dynamic_offset_offset +
                      entry->dstArrayElement) * A6XX_TEX_CONST_DWORDS;
        dst_stride = A6XX_TEX_CONST_DWORDS;
        break;
@@ -1197,16 +1162,11 @@ tu_update_descriptor_set_with_template(
           break;
        case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
        case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+        case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
           write_image_descriptor(device, cmd_buffer, ptr, buffer_list,
                                  templ->entry[i].descriptor_type,
                                  src);
           break;
-        case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
-           write_image_descriptor(device, cmd_buffer,
-                                  set->dynamic_descriptors + dst_offset,
-                                  buffer_list, templ->entry[i].descriptor_type,
-                                  src);
-           break;
        }
        case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
           write_combined_image_sampler_descriptor(device, cmd_buffer,

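The net effect on descriptor updates above: the dynamic-descriptor area of a set now holds only dynamic UBO/SSBO descriptors, so addressing no longer adds input_attachment_count. A minimal sketch of the simplified indexing, using names from the diff (A6XX_TEX_CONST_DWORDS is 16, the dword size of one a6xx descriptor); the helper name is illustrative:

#include <stdint.h>

#define A6XX_TEX_CONST_DWORDS 16

/* Returns where a dynamic buffer descriptor is written within a set's
 * dynamic_descriptors array: just the binding's dynamic_offset_offset
 * plus the destination array element, nothing else. */
static inline uint32_t *
dynamic_desc_ptr(uint32_t *dynamic_descriptors,
                 uint32_t dynamic_offset_offset,
                 uint32_t dst_array_element)
{
   unsigned idx = dst_array_element + dynamic_offset_offset;
   return dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx;
}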
src/freedreno/vulkan/tu_descriptor_set.h

@@ -54,9 +54,6 @@ struct tu_descriptor_set_binding_layout
     */
    uint32_t dynamic_offset_offset;
 
-   /* Index into the array of dynamic input attachment descriptors */
-   uint32_t input_attachment_offset;
-
    /* Offset in the tu_descriptor_set_layout of the immutable samplers, or 0
     * if there are no immutable samplers. */
    uint32_t immutable_samplers_offset;
@@ -86,9 +83,6 @@ struct tu_descriptor_set_layout
    /* Number of dynamic offsets used by this descriptor set */
    uint16_t dynamic_offset_count;
 
-   /* Number of input attachments used by the descriptor set */
-   uint16_t input_attachment_count;
-
    /* A bitfield of which dynamic buffers are ubo's, to make the
    * descriptor-binding-time patching easier.
    */
@@ -110,13 +104,11 @@ struct tu_pipeline_layout
       struct tu_descriptor_set_layout *layout;
       uint32_t size;
       uint32_t dynamic_offset_start;
-      uint32_t input_attachment_start;
   } set[MAX_SETS];
 
   uint32_t num_sets;
   uint32_t push_constant_size;
   uint32_t dynamic_offset_count;
-   uint32_t input_attachment_count;
 
   unsigned char sha1[20];
 };

src/freedreno/vulkan/tu_pipeline.c

@@ -183,8 +183,7 @@ tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
         switch (binding->type) {
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
            base = MAX_SETS;
-            offset = (layout->input_attachment_count +
-                      layout->set[i].dynamic_offset_start +
+            offset = (layout->set[i].dynamic_offset_start +
                      binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
            /* fallthrough */
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
@@ -201,9 +200,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
            }
            break;
         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
-            base = MAX_SETS;
-            offset = (layout->set[i].input_attachment_start +
-                      binding->input_attachment_offset) * A6XX_TEX_CONST_DWORDS;
+            /* nothing - input attachment doesn't use bindless */
+            break;
         case VK_DESCRIPTOR_TYPE_SAMPLER:
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
@@ -217,8 +215,7 @@ tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
         }
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
            base = MAX_SETS;
-            offset = (layout->input_attachment_count +
-                      layout->set[i].dynamic_offset_start +
+            offset = (layout->set[i].dynamic_offset_start +
                      binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
            /* fallthrough */
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
@@ -2055,12 +2052,6 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
      desc_sets |= builder->shaders[i]->active_desc_sets;
   }
   pipeline->active_desc_sets = desc_sets;
-
-   if (builder->shaders[MESA_SHADER_FRAGMENT]) {
-      memcpy(pipeline->program.input_attachment_idx,
-             builder->shaders[MESA_SHADER_FRAGMENT]->attachment_idx,
-             sizeof(pipeline->program.input_attachment_idx));
-   }
 }
 
 static void

src/freedreno/vulkan/tu_private.h

@@ -436,8 +436,6 @@ enum tu_draw_state_group_id
    TU_DRAW_STATE_GS_CONST,
    TU_DRAW_STATE_FS_CONST,
    TU_DRAW_STATE_DESC_SETS,
-   TU_DRAW_STATE_DESC_SETS_GMEM,
-   TU_DRAW_STATE_DESC_SETS_SYSMEM,
    TU_DRAW_STATE_DESC_SETS_LOAD,
    TU_DRAW_STATE_VS_PARAMS,
@@ -630,11 +628,7 @@ tu_get_perftest_option_name(int id);
 struct tu_descriptor_state
 {
    struct tu_descriptor_set *sets[MAX_SETS];
-   uint32_t valid;
-   struct tu_push_descriptor_set push_set;
-   bool push_dirty;
    uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS];
-   uint32_t input_attachments[MAX_RTS * A6XX_TEX_CONST_DWORDS];
 };
 
 struct tu_tile
@@ -821,7 +815,7 @@ struct tu_cmd_state
    struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
    struct tu_cs_entry vertex_buffers_ib;
    struct tu_cs_entry shader_const_ib[MESA_SHADER_STAGES];
-   struct tu_cs_entry desc_sets_ib, desc_sets_gmem_ib, desc_sets_sysmem_ib, desc_sets_load_ib;
+   struct tu_cs_entry desc_sets_ib, desc_sets_load_ib;
 
    /* Stream output buffers */
    struct
@@ -1055,7 +1049,6 @@ struct tu_shader
    struct ir3_shader *ir3_shader;
 
    struct tu_push_constant_range push_consts;
-   unsigned attachment_idx[MAX_RTS];
    uint8_t active_desc_sets;
 };
@@ -1109,7 +1102,6 @@ struct tu_pipeline
      struct tu_cs_entry binning_state_ib;
 
      struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
-      unsigned input_attachment_idx[MAX_RTS];
   } program;
 
   struct

src/freedreno/vulkan/tu_shader.c

@@ -141,8 +141,7 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
       base = layout->set[set].dynamic_offset_start +
-         binding_layout->dynamic_offset_offset +
-         layout->input_attachment_count;
+         binding_layout->dynamic_offset_offset;
      set = MAX_SETS;
      break;
   default:
@@ -177,31 +176,42 @@ build_bindless(nir_builder *b, nir_deref_instr *deref, bool is_sampler,
    const struct tu_descriptor_set_binding_layout *bind_layout =
       &layout->set[set].layout->binding[binding];
 
+   /* input attachments use non bindless workaround */
+   if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) {
+      const struct glsl_type *glsl_type = glsl_without_array(var->type);
+      uint32_t idx = var->data.index * 2;
+
+      b->shader->info.textures_used |=
+         ((1ull << (bind_layout->array_size * 2)) - 1) << (idx * 2);
+
+      /* D24S8 workaround: stencil of D24S8 will be sampled as uint */
+      if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT)
+         idx += 1;
+
+      if (deref->deref_type == nir_deref_type_var)
+         return nir_imm_int(b, idx);
+
+      nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1);
+      return nir_iadd(b, nir_imm_int(b, idx),
+                      nir_imul_imm(b, arr_index, 2));
+   }
+
    shader->active_desc_sets |= 1u << set;
 
    nir_ssa_def *desc_offset;
    unsigned descriptor_stride;
-   if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) {
-      unsigned offset =
-         layout->set[set].input_attachment_start +
-         bind_layout->input_attachment_offset;
-      desc_offset = nir_imm_int(b, offset);
-      set = MAX_SETS;
-      descriptor_stride = 1;
-   } else {
-      unsigned offset = 0;
-      /* Samplers come second in combined image/sampler descriptors, see
-       * write_combined_image_sampler_descriptor().
-       */
-      if (is_sampler && bind_layout->type ==
-          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
-         offset = 1;
-      }
-      desc_offset =
-         nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
-                     offset);
-      descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
-   }
+   unsigned offset = 0;
+   /* Samplers come second in combined image/sampler descriptors, see
+    * write_combined_image_sampler_descriptor().
+    */
+   if (is_sampler && bind_layout->type ==
+       VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
+      offset = 1;
+   }
+   desc_offset =
+      nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
+                  offset);
+   descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
 
    if (deref->deref_type != nir_deref_type_var) {
       assert(deref->deref_type == nir_deref_type_array);
@@ -356,6 +366,10 @@ lower_tex(nir_builder *b, nir_tex_instr *tex,
      nir_instr_rewrite_src(&tex->instr, &tex->src[tex_src_idx].src,
                            nir_src_for_ssa(bindless));
      tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
+
+      /* for the input attachment case: */
+      if (bindless->parent_instr->type != nir_instr_type_intrinsic)
+         tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
   }
 
   return true;
@@ -435,38 +449,6 @@ gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
      align(max, 16) / 16 - tu_shader->push_consts.lo;
 }
 
-/* Gather the InputAttachmentIndex for each input attachment from the NIR
- * shader and organize the info in a way so that draw-time patching is easy.
- */
-static void
-gather_input_attachments(nir_shader *shader, struct tu_shader *tu_shader,
-                         const struct tu_pipeline_layout *layout)
-{
-   nir_foreach_variable(var, &shader->uniforms) {
-      const struct glsl_type *glsl_type = glsl_without_array(var->type);
-
-      if (!glsl_type_is_image(glsl_type))
-         continue;
-
-      enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type);
-
-      const uint32_t set = var->data.descriptor_set;
-      const uint32_t binding = var->data.binding;
-      const struct tu_descriptor_set_binding_layout *bind_layout =
-         &layout->set[set].layout->binding[binding];
-      const uint32_t array_size = bind_layout->array_size;
-
-      if (dim == GLSL_SAMPLER_DIM_SUBPASS ||
-          dim == GLSL_SAMPLER_DIM_SUBPASS_MS) {
-         unsigned offset =
-            layout->set[set].input_attachment_start +
-            bind_layout->input_attachment_offset;
-         for (unsigned i = 0; i < array_size; i++)
-            tu_shader->attachment_idx[offset + i] = var->data.index + i;
-      }
-   }
-}
-
 static bool
 tu_lower_io(nir_shader *shader, struct tu_shader *tu_shader,
            const struct tu_pipeline_layout *layout)
@@ -474,7 +456,6 @@ tu_lower_io(nir_shader *shader, struct tu_shader *tu_shader,
    bool progress = false;
 
    gather_push_constants(shader, tu_shader);
-   gather_input_attachments(shader, tu_shader, layout);
 
    nir_foreach_function(function, shader) {
      if (function->impl)
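
To tie the shader side back to the command-buffer side: build_bindless() above now returns a plain texture-slot index for input attachments (which lower_tex() tags as nir_tex_src_texture_offset instead of a bindless handle), and tu_emit_input_attachments() writes two descriptors per input in that same order. A self-contained sketch of the shared slot convention, with a worked example; the helper name is illustrative, not from the source:

/* Slot convention shared by the producer (tu_cmd_buffer.c) and the
 * consumer (tu_shader.c): two descriptors per input attachment, where
 * the second is patched to S8Z24_UINT so uint (stencil) loads of a
 * D24S8 attachment read the right data. */
static inline unsigned
input_attachment_tex_slot(unsigned attachment_index, bool is_uint_load,
                          unsigned array_index)
{
   return attachment_index * 2 + (is_uint_load ? 1 : 0) + array_index * 2;
}

/* Worked example, subpass with two inputs where input 1 is the stencil
 * aspect of D24S8:
 *   input 0, float load -> slot 0
 *   input 0, uint load  -> slot 1
 *   input 1, float load -> slot 2
 *   input 1, uint load  -> slot 3  (the S8Z24_UINT descriptor)
 */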