diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 3de6fb42afb..1c9c43f35e9 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -1875,16 +1875,62 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, descriptors_state->sets[idx] = set; descriptors_state->valid |= (1u << idx); + /* Note: the actual input attachment indices come from the shader + * itself, so we can't generate the patched versions of these until + * draw time when both the pipeline and descriptors are bound and + * we're inside the render pass. + */ + unsigned dst_idx = layout->set[idx].input_attachment_start; + memcpy(&descriptors_state->input_attachments[dst_idx * A6XX_TEX_CONST_DWORDS], + set->dynamic_descriptors, + set->layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4); + for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) { - unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start; + /* Dynamic buffers come after input attachments in the descriptor set + * itself, but due to how the Vulkan descriptor set binding works, we + * have to put input attachments and dynamic buffers in separate + * buffers in the descriptor_state and then combine them at draw + * time. Binding a descriptor set only invalidates the descriptor + * sets after it, but if we try to tightly pack the descriptors after + * the input attachments then we could corrupt dynamic buffers in the + * descriptor set before it, or we'd have to move all the dynamic + * buffers over. We just put them into separate buffers to make + * binding as well as the later patching of input attachments easy. + */ + unsigned src_idx = j + set->layout->input_attachment_count; + unsigned dst_idx = j + layout->set[idx].dynamic_offset_start; assert(dyn_idx < dynamicOffsetCount); - descriptors_state->dynamic_buffers[idx] = - set->dynamic_descriptors[j].va + pDynamicOffsets[dyn_idx]; + uint32_t *dst = + &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS]; + uint32_t *src = + &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS]; + uint32_t offset = pDynamicOffsets[dyn_idx]; + + /* Patch the storage/uniform descriptors right away. */ + if (layout->set[idx].layout->dynamic_ubo & (1 << j)) { + /* Note: we can assume here that the addition won't roll over and + * change the SIZE field. 
+ */ + uint64_t va = src[0] | ((uint64_t)src[1] << 32); + va += offset; + dst[0] = va; + dst[1] = va >> 32; + } else { + memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4); + /* Note: A6XX_IBO_5_DEPTH is always 0 */ + uint64_t va = dst[4] | ((uint64_t)dst[5] << 32); + va += offset; + dst[4] = va; + dst[5] = va >> 32; + } } } - cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS; + if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) + cmd_buffer->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS; + else + cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS; } void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, @@ -2316,6 +2362,9 @@ tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, iview->image->bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); } + + /* Flag input attachment descriptors for re-emission if necessary */ + cmd->state.dirty |= TU_CMD_DIRTY_INPUT_ATTACHMENTS; } void @@ -2395,6 +2444,9 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) tu6_emit_mrt(cmd, cmd->state.subpass, cs); tu6_emit_msaa(cs, cmd->state.subpass->samples); tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false); + + /* Flag input attachment descriptors for re-emission if necessary */ + cmd->state.dirty |= TU_CMD_DIRTY_INPUT_ATTACHMENTS; } void @@ -2459,6 +2511,7 @@ struct tu_draw_info #define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM) #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM) +#define ENABLE_NON_GMEM (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_SYSMEM) enum tu_draw_state_group_id { @@ -2472,10 +2525,8 @@ enum tu_draw_state_group_id TU_DRAW_STATE_BLEND, TU_DRAW_STATE_VS_CONST, TU_DRAW_STATE_FS_CONST, - TU_DRAW_STATE_VS_TEX, - TU_DRAW_STATE_FS_TEX_SYSMEM, - TU_DRAW_STATE_FS_TEX_GMEM, - TU_DRAW_STATE_FS_IBO, + TU_DRAW_STATE_DESC_SETS, + TU_DRAW_STATE_DESC_SETS_GMEM, TU_DRAW_STATE_VS_PARAMS, TU_DRAW_STATE_COUNT, @@ -2488,149 +2539,6 @@ struct tu_draw_state_group struct tu_cs_entry ib; }; -const static void * -sampler_ptr(struct tu_descriptor_state *descriptors_state, - const struct tu_descriptor_map *map, unsigned i, - unsigned array_index) -{ - assert(descriptors_state->valid & (1 << map->set[i])); - - struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]]; - assert(map->binding[i] < set->layout->binding_count); - - const struct tu_descriptor_set_binding_layout *layout = - &set->layout->binding[map->binding[i]]; - - if (layout->immutable_samplers_offset) { - const uint32_t *immutable_samplers = - tu_immutable_samplers(set->layout, layout); - - return &immutable_samplers[array_index * A6XX_TEX_SAMP_DWORDS]; - } - - switch (layout->type) { - case VK_DESCRIPTOR_TYPE_SAMPLER: - return &set->mapped_ptr[layout->offset / 4]; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - return &set->mapped_ptr[layout->offset / 4 + A6XX_TEX_CONST_DWORDS + - array_index * (A6XX_TEX_CONST_DWORDS + A6XX_TEX_SAMP_DWORDS)]; - default: - unreachable("unimplemented descriptor type"); - break; - } -} - -static void -write_tex_const(struct tu_cmd_buffer *cmd, - uint32_t *dst, - struct tu_descriptor_state *descriptors_state, - const struct tu_descriptor_map *map, - unsigned i, unsigned array_index, bool is_sysmem) -{ - assert(descriptors_state->valid & (1 << map->set[i])); - - struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]]; - assert(map->binding[i] < set->layout->binding_count); - - const struct 
tu_descriptor_set_binding_layout *layout = - &set->layout->binding[map->binding[i]]; - - switch (layout->type) { - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - memcpy(dst, &set->mapped_ptr[layout->offset / 4 + - array_index * A6XX_TEX_CONST_DWORDS], - A6XX_TEX_CONST_DWORDS * 4); - break; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - memcpy(dst, &set->mapped_ptr[layout->offset / 4 + - array_index * - (A6XX_TEX_CONST_DWORDS + - A6XX_TEX_SAMP_DWORDS)], - A6XX_TEX_CONST_DWORDS * 4); - break; - default: - unreachable("unimplemented descriptor type"); - break; - } - - if (layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT && !is_sysmem) { - const struct tu_tiling_config *tiling = &cmd->state.tiling_config; - uint32_t a = cmd->state.subpass->input_attachments[map->value[i] + - array_index].attachment; - const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; - - assert(att->gmem_offset >= 0); - - dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); - dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); - dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK); - dst[2] |= - A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | - A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp); - dst[3] = 0; - dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset; - dst[5] = A6XX_TEX_CONST_5_DEPTH(1); - for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) - dst[i] = 0; - - if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) - tu_finishme("patch input attachment pitch for secondary cmd buffer"); - } -} - -static void -write_image_ibo(struct tu_cmd_buffer *cmd, - uint32_t *dst, - struct tu_descriptor_state *descriptors_state, - const struct tu_descriptor_map *map, - unsigned i, unsigned array_index) -{ - assert(descriptors_state->valid & (1 << map->set[i])); - - struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]]; - assert(map->binding[i] < set->layout->binding_count); - - const struct tu_descriptor_set_binding_layout *layout = - &set->layout->binding[map->binding[i]]; - - assert(layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); - - memcpy(dst, &set->mapped_ptr[layout->offset / 4 + - (array_index * 2 + 1) * A6XX_TEX_CONST_DWORDS], - A6XX_TEX_CONST_DWORDS * 4); -} - -static uint64_t -buffer_ptr(struct tu_descriptor_state *descriptors_state, - const struct tu_descriptor_map *map, - unsigned i, unsigned array_index) -{ - assert(descriptors_state->valid & (1 << map->set[i])); - - struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]]; - assert(map->binding[i] < set->layout->binding_count); - - const struct tu_descriptor_set_binding_layout *layout = - &set->layout->binding[map->binding[i]]; - - switch (layout->type) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - return descriptors_state->dynamic_buffers[layout->dynamic_offset_offset + - array_index]; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - return (uint64_t) set->mapped_ptr[layout->offset / 4 + array_index * 2 + 1] << 32 | - set->mapped_ptr[layout->offset / 4 + array_index * 2]; - default: - unreachable("unimplemented descriptor type"); - break; - } -} - static inline uint32_t tu6_stage2opcode(gl_shader_stage type) { @@ -2708,21 +2616,24 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, 
debug_assert((size % 16) == 0); debug_assert((offset % 16) == 0); - /* Look through the UBO map to find our UBO index, and get the VA for - * that UBO. + /* Dig out the descriptor from the descriptor state and read the VA from + * it. */ - uint64_t va = 0; - uint32_t ubo_idx = state->range[i].block - 1; - uint32_t ubo_map_base = 0; - for (int j = 0; j < link->ubo_map.num; j++) { - if (ubo_idx >= ubo_map_base && - ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) { - va = buffer_ptr(descriptors_state, &link->ubo_map, j, - ubo_idx - ubo_map_base); - break; - } - ubo_map_base += link->ubo_map.array_size[j]; - } + assert(state->range[i].bindless); + uint32_t *base = state->range[i].bindless_base == MAX_SETS ? + descriptors_state->dynamic_descriptors : + descriptors_state->sets[state->range[i].bindless_base]->mapped_ptr; + unsigned block = state->range[i].block; + /* If the block in the shader here is in the dynamic descriptor set, it + * is an index into the dynamic descriptor set which is combined from + * dynamic descriptors and input attachments on-the-fly, and we don't + * have access to it here. Instead we work backwards to get the index + * into dynamic_descriptors. + */ + if (state->range[i].bindless_base == MAX_SETS) + block -= pipeline->layout->input_attachment_count; + uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS; + uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32); assert(va); tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); @@ -2735,43 +2646,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, } } -static void -tu6_emit_ubos(struct tu_cs *cs, const struct tu_pipeline *pipeline, - struct tu_descriptor_state *descriptors_state, - gl_shader_stage type) -{ - const struct tu_program_descriptor_linkage *link = - &pipeline->program.link[type]; - - uint32_t num = MIN2(link->ubo_map.num_desc, link->const_state.num_ubos); - uint32_t anum = align(num, 2); - - if (!num) - return; - - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (2 * anum)); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(link->const_state.offsets.ubo) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(anum/2)); - tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - - unsigned emitted = 0; - for (unsigned i = 0; emitted < num && i < link->ubo_map.num; i++) { - for (unsigned j = 0; emitted < num && j < link->ubo_map.array_size[i]; j++) { - tu_cs_emit_qw(cs, buffer_ptr(descriptors_state, &link->ubo_map, i, j)); - emitted++; - } - } - - for (; emitted < anum; emitted++) { - tu_cs_emit(cs, 0xffffffff); - tu_cs_emit(cs, 0xffffffff); - } -} - static struct tu_cs_entry tu6_emit_consts(struct tu_cmd_buffer *cmd, const struct tu_pipeline *pipeline, @@ -2782,7 +2656,6 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd, tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? 
*/ tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants); - tu6_emit_ubos(&cs, pipeline, descriptors_state, type); return tu_cs_end_sub_stream(&cmd->sub_cs, &cs); } @@ -2828,225 +2701,138 @@ tu6_emit_vs_params(struct tu_cmd_buffer *cmd, } static VkResult -tu6_emit_textures(struct tu_cmd_buffer *cmd, - const struct tu_pipeline *pipeline, - struct tu_descriptor_state *descriptors_state, - gl_shader_stage type, - struct tu_cs_entry *entry, - bool is_sysmem) +tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd, + const struct tu_pipeline *pipeline, + VkPipelineBindPoint bind_point, + struct tu_cs_entry *entry, + bool gmem) { struct tu_cs *draw_state = &cmd->sub_cs; - const struct tu_program_descriptor_linkage *link = - &pipeline->program.link[type]; + struct tu_pipeline_layout *layout = pipeline->layout; + struct tu_descriptor_state *descriptors_state = + tu_get_descriptors_state(cmd, bind_point); + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + const uint32_t *input_attachment_idx = + pipeline->program.input_attachment_idx; + uint32_t num_dynamic_descs = layout->dynamic_offset_count + + layout->input_attachment_count; + struct ts_cs_memory dynamic_desc_set; VkResult result; - if (link->texture_map.num_desc == 0 && link->sampler_map.num_desc == 0) { - *entry = (struct tu_cs_entry) {}; - return VK_SUCCESS; - } - - /* allocate and fill texture state */ - struct ts_cs_memory tex_const; - result = tu_cs_alloc(draw_state, link->texture_map.num_desc, - A6XX_TEX_CONST_DWORDS, &tex_const); - if (result != VK_SUCCESS) - return result; - - int tex_index = 0; - for (unsigned i = 0; i < link->texture_map.num; i++) { - for (int j = 0; j < link->texture_map.array_size[i]; j++) { - write_tex_const(cmd, - &tex_const.map[A6XX_TEX_CONST_DWORDS * tex_index++], - descriptors_state, &link->texture_map, i, j, - is_sysmem); - } - } - - /* allocate and fill sampler state */ - struct ts_cs_memory tex_samp = { 0 }; - if (link->sampler_map.num_desc) { - result = tu_cs_alloc(draw_state, link->sampler_map.num_desc, - A6XX_TEX_SAMP_DWORDS, &tex_samp); + if (num_dynamic_descs > 0) { + /* allocate and fill out dynamic descriptor set */ + result = tu_cs_alloc(draw_state, num_dynamic_descs, + A6XX_TEX_CONST_DWORDS, &dynamic_desc_set); if (result != VK_SUCCESS) return result; - int sampler_index = 0; - for (unsigned i = 0; i < link->sampler_map.num; i++) { - for (int j = 0; j < link->sampler_map.array_size[i]; j++) { - const uint32_t *sampler = sampler_ptr(descriptors_state, - &link->sampler_map, - i, j); - memcpy(&tex_samp.map[A6XX_TEX_SAMP_DWORDS * sampler_index++], - sampler, A6XX_TEX_SAMP_DWORDS * 4); + memcpy(dynamic_desc_set.map, descriptors_state->input_attachments, + layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4); + + if (gmem) { + /* Patch input attachments to refer to GMEM instead */ + for (unsigned i = 0; i < layout->input_attachment_count; i++) { + uint32_t *dst = + &dynamic_desc_set.map[A6XX_TEX_CONST_DWORDS * i]; + + /* The compiler has already laid out input_attachment_idx in the + * final order of input attachments, so there's no need to go + * through the pipeline layout finding input attachments. + */ + unsigned attachment_idx = input_attachment_idx[i]; + + /* It's possible for the pipeline layout to include an input + * attachment which doesn't actually exist for the current + * subpass. Of course, this is only valid so long as the pipeline + * doesn't try to actually load that attachment. 
Just skip + * patching in that scenario to avoid out-of-bounds accesses. + */ + if (attachment_idx >= cmd->state.subpass->input_count) + continue; + + uint32_t a = cmd->state.subpass->input_attachments[attachment_idx].attachment; + const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; + + assert(att->gmem_offset >= 0); + + dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); + dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); + dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK); + dst[2] |= + A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | + A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp); + dst[3] = 0; + dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset; + dst[5] = A6XX_TEX_CONST_5_DEPTH(1); + for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) + dst[i] = 0; + + if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) + tu_finishme("patch input attachment pitch for secondary cmd buffer"); } } + + memcpy(dynamic_desc_set.map + layout->input_attachment_count * A6XX_TEX_CONST_DWORDS, + descriptors_state->dynamic_descriptors, + layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4); } - unsigned tex_samp_reg, tex_const_reg, tex_count_reg; - enum a6xx_state_block sb; - - switch (type) { - case MESA_SHADER_VERTEX: - sb = SB6_VS_TEX; - tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO; - tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO; - tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT; + uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg; + uint32_t hlsq_update_value; + switch (bind_point) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0); + hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0); + hlsq_update_value = 0x7c000; break; - case MESA_SHADER_FRAGMENT: - sb = SB6_FS_TEX; - tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO; - tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO; - tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT; - break; - case MESA_SHADER_COMPUTE: - sb = SB6_CS_TEX; - tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO; - tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO; - tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT; + case VK_PIPELINE_BIND_POINT_COMPUTE: + sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0); + hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0); + hlsq_update_value = 0x3e00; break; default: - unreachable("bad state block"); + unreachable("bad bind point"); } + /* Be careful here to *not* refer to the pipeline, so that if only the + * pipeline changes we don't have to emit this again (except if there are + * dynamic descriptors in the pipeline layout). 
This means always emitting + * all the valid descriptors, which means that we always have to put the + * dynamic descriptor in the driver-only slot at the end + */ + uint32_t num_user_sets = util_last_bit(descriptors_state->valid); + uint32_t num_sets = num_user_sets; + if (num_dynamic_descs > 0) { + num_user_sets = MAX_SETS; + num_sets = num_user_sets + 1; + } + + unsigned regs[2] = { sp_bindless_base_reg, hlsq_bindless_base_reg }; + struct tu_cs cs; - result = tu_cs_begin_sub_stream(draw_state, 16, &cs); + result = tu_cs_begin_sub_stream(draw_state, ARRAY_SIZE(regs) * (1 + num_sets * 2) + 2, &cs); if (result != VK_SUCCESS) return result; - if (link->sampler_map.num_desc) { - /* output sampler state: */ - tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3); - tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(link->sampler_map.num_desc)); - tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */ - - tu_cs_emit_pkt4(&cs, tex_samp_reg, 2); - tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */ - } - - /* emit texture state: */ - tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3); - tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(link->texture_map.num_desc)); - tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */ - - tu_cs_emit_pkt4(&cs, tex_const_reg, 2); - tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */ - - tu_cs_emit_pkt4(&cs, tex_count_reg, 1); - tu_cs_emit(&cs, link->texture_map.num_desc); - - *entry = tu_cs_end_sub_stream(draw_state, &cs); - return VK_SUCCESS; -} - -static VkResult -tu6_emit_ibo(struct tu_cmd_buffer *cmd, - const struct tu_pipeline *pipeline, - struct tu_descriptor_state *descriptors_state, - gl_shader_stage type, - struct tu_cs_entry *entry) -{ - struct tu_cs *draw_state = &cmd->sub_cs; - const struct tu_program_descriptor_linkage *link = - &pipeline->program.link[type]; - VkResult result; - - unsigned num_desc = link->ssbo_map.num_desc + link->image_map.num_desc; - - if (num_desc == 0) { - *entry = (struct tu_cs_entry) {}; - return VK_SUCCESS; - } - - struct ts_cs_memory ibo_const; - result = tu_cs_alloc(draw_state, num_desc, - A6XX_TEX_CONST_DWORDS, &ibo_const); - if (result != VK_SUCCESS) - return result; - - int ssbo_index = 0; - for (unsigned i = 0; i < link->ssbo_map.num; i++) { - for (int j = 0; j < link->ssbo_map.array_size[i]; j++) { - uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index]; - - uint64_t va = buffer_ptr(descriptors_state, &link->ssbo_map, i, j); - /* We don't expose robustBufferAccess, so leave the size unlimited. 
*/ - uint32_t sz = MAX_STORAGE_BUFFER_RANGE / 4; - - dst[0] = A6XX_IBO_0_FMT(FMT6_32_UINT); - dst[1] = A6XX_IBO_1_WIDTH(sz & MASK(15)) | - A6XX_IBO_1_HEIGHT(sz >> 15); - dst[2] = A6XX_IBO_2_UNK4 | - A6XX_IBO_2_UNK31 | - A6XX_IBO_2_TYPE(A6XX_TEX_1D); - dst[3] = 0; - dst[4] = va; - dst[5] = va >> 32; - for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++) - dst[i] = 0; - - ssbo_index++; + if (num_sets > 0) { + for (unsigned i = 0; i < ARRAY_SIZE(regs); i++) { + tu_cs_emit_pkt4(&cs, regs[i], num_sets * 2); + for (unsigned j = 0; j < num_user_sets; j++) { + if (descriptors_state->valid & (1 << j)) { + /* magic | 3 copied from the blob */ + tu_cs_emit_qw(&cs, descriptors_state->sets[j]->va | 3); + } else { + tu_cs_emit_qw(&cs, 0 | 3); + } + } + if (num_dynamic_descs > 0) { + tu_cs_emit_qw(&cs, dynamic_desc_set.iova | 3); + } } + + tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(hlsq_update_value)); } - for (unsigned i = 0; i < link->image_map.num; i++) { - for (int j = 0; j < link->image_map.array_size[i]; j++) { - uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index]; - - write_image_ibo(cmd, dst, - descriptors_state, &link->image_map, i, j); - - ssbo_index++; - } - } - - assert(ssbo_index == num_desc); - - struct tu_cs cs; - result = tu_cs_begin_sub_stream(draw_state, 7, &cs); - if (result != VK_SUCCESS) - return result; - - uint32_t opcode, ibo_addr_reg; - enum a6xx_state_block sb; - enum a6xx_state_type st; - - switch (type) { - case MESA_SHADER_FRAGMENT: - opcode = CP_LOAD_STATE6; - st = ST6_SHADER; - sb = SB6_IBO; - ibo_addr_reg = REG_A6XX_SP_IBO_LO; - break; - case MESA_SHADER_COMPUTE: - opcode = CP_LOAD_STATE6_FRAG; - st = ST6_IBO; - sb = SB6_CS_SHADER; - ibo_addr_reg = REG_A6XX_SP_CS_IBO_LO; - break; - default: - unreachable("unsupported stage for ibos"); - } - - /* emit texture state: */ - tu_cs_emit_pkt7(&cs, opcode, 3); - tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(st) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(num_desc)); - tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */ - - tu_cs_emit_pkt4(&cs, ibo_addr_reg, 2); - tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */ - *entry = tu_cs_end_sub_stream(draw_state, &cs); return VK_SUCCESS; } @@ -3255,59 +3041,54 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) tu6_emit_streamout(cmd, cs); - if (cmd->state.dirty & - (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) { - struct tu_cs_entry vs_tex, fs_tex_sysmem, fs_tex_gmem, fs_ibo; + /* If there are any any dynamic descriptors, then we may need to re-emit + * them after every pipeline change in case the number of input attachments + * changes. We also always need to re-emit after a pipeline change if there + * are any input attachments, because the input attachment index comes from + * the pipeline. Finally, it can also happen that the subpass changes + * without the pipeline changing, in which case the GMEM descriptors need + * to be patched differently. + * + * TODO: We could probably be clever and avoid re-emitting state on + * pipeline changes if the number of input attachments is always 0. We + * could also only re-emit dynamic state. 
+ */ + if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS || + ((pipeline->layout->dynamic_offset_count + + pipeline->layout->input_attachment_count > 0) && + cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) || + (pipeline->layout->input_attachment_count > 0 && + cmd->state.dirty & TU_CMD_DIRTY_INPUT_ATTACHMENTS)) { + struct tu_cs_entry desc_sets, desc_sets_gmem; + bool need_gmem_desc_set = pipeline->layout->input_attachment_count > 0; - result = tu6_emit_textures(cmd, pipeline, descriptors_state, - MESA_SHADER_VERTEX, &vs_tex, false); - if (result != VK_SUCCESS) - return result; - - /* TODO: we could emit just one texture descriptor draw state when there - * are no input attachments, which is the most common case. We could - * also split out the sampler state, which doesn't change even for input - * attachments. - */ - result = tu6_emit_textures(cmd, pipeline, descriptors_state, - MESA_SHADER_FRAGMENT, &fs_tex_sysmem, true); - if (result != VK_SUCCESS) - return result; - - result = tu6_emit_textures(cmd, pipeline, descriptors_state, - MESA_SHADER_FRAGMENT, &fs_tex_gmem, false); - if (result != VK_SUCCESS) - return result; - - result = tu6_emit_ibo(cmd, pipeline, descriptors_state, - MESA_SHADER_FRAGMENT, &fs_ibo); + result = tu6_emit_descriptor_sets(cmd, pipeline, + VK_PIPELINE_BIND_POINT_GRAPHICS, + &desc_sets, false); if (result != VK_SUCCESS) return result; draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { - .id = TU_DRAW_STATE_VS_TEX, - .enable_mask = ENABLE_ALL, - .ib = vs_tex, - }; - draw_state_groups[draw_state_group_count++] = - (struct tu_draw_state_group) { - .id = TU_DRAW_STATE_FS_TEX_GMEM, - .enable_mask = CP_SET_DRAW_STATE__0_GMEM, - .ib = fs_tex_gmem, - }; - draw_state_groups[draw_state_group_count++] = - (struct tu_draw_state_group) { - .id = TU_DRAW_STATE_FS_TEX_SYSMEM, - .enable_mask = CP_SET_DRAW_STATE__0_SYSMEM, - .ib = fs_tex_sysmem, - }; - draw_state_groups[draw_state_group_count++] = - (struct tu_draw_state_group) { - .id = TU_DRAW_STATE_FS_IBO, - .enable_mask = ENABLE_DRAW, - .ib = fs_ibo, + .id = TU_DRAW_STATE_DESC_SETS, + .enable_mask = need_gmem_desc_set ? ENABLE_NON_GMEM : ENABLE_ALL, + .ib = desc_sets, }; + + if (need_gmem_desc_set) { + result = tu6_emit_descriptor_sets(cmd, pipeline, + VK_PIPELINE_BIND_POINT_GRAPHICS, + &desc_sets_gmem, true); + if (result != VK_SUCCESS) + return result; + + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_DESC_SETS_GMEM, + .enable_mask = CP_SET_DRAW_STATE__0_GMEM, + .ib = desc_sets_gmem, + }; + } } struct tu_cs_entry vs_params; @@ -3356,11 +3137,16 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, unsigned i; for_each_bit(i, descriptors_state->valid) { struct tu_descriptor_set *set = descriptors_state->sets[i]; - for (unsigned j = 0; j < set->layout->buffer_count; ++j) - if (set->descriptors[j]) { - tu_bo_list_add(&cmd->bo_list, set->descriptors[j], + for (unsigned j = 0; j < set->layout->buffer_count; ++j) { + if (set->buffers[j]) { + tu_bo_list_add(&cmd->bo_list, set->buffers[j], MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); } + } + if (set->size > 0) { + tu_bo_list_add(&cmd->bo_list, &set->pool->bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); + } } } if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) { @@ -3373,10 +3159,16 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, } } + /* There are too many graphics dirty bits to list here, so just list the + * bits to preserve instead. The only things not emitted here are + * compute-related state. 
+ */ + cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS; + /* Fragment shader state overwrites compute shader state, so flag the * compute pipeline for re-emit. */ - cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE; + cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE; return VK_SUCCESS; } @@ -3698,42 +3490,43 @@ tu_dispatch(struct tu_cmd_buffer *cmd, tu_emit_compute_driver_params(cs, pipeline, info); - result = tu6_emit_textures(cmd, pipeline, descriptors_state, - MESA_SHADER_COMPUTE, &ib, false); - if (result != VK_SUCCESS) { - cmd->record_result = result; - return; - } + if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) { + result = tu6_emit_descriptor_sets(cmd, pipeline, + VK_PIPELINE_BIND_POINT_COMPUTE, &ib, + false); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - if (ib.size) - tu_cs_emit_ib(cs, &ib); - - result = tu6_emit_ibo(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE, &ib); - if (result != VK_SUCCESS) { - cmd->record_result = result; - return; - } - - if (ib.size) - tu_cs_emit_ib(cs, &ib); - - /* track BOs */ - if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) { + /* track BOs */ unsigned i; for_each_bit(i, descriptors_state->valid) { struct tu_descriptor_set *set = descriptors_state->sets[i]; - for (unsigned j = 0; j < set->layout->buffer_count; ++j) - if (set->descriptors[j]) { - tu_bo_list_add(&cmd->bo_list, set->descriptors[j], + for (unsigned j = 0; j < set->layout->buffer_count; ++j) { + if (set->buffers[j]) { + tu_bo_list_add(&cmd->bo_list, set->buffers[j], MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); } + } + + if (set->size > 0) { + tu_bo_list_add(&cmd->bo_list, &set->pool->bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); + } } } + if (ib.size) + tu_cs_emit_ib(cs, &ib); + + cmd->state.dirty &= + ~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE); + /* Compute shader state overwrites fragment shader state, so we flag the * graphics pipeline for re-emit. */ - cmd->state.dirty = TU_CMD_DIRTY_PIPELINE; + cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE; tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE)); diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c index 08562a3e5bb..de1683c6bd7 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.c +++ b/src/freedreno/vulkan/tu_descriptor_set.c @@ -25,15 +25,14 @@ /** * @file * - * The texture and sampler descriptors are laid out in a single global space - * across all shader stages, for both simplicity of implementation and because - * that seems to be how things have to be structured for border color - * handling. - * - * Each shader stage will declare its texture/sampler count based on the last - * descriptor set it uses. At draw emit time (though it really should be - * CmdBind time), we upload the descriptor sets used by each shader stage to - * their stage. + * We use the bindless descriptor model, which maps fairly closely to how + * Vulkan descriptor sets work. The two exceptions are input attachments and + * dynamic descriptors, which have to be patched when recording command + * buffers. We reserve an extra descriptor set for these. This descriptor set + * contains all the input attachments in the pipeline, in order, and then all + * the dynamic descriptors. The dynamic descriptors are stored in the CPU-side + * datastructure for each tu_descriptor_set, and then combined into one big + * descriptor set at CmdBindDescriptors time/draw time. 
*/ #include "tu_private.h" @@ -77,32 +76,27 @@ create_sorted_bindings(const VkDescriptorSetLayoutBinding *bindings, } static uint32_t -descriptor_size(enum VkDescriptorType type) +descriptor_size(VkDescriptorType type) { switch (type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - return 0; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - /* 64bit pointer */ - return 8; - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - return A6XX_TEX_CONST_DWORDS * 4; - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - /* We may need the IBO or the TEX representation, or both. */ - return A6XX_TEX_CONST_DWORDS * 4 * 2; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - /* texture const + texture sampler */ - return (A6XX_TEX_CONST_DWORDS + A6XX_TEX_SAMP_DWORDS) * 4; - case VK_DESCRIPTOR_TYPE_SAMPLER: - return A6XX_TEX_SAMP_DWORDS * 4; - default: - unreachable("unknown descriptor type\n"); + /* These are remapped to the special driver-managed descriptor set, + * hence they don't take up any space in the original descriptor set: + */ return 0; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + /* We make offsets and sizes all 16 dwords, to match how the hardware + * interprets indices passed to sample/load/store instructions in + * multiples of 16 dwords. This means that "normal" descriptors are all + * of size 16, with padding for smaller descriptors like uniform storage + * descriptors which are less than 16 dwords. However combined images + * and samplers are actually two descriptors, so they have size 2. + */ + return A6XX_TEX_CONST_DWORDS * 4 * 2; + default: + return A6XX_TEX_CONST_DWORDS * 4; } } @@ -145,7 +139,7 @@ tu_CreateDescriptorSetLayout( set_layout->flags = pCreateInfo->flags; - /* We just allocate all the samplers at the end of the struct */ + /* We just allocate all the immutable samplers at the end of the struct */ struct tu_sampler *samplers = (void*) &set_layout->binding[max_binding + 1]; VkDescriptorSetLayoutBinding *bindings = create_sorted_bindings( @@ -157,41 +151,27 @@ tu_CreateDescriptorSetLayout( set_layout->binding_count = max_binding + 1; set_layout->shader_stages = 0; - set_layout->dynamic_shader_stages = 0; set_layout->has_immutable_samplers = false; set_layout->size = 0; + set_layout->dynamic_ubo = 0; memset(set_layout->binding, 0, size - sizeof(struct tu_descriptor_set_layout)); - uint32_t buffer_count = 0; uint32_t dynamic_offset_count = 0; + uint32_t input_attachment_count = 0; + uint32_t buffer_count = 0; for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { const VkDescriptorSetLayoutBinding *binding = bindings + j; uint32_t b = binding->binding; - uint32_t alignment = 4; - unsigned binding_buffer_count = 1; - switch (binding->descriptorType) { - case VK_DESCRIPTOR_TYPE_SAMPLER: - binding_buffer_count = 0; - break; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - assert(!(pCreateInfo->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); - set_layout->binding[b].dynamic_offset_count = 1; - break; - default: - break; - } - - set_layout->size = align(set_layout->size, alignment); set_layout->binding[b].type = binding->descriptorType; set_layout->binding[b].array_size = binding->descriptorCount; set_layout->binding[b].offset = set_layout->size; set_layout->binding[b].buffer_offset = 
buffer_count; set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count; + set_layout->binding[b].input_attachment_offset = input_attachment_count; set_layout->binding[b].size = descriptor_size(binding->descriptorType); if (variable_flags && binding->binding < variable_flags->bindingCount && @@ -219,16 +199,29 @@ tu_CreateDescriptorSetLayout( set_layout->size += binding->descriptorCount * set_layout->binding[b].size; - buffer_count += binding->descriptorCount * binding_buffer_count; - dynamic_offset_count += binding->descriptorCount * - set_layout->binding[b].dynamic_offset_count; + if (binding->descriptorType != VK_DESCRIPTOR_TYPE_SAMPLER && + binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + buffer_count += binding->descriptorCount; + if (binding->descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC || + binding->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) { + if (binding->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) { + STATIC_ASSERT(MAX_DYNAMIC_BUFFERS <= 8 * sizeof(set_layout->dynamic_ubo)); + set_layout->dynamic_ubo |= + ((1u << binding->descriptorCount) - 1) << dynamic_offset_count; + } + + dynamic_offset_count += binding->descriptorCount; + } + if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) + input_attachment_count += binding->descriptorCount; set_layout->shader_stages |= binding->stageFlags; } free(bindings); - set_layout->buffer_count = buffer_count; set_layout->dynamic_offset_count = dynamic_offset_count; + set_layout->input_attachment_count = input_attachment_count; + set_layout->buffer_count = buffer_count; *pSetLayout = tu_descriptor_set_layout_to_handle(set_layout); @@ -333,19 +326,22 @@ tu_CreatePipelineLayout(VkDevice _device, return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); layout->num_sets = pCreateInfo->setLayoutCount; + layout->input_attachment_count = 0; + layout->dynamic_offset_count = 0; - unsigned dynamic_offset_count = 0; + unsigned dynamic_offset_count = 0, input_attachment_count = 0; _mesa_sha1_init(&ctx); for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) { TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout, pCreateInfo->pSetLayouts[set]); layout->set[set].layout = set_layout; - layout->set[set].dynamic_offset_start = dynamic_offset_count; + layout->set[set].input_attachment_start = input_attachment_count; + dynamic_offset_count += set_layout->dynamic_offset_count; + input_attachment_count += set_layout->input_attachment_count; + for (uint32_t b = 0; b < set_layout->binding_count; b++) { - dynamic_offset_count += set_layout->binding[b].array_size * - set_layout->binding[b].dynamic_offset_count; if (set_layout->binding[b].immutable_samplers_offset) _mesa_sha1_update( &ctx, @@ -358,6 +354,7 @@ tu_CreatePipelineLayout(VkDevice _device, } layout->dynamic_offset_count = dynamic_offset_count; + layout->input_attachment_count = input_attachment_count; layout->push_constant_size = 0; for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) { @@ -407,10 +404,11 @@ tu_descriptor_set_create(struct tu_device *device, buffer_count = layout->binding[layout->binding_count - 1].buffer_offset + *variable_count * stride; } - unsigned range_offset = sizeof(struct tu_descriptor_set) + + unsigned dynamic_offset = sizeof(struct tu_descriptor_set) + sizeof(struct tu_bo *) * buffer_count; - unsigned mem_size = range_offset + - sizeof(struct tu_descriptor_range) * layout->dynamic_offset_count; + unsigned mem_size = dynamic_offset + + 
A6XX_TEX_CONST_DWORDS * 4 * (layout->dynamic_offset_count + + layout->input_attachment_count);; if (pool->host_memory_base) { if (pool->host_memory_end - pool->host_memory_ptr < mem_size) @@ -428,18 +426,16 @@ tu_descriptor_set_create(struct tu_device *device, memset(set, 0, mem_size); - if (layout->dynamic_offset_count) { - set->dynamic_descriptors = (struct tu_descriptor_range*)((uint8_t*)set + range_offset); + if (layout->dynamic_offset_count + layout->input_attachment_count > 0) { + set->dynamic_descriptors = (uint32_t *)((uint8_t*)set + dynamic_offset); } set->layout = layout; + set->pool = pool; uint32_t layout_size = layout->size; if (variable_count) { assert(layout->has_variable_descriptors); uint32_t stride = layout->binding[layout->binding_count - 1].size; - if (layout->binding[layout->binding_count - 1].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) - stride = 1; - layout_size = layout->binding[layout->binding_count - 1].offset + *variable_count * stride; } @@ -527,7 +523,7 @@ tu_CreateDescriptorPool(VkDevice _device, TU_FROM_HANDLE(tu_device, device, _device); struct tu_descriptor_pool *pool; uint64_t size = sizeof(struct tu_descriptor_pool); - uint64_t bo_size = 0, bo_count = 0, range_count = 0; + uint64_t bo_size = 0, bo_count = 0, dynamic_count = 0; for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { if (pCreateInfo->pPoolSizes[i].type != VK_DESCRIPTOR_TYPE_SAMPLER) @@ -536,7 +532,8 @@ tu_CreateDescriptorPool(VkDevice _device, switch(pCreateInfo->pPoolSizes[i].type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - range_count += pCreateInfo->pPoolSizes[i].descriptorCount; + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + dynamic_count += pCreateInfo->pPoolSizes[i].descriptorCount; default: break; } @@ -548,7 +545,7 @@ tu_CreateDescriptorPool(VkDevice _device, if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { uint64_t host_size = pCreateInfo->maxSets * sizeof(struct tu_descriptor_set); host_size += sizeof(struct tu_bo*) * bo_count; - host_size += sizeof(struct tu_descriptor_range) * range_count; + host_size += A6XX_TEX_CONST_DWORDS * 4 * dynamic_count; size += host_size; } else { size += sizeof(struct tu_descriptor_pool_entry) * pCreateInfo->maxSets; @@ -708,6 +705,16 @@ static void write_texel_buffer_descriptor(struct tu_device *device, *buffer_list = view->buffer->bo; } +static uint32_t get_range(struct tu_buffer *buf, VkDeviceSize offset, + VkDeviceSize range) +{ + if (range == VK_WHOLE_SIZE) { + return buf->size - offset; + } else { + return range; + } +} + static void write_buffer_descriptor(struct tu_device *device, struct tu_cmd_buffer *cmd_buffer, unsigned *dst, @@ -717,8 +724,18 @@ static void write_buffer_descriptor(struct tu_device *device, TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset; - dst[0] = va; - dst[1] = va >> 32; + uint32_t range = get_range(buffer, buffer_info->offset, buffer_info->range); + range = ALIGN_POT(range, 4) / 4; + dst[0] = + A6XX_IBO_0_TILE_MODE(TILE6_LINEAR) | A6XX_IBO_0_FMT(FMT6_32_UINT); + dst[1] = range; + dst[2] = + A6XX_IBO_2_UNK4 | A6XX_IBO_2_TYPE(A6XX_TEX_1D) | A6XX_IBO_2_UNK31; + dst[3] = 0; + dst[4] = A6XX_IBO_4_BASE_LO(va); + dst[5] = A6XX_IBO_5_BASE_HI(va >> 32); + for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++) + dst[i] = 0; if (cmd_buffer) tu_bo_list_add(&cmd_buffer->bo_list, buffer->bo, MSM_SUBMIT_BO_READ); @@ -726,22 +743,25 @@ static void write_buffer_descriptor(struct 
tu_device *device, *buffer_list = buffer->bo; } -static void write_dynamic_buffer_descriptor(struct tu_device *device, - struct tu_descriptor_range *range, - struct tu_bo **buffer_list, - const VkDescriptorBufferInfo *buffer_info) +static void write_ubo_descriptor(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, + unsigned *dst, + struct tu_bo **buffer_list, + const VkDescriptorBufferInfo *buffer_info) { TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); + + uint32_t range = get_range(buffer, buffer_info->offset, buffer_info->range); + /* The HW range is in vec4 units */ + range = ALIGN_POT(range, 16) / 16; uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset; - unsigned size = buffer_info->range; + dst[0] = A6XX_UBO_0_BASE_LO(va); + dst[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(range); - if (buffer_info->range == VK_WHOLE_SIZE) - size = buffer->size - buffer_info->offset; - - range->va = va; - range->size = size; - - *buffer_list = buffer->bo; + if (cmd_buffer) + tu_bo_list_add(&cmd_buffer->bo_list, buffer->bo, MSM_SUBMIT_BO_READ); + else + *buffer_list = buffer->bo; } static void @@ -754,10 +774,10 @@ write_image_descriptor(struct tu_device *device, { TU_FROM_HANDLE(tu_image_view, iview, image_info->imageView); - memcpy(dst, iview->descriptor, sizeof(iview->descriptor)); if (descriptor_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) { - memcpy(&dst[A6XX_TEX_CONST_DWORDS], iview->storage_descriptor, - sizeof(iview->storage_descriptor)); + memcpy(dst, iview->storage_descriptor, sizeof(iview->storage_descriptor)); + } else { + memcpy(dst, iview->descriptor, sizeof(iview->descriptor)); } if (cmd_buffer) @@ -782,7 +802,7 @@ write_combined_image_sampler_descriptor(struct tu_device *device, descriptor_type, image_info); /* copy over sampler state */ if (has_sampler) { - memcpy(dst + sampler_offset / sizeof(*dst), sampler, sizeof(*sampler)); + memcpy(dst + A6XX_TEX_CONST_DWORDS, sampler, sizeof(*sampler)); } } @@ -813,26 +833,37 @@ tu_update_descriptor_sets(struct tu_device *device, const struct tu_descriptor_set_binding_layout *binding_layout = set->layout->binding + writeset->dstBinding; uint32_t *ptr = set->mapped_ptr; - struct tu_bo **buffer_list = set->descriptors; + struct tu_bo **buffer_list = set->buffers; ptr += binding_layout->offset / 4; - ptr += binding_layout->size * writeset->dstArrayElement / 4; + ptr += (binding_layout->size / 4) * writeset->dstArrayElement; buffer_list += binding_layout->buffer_offset; buffer_list += writeset->dstArrayElement; for (j = 0; j < writeset->descriptorCount; ++j) { switch(writeset->descriptorType) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { - unsigned idx = writeset->dstArrayElement + j; - idx += binding_layout->dynamic_offset_offset; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: { assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); - write_dynamic_buffer_descriptor(device, set->dynamic_descriptors + idx, - buffer_list, writeset->pBufferInfo + j); + unsigned idx = writeset->dstArrayElement + j; + idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset; + write_ubo_descriptor(device, cmd_buffer, + set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx, + buffer_list, writeset->pBufferInfo + j); break; } - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + write_ubo_descriptor(device, cmd_buffer, ptr, buffer_list, + writeset->pBufferInfo + j); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + 
assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); + unsigned idx = writeset->dstArrayElement + j; + idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset; + write_buffer_descriptor(device, cmd_buffer, + set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx, + buffer_list, writeset->pBufferInfo + j); + break; + } case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: write_buffer_descriptor(device, cmd_buffer, ptr, buffer_list, writeset->pBufferInfo + j); @@ -844,11 +875,19 @@ tu_update_descriptor_sets(struct tu_device *device, break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: write_image_descriptor(device, cmd_buffer, ptr, buffer_list, writeset->descriptorType, writeset->pImageInfo + j); break; + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: { + unsigned idx = writeset->dstArrayElement + j; + idx += binding_layout->input_attachment_offset; + write_image_descriptor(device, cmd_buffer, + set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx, + buffer_list, writeset->descriptorType, + writeset->pImageInfo + j); + break; + } case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: write_combined_image_sampler_descriptor(device, cmd_buffer, A6XX_TEX_CONST_DWORDS * 4, @@ -881,8 +920,8 @@ tu_update_descriptor_sets(struct tu_device *device, dst_set->layout->binding + copyset->dstBinding; uint32_t *src_ptr = src_set->mapped_ptr; uint32_t *dst_ptr = dst_set->mapped_ptr; - struct tu_bo **src_buffer_list = src_set->descriptors; - struct tu_bo **dst_buffer_list = dst_set->descriptors; + struct tu_bo **src_buffer_list = src_set->buffers; + struct tu_bo **dst_buffer_list = dst_set->buffers; src_ptr += src_binding_layout->offset / 4; dst_ptr += dst_binding_layout->offset / 4; @@ -902,18 +941,33 @@ tu_update_descriptor_sets(struct tu_device *device, case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { unsigned src_idx = copyset->srcArrayElement + j; unsigned dst_idx = copyset->dstArrayElement + j; - struct tu_descriptor_range *src_range, *dst_range; + src_idx += src_set->layout->input_attachment_count; + dst_idx += dst_set->layout->input_attachment_count; src_idx += src_binding_layout->dynamic_offset_offset; dst_idx += dst_binding_layout->dynamic_offset_offset; - src_range = src_set->dynamic_descriptors + src_idx; - dst_range = dst_set->dynamic_descriptors + dst_idx; - *dst_range = *src_range; + uint32_t *src_dynamic, *dst_dynamic; + src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS; + dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS; + memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4); + break; + } + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: { + unsigned src_idx = copyset->srcArrayElement + j; + unsigned dst_idx = copyset->dstArrayElement + j; + src_idx += src_binding_layout->input_attachment_offset; + dst_idx += dst_binding_layout->input_attachment_offset; + + uint32_t *src_dynamic, *dst_dynamic; + src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS; + dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS; + memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4); break; } default: memcpy(dst_ptr, src_ptr, src_binding_layout->size); } + src_ptr += src_binding_layout->size / 4; dst_ptr += dst_binding_layout->size / 4; diff --git a/src/freedreno/vulkan/tu_descriptor_set.h b/src/freedreno/vulkan/tu_descriptor_set.h index 282d75895c9..3a24822eb67 100644 --- 
a/src/freedreno/vulkan/tu_descriptor_set.h +++ b/src/freedreno/vulkan/tu_descriptor_set.h @@ -26,7 +26,10 @@ #include -#define MAX_SETS 32 +/* The hardware supports 5 descriptor sets, but we reserve 1 for dynamic + * descriptors and input attachments. + */ +#define MAX_SETS 4 struct tu_descriptor_set_binding_layout { @@ -35,14 +38,25 @@ struct tu_descriptor_set_binding_layout /* Number of array elements in this binding */ uint32_t array_size; - uint32_t offset; - uint32_t buffer_offset; - uint16_t dynamic_offset_offset; - - uint16_t dynamic_offset_count; - /* redundant with the type, each for a single array element */ + /* The size in bytes of each Vulkan descriptor. */ uint32_t size; + uint32_t offset; + + /* For descriptors that point to a buffer, index into the array of BO's to + * be added to the cmdbuffer's used BO list. + */ + uint32_t buffer_offset; + + /* Index into the pDynamicOffsets array for dynamic descriptors, as well as + * the array of dynamic descriptors (offsetted by + * tu_pipeline_layout::set::dynamic_offset_start). + */ + uint32_t dynamic_offset_offset; + + /* Index into the array of dynamic input attachment descriptors */ + uint32_t input_attachment_offset; + /* Offset in the tu_descriptor_set_layout of the immutable samplers, or 0 * if there are no immutable samplers. */ uint32_t immutable_samplers_offset; @@ -61,14 +75,20 @@ struct tu_descriptor_set_layout /* Shader stages affected by this descriptor set */ uint16_t shader_stages; - uint16_t dynamic_shader_stages; - - /* Number of buffers in this descriptor set */ - uint32_t buffer_count; /* Number of dynamic offsets used by this descriptor set */ uint16_t dynamic_offset_count; + /* Number of input attachments used by the descriptor set */ + uint16_t input_attachment_count; + + /* A bitfield of which dynamic buffers are ubo's, to make the + * descriptor-binding-time patching easier. + */ + uint32_t dynamic_ubo; + + uint32_t buffer_count; + bool has_immutable_samplers; bool has_variable_descriptors; @@ -83,11 +103,13 @@ struct tu_pipeline_layout struct tu_descriptor_set_layout *layout; uint32_t size; uint32_t dynamic_offset_start; + uint32_t input_attachment_start; } set[MAX_SETS]; uint32_t num_sets; uint32_t push_constant_size; uint32_t dynamic_offset_count; + uint32_t input_attachment_count; unsigned char sha1[20]; }; diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 394c3ce1800..d71a4809bc7 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -732,18 +732,13 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, VkSampleCountFlags sample_counts = VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; - /* make sure that the entire descriptor set is addressable with a signed - * 32-bit int. So the sum of all limits scaled by descriptor size has to - * be at most 2 GiB. the combined image & samples object count as one of - * both. This limit is for the pipeline layout, not for the set layout, but - * there is no set limit, so we just set a pipeline limit. I don't think - * any app is going to hit this soon. 
*/ - size_t max_descriptor_set_size = - ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) / - (32 /* uniform buffer, 32 due to potential space wasted on alignment */ + - 32 /* storage buffer, 32 due to potential space wasted on alignment */ + - 32 /* sampler, largest when combined with image */ + - 64 /* sampled image */ + 64 /* storage image */); + /* I have no idea what the maximum size is, but the hardware supports very + * large numbers of descriptors (at least 2^16). This limit is based on + * CP_LOAD_STATE6, which has a 28-bit field for the DWORD offset, so that + * we don't have to think about what to do if that overflows, but really + * nothing is likely to get close to this. + */ + const size_t max_descriptor_set_size = (1 << 28) / A6XX_TEX_CONST_DWORDS; VkPhysicalDeviceLimits limits = { .maxImageDimension1D = (1 << 14), @@ -752,7 +747,7 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxImageDimensionCube = (1 << 14), .maxImageArrayLayers = (1 << 11), .maxTexelBufferElements = 128 * 1024 * 1024, - .maxUniformBufferRange = UINT32_MAX, + .maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE, .maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE, .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, .maxMemoryAllocationCount = UINT32_MAX, @@ -765,7 +760,7 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxPerStageDescriptorStorageBuffers = max_descriptor_set_size, .maxPerStageDescriptorSampledImages = max_descriptor_set_size, .maxPerStageDescriptorStorageImages = max_descriptor_set_size, - .maxPerStageDescriptorInputAttachments = max_descriptor_set_size, + .maxPerStageDescriptorInputAttachments = MAX_RTS, .maxPerStageResources = max_descriptor_set_size, .maxDescriptorSetSamplers = max_descriptor_set_size, .maxDescriptorSetUniformBuffers = max_descriptor_set_size, @@ -774,7 +769,7 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS, .maxDescriptorSetSampledImages = max_descriptor_set_size, .maxDescriptorSetStorageImages = max_descriptor_set_size, - .maxDescriptorSetInputAttachments = max_descriptor_set_size, + .maxDescriptorSetInputAttachments = MAX_RTS, .maxVertexInputAttributes = 32, .maxVertexInputBindings = 32, .maxVertexInputAttributeOffset = 4095, @@ -814,8 +809,8 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .viewportSubPixelBits = 8, .minMemoryMapAlignment = 4096, /* A page */ .minTexelBufferOffsetAlignment = 64, - .minUniformBufferOffsetAlignment = 4, - .minStorageBufferOffsetAlignment = 4, + .minUniformBufferOffsetAlignment = 64, + .minStorageBufferOffsetAlignment = 64, .minTexelOffset = -32, .maxTexelOffset = 31, .minTexelGatherOffset = -32, @@ -1715,7 +1710,7 @@ tu_GetBufferMemoryRequirements(VkDevice _device, TU_FROM_HANDLE(tu_buffer, buffer, _buffer); pMemoryRequirements->memoryTypeBits = 1; - pMemoryRequirements->alignment = 16; + pMemoryRequirements->alignment = 64; pMemoryRequirements->size = align64(buffer->size, pMemoryRequirements->alignment); } diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index 5d36dfcaf3f..dc2a568a59c 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -333,13 +333,18 @@ tu6_blend_op(VkBlendOp op) } } -static unsigned -tu_shader_nibo(const struct tu_shader *shader) +static uint32_t +emit_xs_config(const struct ir3_shader_variant *sh) { - /* Don't use ir3_shader_nibo(), because that would include declared but - * unused storage images and SSBOs. 
- */ - return shader->ssbo_map.num_desc + shader->image_map.num_desc; + if (sh->instrlen) { + return A6XX_SP_VS_CONFIG_ENABLED | + COND(sh->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) | + COND(sh->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) | + COND(sh->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) | + COND(sh->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO); + } else { + return 0; + } } static void @@ -356,16 +361,11 @@ tu6_emit_vs_config(struct tu_cs *cs, struct tu_shader *shader, if (vs->need_fine_derivatives) sp_vs_ctrl |= A6XX_SP_VS_CTRL_REG0_DIFF_FINE; - uint32_t sp_vs_config = A6XX_SP_VS_CONFIG_NTEX(shader->texture_map.num_desc) | - A6XX_SP_VS_CONFIG_NSAMP(shader->sampler_map.num_desc); - if (vs->instrlen) - sp_vs_config |= A6XX_SP_VS_CONFIG_ENABLED; - tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CTRL_REG0, 1); tu_cs_emit(cs, sp_vs_ctrl); tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CONFIG, 2); - tu_cs_emit(cs, sp_vs_config); + tu_cs_emit(cs, emit_xs_config(vs)); tu_cs_emit(cs, vs->instrlen); tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_VS_CNTL, 1); @@ -377,15 +377,11 @@ static void tu6_emit_hs_config(struct tu_cs *cs, struct tu_shader *shader, const struct ir3_shader_variant *hs) { - uint32_t sp_hs_config = 0; - if (hs->instrlen) - sp_hs_config |= A6XX_SP_HS_CONFIG_ENABLED; - tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1); tu_cs_emit(cs, 0); tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CONFIG, 2); - tu_cs_emit(cs, sp_hs_config); + tu_cs_emit(cs, emit_xs_config(hs)); tu_cs_emit(cs, hs->instrlen); tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_HS_CNTL, 1); @@ -396,12 +392,8 @@ static void tu6_emit_ds_config(struct tu_cs *cs, struct tu_shader *shader, const struct ir3_shader_variant *ds) { - uint32_t sp_ds_config = 0; - if (ds->instrlen) - sp_ds_config |= A6XX_SP_DS_CONFIG_ENABLED; - tu_cs_emit_pkt4(cs, REG_A6XX_SP_DS_CONFIG, 2); - tu_cs_emit(cs, sp_ds_config); + tu_cs_emit(cs, emit_xs_config(ds)); tu_cs_emit(cs, ds->instrlen); tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_DS_CNTL, 1); @@ -417,11 +409,7 @@ tu6_emit_gs_config(struct tu_cs *cs, struct tu_shader *shader, tu_cs_emit(cs, 0); tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CONFIG, 2); - tu_cs_emit(cs, COND(has_gs, - A6XX_SP_GS_CONFIG_ENABLED | - A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(gs)) | - A6XX_SP_GS_CONFIG_NTEX(gs->num_samp) | - A6XX_SP_GS_CONFIG_NSAMP(gs->num_samp))); + tu_cs_emit(cs, emit_xs_config(gs)); tu_cs_emit(cs, gs->instrlen); tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_GS_CNTL, 1); @@ -445,31 +433,16 @@ tu6_emit_fs_config(struct tu_cs *cs, struct tu_shader *shader, if (fs->need_fine_derivatives) sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_DIFF_FINE; - uint32_t sp_fs_config = 0; - unsigned shader_nibo = 0; - if (shader) { - shader_nibo = tu_shader_nibo(shader); - sp_fs_config = A6XX_SP_FS_CONFIG_NTEX(shader->texture_map.num_desc) | - A6XX_SP_FS_CONFIG_NSAMP(shader->sampler_map.num_desc) | - A6XX_SP_FS_CONFIG_NIBO(shader_nibo); - } - - if (fs->instrlen) - sp_fs_config |= A6XX_SP_FS_CONFIG_ENABLED; - tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CTRL_REG0, 1); tu_cs_emit(cs, sp_fs_ctrl); tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CONFIG, 2); - tu_cs_emit(cs, sp_fs_config); + tu_cs_emit(cs, emit_xs_config(fs)); tu_cs_emit(cs, fs->instrlen); tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL, 1); tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(fs->constlen, 4)) | A6XX_HLSQ_FS_CNTL_ENABLED); - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_IBO_COUNT, 1); - tu_cs_emit(cs, shader_nibo); } static void @@ -485,10 +458,7 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader, A6XX_HLSQ_CS_CNTL_ENABLED); tu_cs_emit_pkt4(cs, 
REG_A6XX_SP_CS_CONFIG, 2); - tu_cs_emit(cs, A6XX_SP_CS_CONFIG_ENABLED | - A6XX_SP_CS_CONFIG_NIBO(tu_shader_nibo(shader)) | - A6XX_SP_CS_CONFIG_NTEX(shader->texture_map.num_desc) | - A6XX_SP_CS_CONFIG_NSAMP(shader->sampler_map.num_desc)); + tu_cs_emit(cs, emit_xs_config(v)); tu_cs_emit(cs, v->instrlen); tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CTRL_REG0, 1); @@ -514,9 +484,6 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader, A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) | A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); tu_cs_emit(cs, 0x2fc); /* HLSQ_CS_UNKNOWN_B998 */ - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_IBO_COUNT, 1); - tu_cs_emit(cs, tu_shader_nibo(shader)); } static void @@ -1013,6 +980,16 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); } + if (fs->num_sampler_prefetch > 0) { + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch); + for (int i = 0; i < fs->num_sampler_prefetch; i++) { + const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; + tu_cs_emit(cs, + A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) | + A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id)); + } + } + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); tu_cs_emit(cs, 0x7); tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | @@ -1949,11 +1926,6 @@ tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link, link->const_state = v->shader->const_state; link->constlen = v->constlen; link->push_consts = shader->push_consts; - link->texture_map = shader->texture_map; - link->sampler_map = shader->sampler_map; - link->ubo_map = shader->ubo_map; - link->ssbo_map = shader->ssbo_map; - link->image_map = shader->image_map; } static void @@ -1984,6 +1956,12 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, builder->shaders[i], &builder->shaders[i]->variants[0]); } + + if (builder->shaders[MESA_SHADER_FRAGMENT]) { + memcpy(pipeline->program.input_attachment_idx, + builder->shaders[MESA_SHADER_FRAGMENT]->attachment_idx, + sizeof(pipeline->program.input_attachment_idx)); + } } static void @@ -2209,6 +2187,8 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, if (result != VK_SUCCESS) return result; + (*pipeline)->layout = builder->layout; + /* compile and upload shaders */ result = tu_pipeline_builder_compile_shaders(builder); if (result == VK_SUCCESS) diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 3c50b2ec019..87e4df85ff5 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -98,6 +98,12 @@ typedef uint32_t xcb_window_t; #define MAX_VIEWS 8 /* The Qualcomm driver exposes 0x20000058 */ #define MAX_STORAGE_BUFFER_RANGE 0x20000000 +/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so + * expose the same maximum range. + * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual + * range might be higher. 
+ */ +#define MAX_UNIFORM_BUFFER_RANGE 0x10000 #define NUM_DEPTH_CLEAR_PIPELINES 3 @@ -615,13 +621,15 @@ struct tu_descriptor_range struct tu_descriptor_set { const struct tu_descriptor_set_layout *layout; + struct tu_descriptor_pool *pool; uint32_t size; uint64_t va; uint32_t *mapped_ptr; - struct tu_descriptor_range *dynamic_descriptors; - struct tu_bo *descriptors[0]; + uint32_t *dynamic_descriptors; + + struct tu_bo *buffers[0]; }; struct tu_push_descriptor_set @@ -806,7 +814,8 @@ struct tu_descriptor_state uint32_t valid; struct tu_push_descriptor_set push_set; bool push_dirty; - uint64_t dynamic_buffers[MAX_DYNAMIC_BUFFERS]; + uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS]; + uint32_t input_attachments[MAX_RTS * A6XX_TEX_CONST_DWORDS]; }; struct tu_tile @@ -845,8 +854,10 @@ enum tu_cmd_dirty_bits TU_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1, TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 2, TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3, - TU_CMD_DIRTY_PUSH_CONSTANTS = 1 << 4, - TU_CMD_DIRTY_STREAMOUT_BUFFERS = 1 << 5, + TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 4, + TU_CMD_DIRTY_PUSH_CONSTANTS = 1 << 5, + TU_CMD_DIRTY_STREAMOUT_BUFFERS = 1 << 6, + TU_CMD_DIRTY_INPUT_ATTACHMENTS = 1 << 7, TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16, TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17, @@ -1118,17 +1129,6 @@ struct tu_shader_compile_options bool include_binning_pass; }; -struct tu_descriptor_map -{ - /* TODO: avoid fixed size array/justify the size */ - unsigned num; /* number of array entries */ - unsigned num_desc; /* Number of descriptors (sum of array_size[]) */ - int set[128]; - int binding[128]; - int value[128]; - int array_size[128]; -}; - struct tu_push_constant_range { uint32_t lo; @@ -1140,11 +1140,7 @@ struct tu_shader struct ir3_shader ir3_shader; struct tu_push_constant_range push_consts; - struct tu_descriptor_map texture_map; - struct tu_descriptor_map sampler_map; - struct tu_descriptor_map ubo_map; - struct tu_descriptor_map ssbo_map; - struct tu_descriptor_map image_map; + unsigned attachment_idx[MAX_RTS]; /* This may be true for vertex shaders. When true, variants[1] is the * binning variant and binning_binary is non-NULL. 
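A minimal sketch of how the flat dword arrays added to tu_descriptor_state above are indexed, assuming A6XX_TEX_CONST_DWORDS is 16 (one 64-byte slot per descriptor); the helper names are hypothetical and not part of this patch:

static inline uint32_t *
dynamic_descriptor_slot(struct tu_descriptor_state *d, unsigned dyn_idx)
{
   /* one 16-dword (64-byte) descriptor slot per dynamic UBO/SSBO */
   return &d->dynamic_descriptors[dyn_idx * A6XX_TEX_CONST_DWORDS];
}

static inline uint32_t *
input_attachment_slot(struct tu_descriptor_state *d, unsigned att_idx)
{
   /* one slot per input attachment, patched at draw time */
   return &d->input_attachments[att_idx * A6XX_TEX_CONST_DWORDS];
}

On the MAX_UNIFORM_BUFFER_RANGE TODO above: if the ldc SIZE field is indeed 15 bits in 4-dword units, the hardware ceiling would be around 2^15 * 16 B = 512 KiB, so the exposed 0x10000 (64 KiB) matches the Qualcomm driver rather than the likely hardware maximum.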
@@ -1189,11 +1185,6 @@ struct tu_program_descriptor_linkage uint32_t constlen; struct tu_push_constant_range push_consts; - struct tu_descriptor_map texture_map; - struct tu_descriptor_map sampler_map; - struct tu_descriptor_map ubo_map; - struct tu_descriptor_map ssbo_map; - struct tu_descriptor_map image_map; }; struct tu_pipeline @@ -1216,6 +1207,7 @@ struct tu_pipeline struct tu_cs_entry binning_state_ib; struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES]; + unsigned input_attachment_idx[MAX_RTS]; } program; struct diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c index deb6d895feb..85bf6bbc50f 100644 --- a/src/freedreno/vulkan/tu_shader.c +++ b/src/freedreno/vulkan/tu_shader.c @@ -83,129 +83,6 @@ tu_spirv_to_nir(struct ir3_compiler *compiler, return nir; } -static unsigned -map_add(struct tu_descriptor_map *map, int set, int binding, int value, - int array_size) -{ - unsigned index = 0; - for (unsigned i = 0; i < map->num; i++) { - if (set == map->set[i] && binding == map->binding[i]) { - assert(value == map->value[i]); - assert(array_size == map->array_size[i]); - return index; - } - index += map->array_size[i]; - } - - assert(index == map->num_desc); - - map->set[map->num] = set; - map->binding[map->num] = binding; - map->value[map->num] = value; - map->array_size[map->num] = array_size; - map->num++; - map->num_desc += array_size; - - return index; -} - -static void -lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, - struct tu_shader *shader, - const struct tu_pipeline_layout *layout) -{ - nir_ssa_def *index = NULL; - unsigned base_index = 0; - unsigned array_elements = 1; - nir_tex_src *src = &instr->src[src_idx]; - bool is_sampler = src->src_type == nir_tex_src_sampler_deref; - - /* We compute first the offsets */ - nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); - while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); - nir_deref_instr *parent = - nir_instr_as_deref(deref->parent.ssa->parent_instr); - - assert(deref->deref_type == nir_deref_type_array); - - if (nir_src_is_const(deref->arr.index) && index == NULL) { - /* We're still building a direct index */ - base_index += nir_src_as_uint(deref->arr.index) * array_elements; - } else { - if (index == NULL) { - /* We used to be direct but not anymore */ - index = nir_imm_int(b, base_index); - base_index = 0; - } - - index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); - } - - array_elements *= glsl_get_length(parent->type); - - deref = parent; - } - - if (index) - index = nir_umin(b, index, nir_imm_int(b, array_elements - 1)); - - /* We have the offsets, we apply them, rewriting the source or removing - * instr if needed - */ - if (index) { - nir_instr_rewrite_src(&instr->instr, &src->src, - nir_src_for_ssa(index)); - - src->src_type = is_sampler ? - nir_tex_src_sampler_offset : - nir_tex_src_texture_offset; - } else { - nir_tex_instr_remove_src(instr, src_idx); - } - - uint32_t set = deref->var->data.descriptor_set; - uint32_t binding = deref->var->data.binding; - struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; - struct tu_descriptor_set_binding_layout *binding_layout = - &set_layout->binding[binding]; - - int desc_index = map_add(is_sampler ? 
- &shader->sampler_map : &shader->texture_map, - deref->var->data.descriptor_set, - deref->var->data.binding, - deref->var->data.index, - binding_layout->array_size) + base_index; - if (is_sampler) - instr->sampler_index = desc_index; - else - instr->texture_index = desc_index; -} - -static bool -lower_sampler(nir_builder *b, nir_tex_instr *instr, struct tu_shader *shader, - const struct tu_pipeline_layout *layout) -{ - int texture_idx = - nir_tex_instr_src_index(instr, nir_tex_src_texture_deref); - - if (texture_idx >= 0) - lower_tex_src_to_offset(b, instr, texture_idx, shader, layout); - - int sampler_idx = - nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); - - if (sampler_idx >= 0) - lower_tex_src_to_offset(b, instr, sampler_idx, shader, layout); - - if (texture_idx < 0 && sampler_idx < 0) - return false; - - return true; -} - static void lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader) @@ -234,66 +111,108 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader, const struct tu_pipeline_layout *layout) { - nir_const_value *const_val = nir_src_as_const_value(instr->src[0]); + nir_ssa_def *vulkan_idx = instr->src[0].ssa; unsigned set = nir_intrinsic_desc_set(instr); unsigned binding = nir_intrinsic_binding(instr); struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; struct tu_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; - unsigned index = 0; + uint32_t base; - switch (nir_intrinsic_desc_type(instr)) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + switch (binding_layout->type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - if (!const_val) - tu_finishme("non-constant vulkan_resource_index array index"); - /* skip index 0 which is used for push constants */ - index = map_add(&shader->ubo_map, set, binding, 0, - binding_layout->array_size) + 1; - index += const_val->u32; - break; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - if (!const_val) - tu_finishme("non-constant vulkan_resource_index array index"); - index = map_add(&shader->ssbo_map, set, binding, 0, - binding_layout->array_size); - index += const_val->u32; + base = layout->set[set].dynamic_offset_start + + binding_layout->dynamic_offset_offset + + layout->input_attachment_count; + set = MAX_SETS; break; default: - tu_finishme("unsupported desc_type for vulkan_resource_index"); + base = binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS); break; } + nir_intrinsic_instr *bindless = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_bindless_resource_ir3); + bindless->num_components = 1; + nir_ssa_dest_init(&bindless->instr, &bindless->dest, + 1, 32, NULL); + nir_intrinsic_set_desc_set(bindless, set); + bindless->src[0] = nir_src_for_ssa(nir_iadd(b, nir_imm_int(b, base), vulkan_idx)); + nir_builder_instr_insert(b, &bindless->instr); + nir_ssa_def_rewrite_uses(&instr->dest.ssa, - nir_src_for_ssa(nir_imm_int(b, index))); + nir_src_for_ssa(&bindless->dest.ssa)); nir_instr_remove(&instr->instr); } +static nir_ssa_def * +build_bindless(nir_builder *b, nir_deref_instr *deref, bool is_sampler, + struct tu_shader *shader, + const struct tu_pipeline_layout *layout) +{ + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + const struct tu_descriptor_set_binding_layout *bind_layout = + &layout->set[set].layout->binding[binding]; + + nir_ssa_def 
*desc_offset; + unsigned descriptor_stride; + if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) { + unsigned offset = + layout->set[set].input_attachment_start + + bind_layout->input_attachment_offset; + desc_offset = nir_imm_int(b, offset); + set = MAX_SETS; + descriptor_stride = 1; + } else { + unsigned offset = 0; + /* Samplers come second in combined image/sampler descriptors, see + * write_combined_image_sampler_descriptor(). + */ + if (is_sampler && bind_layout->type == + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { + offset = 1; + } + desc_offset = + nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) + + offset); + descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS); + } + + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + + nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1); + desc_offset = nir_iadd(b, desc_offset, + nir_imul_imm(b, arr_index, descriptor_stride)); + } + + nir_intrinsic_instr *bindless = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_bindless_resource_ir3); + bindless->num_components = 1; + nir_ssa_dest_init(&bindless->instr, &bindless->dest, + 1, 32, NULL); + nir_intrinsic_set_desc_set(bindless, set); + bindless->src[0] = nir_src_for_ssa(desc_offset); + nir_builder_instr_insert(b, &bindless->instr); + + return &bindless->dest.ssa; +} + static void lower_image_deref(nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader, const struct tu_pipeline_layout *layout) { nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); - nir_variable *var = nir_deref_instr_get_variable(deref); - - uint32_t set = var->data.descriptor_set; - uint32_t binding = var->data.binding; - struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; - struct tu_descriptor_set_binding_layout *binding_layout = - &set_layout->binding[binding]; - - nir_ssa_def *index = nir_imm_int(b, - map_add(&shader->image_map, - set, binding, var->data.index, - binding_layout->array_size)); - if (deref->deref_type != nir_deref_type_var) { - assert(deref->deref_type == nir_deref_type_array); - index = nir_iadd(b, index, nir_ssa_for_src(b, deref->arr.index, 1)); - } - nir_rewrite_image_intrinsic(instr, index, false); + nir_ssa_def *bindless = build_bindless(b, deref, false, shader, layout); + nir_rewrite_image_intrinsic(instr, bindless, true); } static bool @@ -331,9 +250,6 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_size: case nir_intrinsic_image_deref_samples: - case nir_intrinsic_image_deref_load_param_intel: - case nir_intrinsic_image_deref_load_raw_intel: - case nir_intrinsic_image_deref_store_raw_intel: lower_image_deref(b, instr, shader, layout); return true; @@ -342,6 +258,59 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, } } +static bool +lower_tex(nir_builder *b, nir_tex_instr *tex, + struct tu_shader *shader, const struct tu_pipeline_layout *layout) +{ + int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref); + if (sampler_src_idx >= 0) { + nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src); + nir_ssa_def *bindless = build_bindless(b, deref, true, shader, layout); + nir_instr_rewrite_src(&tex->instr, &tex->src[sampler_src_idx].src, + nir_src_for_ssa(bindless)); + tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle; + } + + int tex_src_idx = nir_tex_instr_src_index(tex, 
nir_tex_src_texture_deref); + if (tex_src_idx >= 0) { + nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src); + nir_ssa_def *bindless = build_bindless(b, deref, false, shader, layout); + nir_instr_rewrite_src(&tex->instr, &tex->src[tex_src_idx].src, + nir_src_for_ssa(bindless)); + tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle; + } + + return true; +} + +static bool +lower_impl(nir_function_impl *impl, struct tu_shader *shader, + const struct tu_pipeline_layout *layout) +{ + nir_builder b; + nir_builder_init(&b, impl); + bool progress = false; + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + b.cursor = nir_before_instr(instr); + switch (instr->type) { + case nir_instr_type_tex: + progress |= lower_tex(&b, nir_instr_as_tex(instr), shader, layout); + break; + case nir_instr_type_intrinsic: + progress |= lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, layout); + break; + default: + break; + } + } + } + + return progress; +} + + /* Figure out the range of push constants that we're actually going to push to * the shader, and tell the backend to reserve this range when pushing UBO * constants. @@ -391,31 +360,36 @@ gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader) align(tu_shader->push_consts.count, 4); } -static bool -lower_impl(nir_function_impl *impl, struct tu_shader *shader, - const struct tu_pipeline_layout *layout) +/* Gather the InputAttachmentIndex for each input attachment from the NIR + * shader and organize the info in a way so that draw-time patching is easy. + */ +static void +gather_input_attachments(nir_shader *shader, struct tu_shader *tu_shader, + const struct tu_pipeline_layout *layout) { - nir_builder b; - nir_builder_init(&b, impl); - bool progress = false; + nir_foreach_variable(var, &shader->uniforms) { + const struct glsl_type *glsl_type = glsl_without_array(var->type); - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - b.cursor = nir_before_instr(instr); - switch (instr->type) { - case nir_instr_type_tex: - progress |= lower_sampler(&b, nir_instr_as_tex(instr), shader, layout); - break; - case nir_instr_type_intrinsic: - progress |= lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, layout); - break; - default: - break; - } + if (!glsl_type_is_image(glsl_type)) + continue; + + enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type); + + const uint32_t set = var->data.descriptor_set; + const uint32_t binding = var->data.binding; + const struct tu_descriptor_set_binding_layout *bind_layout = + &layout->set[set].layout->binding[binding]; + const uint32_t array_size = bind_layout->array_size; + + if (dim == GLSL_SAMPLER_DIM_SUBPASS || + dim == GLSL_SAMPLER_DIM_SUBPASS_MS) { + unsigned offset = + layout->set[set].input_attachment_start + + bind_layout->input_attachment_offset; + for (unsigned i = 0; i < array_size; i++) + tu_shader->attachment_idx[offset + i] = var->data.index + i; } } - - return progress; } static bool @@ -425,18 +399,13 @@ tu_lower_io(nir_shader *shader, struct tu_shader *tu_shader, bool progress = false; gather_push_constants(shader, tu_shader); + gather_input_attachments(shader, tu_shader, layout); nir_foreach_function(function, shader) { if (function->impl) progress |= lower_impl(function->impl, tu_shader, layout); } - /* spirv_to_nir produces num_ssbos equal to the number of SSBO-containing - * variables, while ir3 wants the number of descriptors (like the gallium - * path). 
- */ - shader->info.num_ssbos = tu_shader->ssbo_map.num_desc; - return progress; }
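A minimal sketch of the addressing scheme implemented by lower_vulkan_resource_index() and build_bindless() above: a bindless reference resolves to a (set, slot offset) pair, with dynamic uniform/storage buffers and input attachments redirected to the reserved set MAX_SETS and everything else addressed within its own set in A6XX_TEX_CONST_DWORDS-dword slots. The struct and helper below are hypothetical, not part of this patch:

struct bindless_ref {
   unsigned set;    /* descriptor set index, or MAX_SETS for the reserved set */
   unsigned offset; /* base slot, in A6XX_TEX_CONST_DWORDS-dword units */
};

static struct bindless_ref
bindless_base(const struct tu_pipeline_layout *layout, unsigned set,
              const struct tu_descriptor_set_binding_layout *binding)
{
   switch (binding->type) {
   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
      /* dynamic buffers follow the input attachments in the reserved set */
      return (struct bindless_ref) {
         .set = MAX_SETS,
         .offset = layout->set[set].dynamic_offset_start +
                   binding->dynamic_offset_offset +
                   layout->input_attachment_count,
      };
   case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
      /* input attachments come first in the reserved set */
      return (struct bindless_ref) {
         .set = MAX_SETS,
         .offset = layout->set[set].input_attachment_start +
                   binding->input_attachment_offset,
      };
   default:
      /* regular descriptors stay in their own set; binding->offset is in
       * bytes and a slot is 4 * A6XX_TEX_CONST_DWORDS bytes */
      return (struct bindless_ref) {
         .set = set,
         .offset = binding->offset / (4 * A6XX_TEX_CONST_DWORDS),
      };
   }
}

For arrayed bindings, build_bindless() then adds arr_index * stride to this base, where the stride is one slot for input attachments and binding->size / (4 * A6XX_TEX_CONST_DWORDS) slots otherwise.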