From 5fe64837cd43cada770dbbcd25a5439ab04ee97a Mon Sep 17 00:00:00 2001 From: Mark Collins Date: Tue, 3 Oct 2023 18:22:49 +0000 Subject: [PATCH] tu: Support higher descriptor set count for A7XX Allows for the descriptor set count to vary at runtime depending on the specific GPU to allow for 7 usable descriptor sets on A7XX with one reserved for dynamic offsets. Passing VK-CTS: dEQP-VK.binding_model.* Signed-off-by: Mark Collins Part-of: --- src/freedreno/vulkan/tu_clear_blit.cc | 4 ++-- src/freedreno/vulkan/tu_cmd_buffer.cc | 29 +++++++++++++---------- src/freedreno/vulkan/tu_cmd_buffer.h | 2 +- src/freedreno/vulkan/tu_descriptor_set.cc | 5 ++-- src/freedreno/vulkan/tu_descriptor_set.h | 7 +++--- src/freedreno/vulkan/tu_device.cc | 12 ++++++---- src/freedreno/vulkan/tu_device.h | 5 ++++ src/freedreno/vulkan/tu_pipeline.cc | 24 +++++++++++-------- src/freedreno/vulkan/tu_shader.cc | 17 +++++++------ 9 files changed, 62 insertions(+), 43 deletions(-) diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc index b58b118cadd..c5b48b773a9 100644 --- a/src/freedreno/vulkan/tu_clear_blit.cc +++ b/src/freedreno/vulkan/tu_clear_blit.cc @@ -825,8 +825,8 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, .cs_ibo = true, .gfx_ibo = true, .gfx_shared_const = true, - .cs_bindless = 0x1f, - .gfx_bindless = 0x1f,)); + .cs_bindless = CHIP == A6XX ? 0x1f : 0xff, + .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,)); tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs); tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index e83c7d1ddba..f07e8615735 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -183,8 +183,8 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, tu_emit_event_write(cmd_buffer, cs, FD_CACHE_INVALIDATE); if (flushes & TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE) { tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP, - .cs_bindless = 0x1f, - .gfx_bindless = 0x1f, + .cs_bindless = CHIP == A6XX ? 0x1f : 0xff, + .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff, )); } if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES) @@ -1146,8 +1146,8 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) .gfx_ibo = true, .cs_shared_const = true, .gfx_shared_const = true, - .cs_bindless = 0x1f, - .gfx_bindless = 0x1f,)); + .cs_bindless = CHIP == A6XX ? 0x1f : 0xff, + .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,)); tu_cs_emit_wfi(cs); @@ -2395,19 +2395,22 @@ tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd, tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound); } - /* Dynamic descriptors get the last descriptor set. */ + /* Dynamic descriptors get the reserved descriptor set. 
*/ if (descriptors_state->dynamic_bound) { - tu_cs_emit_pkt4(cs, sp_bindless_base_reg + 4 * 2, 2); - tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]); + int reserved_set_idx = cmd->device->physical_device->reserved_set_idx; + assert(reserved_set_idx >= 0); /* reserved set must be bound */ + + tu_cs_emit_pkt4(cs, sp_bindless_base_reg + reserved_set_idx * 2, 2); + tu_cs_emit_qw(cs, descriptors_state->set_iova[reserved_set_idx]); if (CHIP == A6XX) { - tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2); - tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]); + tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + reserved_set_idx * 2, 2); + tu_cs_emit_qw(cs, descriptors_state->set_iova[reserved_set_idx]); } } tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP, - .cs_bindless = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? 0x1f : 0, - .gfx_bindless = bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS ? 0x1f : 0, + .cs_bindless = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? CHIP == A6XX ? 0x1f : 0xff : 0, + .gfx_bindless = bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS ? CHIP == A6XX ? 0x1f : 0xff : 0, )); if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { @@ -2539,6 +2542,7 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, if (layout->dynamic_offset_size) { /* allocate and fill out dynamic descriptor set */ struct tu_cs_memory dynamic_desc_set; + int reserved_set_idx = cmd->device->physical_device->reserved_set_idx; VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_size / (4 * A6XX_TEX_CONST_DWORDS), A6XX_TEX_CONST_DWORDS, &dynamic_desc_set); @@ -2549,7 +2553,8 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors, layout->dynamic_offset_size); - descriptors_state->set_iova[MAX_SETS] = dynamic_desc_set.iova | BINDLESS_DESCRIPTOR_64B; + assert(reserved_set_idx >= 0); /* reserved set must be bound */ + descriptors_state->set_iova[reserved_set_idx] = dynamic_desc_set.iova | BINDLESS_DESCRIPTOR_64B; descriptors_state->dynamic_bound = true; } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 3c55af0fd9c..7538ad3a71c 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -52,7 +52,7 @@ struct tu_descriptor_state struct tu_descriptor_set *sets[MAX_SETS]; struct tu_descriptor_set push_set; uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE]; - uint64_t set_iova[MAX_SETS + 1]; + uint64_t set_iova[MAX_SETS]; uint32_t max_sets_bound; bool dynamic_bound; }; diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc index 8b21c3a90b1..3dd346bb080 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.cc +++ b/src/freedreno/vulkan/tu_descriptor_set.cc @@ -491,7 +491,6 @@ tu_pipeline_layout_init(struct tu_pipeline_layout *layout) unsigned dynamic_offset_size = 0; for (uint32_t set = 0; set < layout->num_sets; set++) { - assert(set < MAX_SETS); layout->set[set].dynamic_offset_start = dynamic_offset_size; if (layout->set[set].layout) @@ -548,7 +547,7 @@ tu_CreatePipelineLayout(VkDevice _device, TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout, pCreateInfo->pSetLayouts[set]); - assert(set < MAX_SETS); + assert(set < device->physical_device->usable_sets); layout->set[set].layout = set_layout; if (set_layout) vk_descriptor_set_layout_ref(&set_layout->vk); @@ -1431,7 +1430,7 @@ tu_CreateDescriptorUpdateTemplate( /* descriptorSetLayout should be ignored for push descriptors * and instead it 
refers to pipelineLayout and set. */ - assert(pCreateInfo->set < MAX_SETS); + assert(pCreateInfo->set < device->physical_device->usable_sets); set_layout = pipeline_layout->set[pCreateInfo->set].layout; } else { TU_FROM_HANDLE(tu_descriptor_set_layout, _set_layout, diff --git a/src/freedreno/vulkan/tu_descriptor_set.h b/src/freedreno/vulkan/tu_descriptor_set.h index 55fe6b9fb6e..c272b084e06 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.h +++ b/src/freedreno/vulkan/tu_descriptor_set.h @@ -10,10 +10,11 @@ #include "vk_descriptor_set_layout.h" -/* The hardware supports 5 descriptor sets, but we reserve 1 for dynamic - * descriptors and input attachments. +/* The hardware supports up to 8 descriptor sets since A7XX. + * Note: This is the maximum across generations, not the maximum for a + * particular generation so it should only be used for allocation. */ -#define MAX_SETS 4 +#define MAX_SETS 8 /* I have no idea what the maximum size is, but the hardware supports very * large numbers of descriptors (at least 2^16). This limit is based on diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 1bdf34164b3..d3343799634 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -628,6 +628,8 @@ tu_physical_device_init(struct tu_physical_device *device, device->ccu_offset_bypass = depth_cache_size; device->ccu_offset_gmem = device->gmem_size - color_cache_size; + + device->usable_sets = device->reserved_set_idx = device->info->a6xx.max_sets - 1; break; } default: @@ -1065,7 +1067,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .maxSamplerAllocationCount = 64 * 1024, .bufferImageGranularity = 64, /* A cache line */ .sparseAddressSpaceSize = 0, - .maxBoundDescriptorSets = MAX_SETS, + .maxBoundDescriptorSets = pdevice->usable_sets, .maxPerStageDescriptorSamplers = max_descriptor_set_size, .maxPerStageDescriptorUniformBuffers = max_descriptor_set_size, .maxPerStageDescriptorStorageBuffers = max_descriptor_set_size, @@ -1327,10 +1329,10 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->bufferlessPushDescriptors = true; properties->allowSamplerImageViewPostSubmitCreation = true; properties->descriptorBufferOffsetAlignment = A6XX_TEX_CONST_DWORDS * 4; - properties->maxDescriptorBufferBindings = MAX_SETS; - properties->maxResourceDescriptorBufferBindings = MAX_SETS; - properties->maxSamplerDescriptorBufferBindings = MAX_SETS; - properties->maxEmbeddedImmutableSamplerBindings = MAX_SETS; + properties->maxDescriptorBufferBindings = pdevice->usable_sets; + properties->maxResourceDescriptorBufferBindings = pdevice->usable_sets; + properties->maxSamplerDescriptorBufferBindings = pdevice->usable_sets; + properties->maxEmbeddedImmutableSamplerBindings = pdevice->usable_sets; properties->maxEmbeddedImmutableSamplers = max_descriptor_set_size; properties->bufferCaptureReplayDescriptorDataSize = 0; properties->imageCaptureReplayDescriptorDataSize = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 4f144168566..ad5ff44e13f 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -89,6 +89,11 @@ struct tu_physical_device uint32_t ccu_offset_gmem; uint32_t ccu_offset_bypass; + /* Amount of usable descriptor sets, this excludes any reserved set */ + uint32_t usable_sets; + /* Index of the reserved descriptor set, may be -1 if unset */ + int32_t reserved_set_idx; + bool has_set_iova; uint64_t va_start; uint64_t va_size; diff 
--git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc index d51e8093a69..1f4eb6051b2 100644 --- a/src/freedreno/vulkan/tu_pipeline.cc +++ b/src/freedreno/vulkan/tu_pipeline.cc @@ -110,7 +110,8 @@ tu6_load_state_size(struct tu_pipeline *pipeline, } static void -tu6_emit_load_state(struct tu_pipeline *pipeline, +tu6_emit_load_state(struct tu_device *device, + struct tu_pipeline *pipeline, struct tu_pipeline_layout *layout) { unsigned size = tu6_load_state_size(pipeline, layout); @@ -165,7 +166,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline, continue; switch (binding->type) { case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - base = MAX_SETS; + assert(device->physical_device->reserved_set_idx >= 0); + base = device->physical_device->reserved_set_idx; offset = (layout->set[i].dynamic_offset_start + binding->dynamic_offset_offset) / 4; FALLTHROUGH; @@ -201,7 +203,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline, break; } case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - base = MAX_SETS; + assert(device->physical_device->reserved_set_idx >= 0); + base = device->physical_device->reserved_set_idx; offset = (layout->set[i].dynamic_offset_start + binding->dynamic_offset_offset) / 4; FALLTHROUGH; @@ -404,19 +407,20 @@ tu6_emit_dynamic_offset(struct tu_cs *cs, const struct tu_shader *shader, struct tu_pipeline_builder *builder) { + const struct tu_physical_device *phys_dev = cs->device->physical_device; if (!xs || shader->const_state.dynamic_offset_loc == UINT32_MAX) return; - tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + MAX_SETS); + tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) | - CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(MAX_SETS, 4))); + CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4))); tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - for (unsigned i = 0; i < MAX_SETS; i++) { + for (unsigned i = 0; i < phys_dev->usable_sets; i++) { unsigned dynamic_offset_start = builder->layout.set[i].dynamic_offset_start / (A6XX_TEX_CONST_DWORDS * 4); tu_cs_emit(cs, i < builder->layout.num_sets ? dynamic_offset_start : 0); @@ -2235,9 +2239,9 @@ tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder, struct tu_graphics_lib_pipeline *library = builder->libraries[i]; builder->layout.num_sets = MAX2(builder->layout.num_sets, library->num_sets); + assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets); for (unsigned j = 0; j < library->num_sets; j++) { - if (library->layouts[i]) - builder->layout.set[i].layout = library->layouts[i]; + builder->layout.set[i].layout = library->layouts[i]; } builder->layout.push_constant_size = library->push_constant_size; @@ -3920,7 +3924,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, /* Blob doesn't preload state on A7XX, likely preloading either * doesn't work or doesn't provide benefits. 
*/ - tu6_emit_load_state(*pipeline, &builder->layout); + tu6_emit_load_state(builder->device, *pipeline, &builder->layout); } } @@ -4370,7 +4374,7 @@ tu_compute_pipeline_create(VkDevice device, pipeline->local_size[i] = v->local_size[i]; if (CHIP == A6XX) { - tu6_emit_load_state(&pipeline->base, layout); + tu6_emit_load_state(dev, &pipeline->base, layout); } tu_append_executable(&pipeline->base, v, nir_initial_disasm); diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 6745da7cd24..a032e0a8e3d 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -167,7 +167,8 @@ lower_load_push_constant(struct tu_device *dev, } static void -lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, +lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b, + nir_intrinsic_instr *instr, struct tu_shader *shader, const struct tu_pipeline_layout *layout) { @@ -203,7 +204,8 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, base = nir_imm_int(b, (layout->set[set].dynamic_offset_start + binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS)); } - set = MAX_SETS; + assert(dev->physical_device->reserved_set_idx >= 0); + set = dev->physical_device->reserved_set_idx; break; default: base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)); @@ -288,7 +290,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev, descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1); } - nir_def *results[MAX_SETS + 1] = { NULL }; + nir_def *results[MAX_SETS] = { NULL }; if (nir_scalar_is_const(scalar_idx)) { nir_def *bindless = @@ -298,7 +300,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev, } nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp); - for (unsigned i = 0; i < MAX_SETS + 1; i++) { + for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) { /* if (base_idx == i) { ... */ nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i)); @@ -336,7 +338,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev, nir_def *result = nir_undef(b, intrin->def.num_components, intrin->def.bit_size); - for (int i = MAX_SETS; i >= 0; i--) { + for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) { nir_pop_if(b, NULL); if (info->has_dest) result = nir_if_phi(b, results[i], result); @@ -433,7 +435,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, return true; case nir_intrinsic_vulkan_resource_index: - lower_vulkan_resource_index(b, instr, shader, layout); + lower_vulkan_resource_index(dev, b, instr, shader, layout); return true; case nir_intrinsic_vulkan_resource_reindex: lower_vulkan_resource_reindex(b, instr); @@ -715,7 +717,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, if (layout->independent_sets) { const_state->dynamic_offset_loc = reserved_consts_vec4 * 4; - reserved_consts_vec4 += DIV_ROUND_UP(MAX_SETS, 4); + assert(dev->physical_device->reserved_set_idx >= 0); + reserved_consts_vec4 += DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4); } else { const_state->dynamic_offset_loc = UINT32_MAX; }
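
For context beyond the diff itself, the per-GPU derivation the patch relies on can be summarised in a small standalone sketch. The struct and helper names below are illustrative only and do not exist in the driver; the numbers reproduce what the patch encodes: 5 hardware bindless bases and a 0x1f HLSQ_INVALIDATE_CMD mask on A6XX, 8 bases and 0xff on A7XX, with the highest-indexed set reserved for dynamic descriptors (usable_sets = reserved_set_idx = max_sets - 1). The mask is written generically here as (1 << max_sets) - 1, which matches the per-CHIP constants the patch hardcodes.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch (not driver code): derive the application-visible
 * descriptor-set count, the reserved set index and the bindless invalidate
 * mask from the number of hardware bindless bases a generation exposes.
 */
struct set_layout_sketch {
   uint32_t usable_sets;      /* sets the application may bind */
   int32_t  reserved_set_idx; /* hardware set used for dynamic descriptors */
   uint32_t bindless_mask;    /* 0x1f on A6XX, 0xff on A7XX */
};

static struct set_layout_sketch
derive_set_layout(uint32_t hw_max_sets)
{
   assert(hw_max_sets >= 2);
   struct set_layout_sketch s;
   s.reserved_set_idx = (int32_t)hw_max_sets - 1;
   s.usable_sets      = hw_max_sets - 1;
   s.bindless_mask    = (1u << hw_max_sets) - 1;
   return s;
}

int main(void)
{
   struct set_layout_sketch a6xx = derive_set_layout(5); /* 4 usable, reserved 4, mask 0x1f */
   struct set_layout_sketch a7xx = derive_set_layout(8); /* 7 usable, reserved 7, mask 0xff */
   printf("A6XX: %u usable, reserved %d, mask 0x%x\n",
          a6xx.usable_sets, a6xx.reserved_set_idx, a6xx.bindless_mask);
   printf("A7XX: %u usable, reserved %d, mask 0x%x\n",
          a7xx.usable_sets, a7xx.reserved_set_idx, a7xx.bindless_mask);
   return 0;
}

Reserving the highest hardware set keeps the application-visible set indices 0..usable_sets-1 contiguous, which is why maxBoundDescriptorSets (and the descriptor-buffer binding limits) are reported as usable_sets rather than max_sets, while MAX_SETS remains the cross-generation maximum used only for allocation sizing.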