diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 12cc4d43a6c..eac61f06682 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -314,6 +314,12 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32; compiler->has_shared_regfile = compiler->gen >= 5; + compiler->push_ubo_with_preamble = options->push_ubo_with_preamble; + + /* The driver can't request this unless preambles are supported. */ + if (options->push_ubo_with_preamble) + assert(compiler->has_preamble); + if (compiler->gen >= 6) { compiler->nir_options = nir_options_a6xx; compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index b0de1df863b..e67ed786564 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -182,6 +182,8 @@ struct ir3_compiler { /* True if preamble instructions (shps, shpe, etc.) are supported */ bool has_preamble; + + bool push_ubo_with_preamble; }; struct ir3_compiler_options { @@ -189,6 +191,13 @@ struct ir3_compiler_options { * VK_EXT_robustness2 and optimizations may have to be more conservative. */ bool robust_ubo_access; + + /* If true, promote UBOs (except for constant data) to constants using ldc.k + * in the preamble. The driver should ignore everything in ubo_state except + * for the constant data UBO, which is excluded because the command pushing + * constants for it can be pre-baked when compiling the shader. + */ + bool push_ubo_with_preamble; }; void ir3_compiler_destroy(struct ir3_compiler *compiler); diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c index dea75e95c9e..7167360e27e 100644 --- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c +++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c @@ -342,6 +342,53 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, return true; } +static bool +copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state) +{ + const struct ir3_ubo_analysis_state *state = &const_state->ubo_state; + + if (state->num_enabled == 0 || + (state->num_enabled == 1 && !state->range[0].ubo.bindless && + state->range[0].ubo.block == const_state->constant_data_ubo)) + return false; + + nir_function_impl *preamble = nir_shader_get_preamble(nir); + nir_builder _b, *b = &_b; + nir_builder_init(b, preamble); + b->cursor = nir_after_cf_list(&preamble->body); + + for (unsigned i = 0; i < state->num_enabled; i++) { + const struct ir3_ubo_range *range = &state->range[i]; + + /* The constant_data UBO is pushed in a different path from normal + * uniforms, and the state is setup earlier so it makes more sense to let + * the CP do it for us. + */ + if (!range->ubo.bindless && + range->ubo.block == const_state->constant_data_ubo) + continue; + + nir_ssa_def *ubo = nir_imm_int(b, range->ubo.block); + if (range->ubo.bindless) { + ubo = nir_bindless_resource_ir3(b, 32, ubo, + .desc_set = range->ubo.bindless_base); + } + + /* ldc.k has a range of only 256, but there are 512 vec4 constants. + * Therefore we may have to split a large copy in two. + */ + unsigned size = (range->end - range->start) / 16; + for (unsigned offset = 0; offset < size; offset += 256) { + nir_copy_ubo_to_uniform_ir3(b, ubo, nir_imm_int(b, range->start / 16 + + offset), + .base = range->offset / 4 + offset * 4, + .range = MIN2(size - offset, 256)); + } + } + + return true; +} + static bool instr_is_load_ubo(nir_instr *instr) { @@ -379,8 +426,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v) memset(state, 0, sizeof(*state)); uint32_t upload_remaining = max_upload; + bool push_ubos = compiler->push_ubo_with_preamble; nir_foreach_function (function, nir) { - if (function->impl) { + if (function->impl && (!push_ubos || !function->is_preamble)) { nir_foreach_block (block, function->impl) { nir_foreach_instr (instr, block) { if (instr_is_load_ubo(instr)) @@ -426,8 +474,15 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v) int num_ubos = 0; bool progress = false; + bool has_preamble = false; + bool push_ubos = compiler->push_ubo_with_preamble; nir_foreach_function (function, nir) { if (function->impl) { + if (function->is_preamble && push_ubos) { + has_preamble = true; + nir_metadata_preserve(function->impl, nir_metadata_all); + continue; + } nir_builder builder; nir_builder_init(&builder, function->impl); nir_foreach_block (block, function->impl) { @@ -448,9 +503,12 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v) * Vulkan's bindless, we don't use the num_ubos field, so we can leave it * incremented. */ - if (nir->info.first_ubo_is_default_ubo) + if (nir->info.first_ubo_is_default_ubo && !push_ubos && !has_preamble) nir->info.num_ubos = num_ubos; + if (compiler->has_preamble && push_ubos) + progress |= copy_ubo_to_uniform(nir, const_state); + return progress; } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index a91d4ea7645..0879de306c5 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -1916,7 +1916,7 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f); cmd->state.desc_sets = tu_cs_draw_state(&cmd->sub_cs, &state_cs, 24); - cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS; + cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD; cs = &state_cs; } else { assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE); @@ -3427,7 +3427,6 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline, { const struct tu_program_descriptor_linkage *link = &pipeline->program.link[type]; - const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state; uint32_t dwords = 0; if (link->push_consts.count > 0) { @@ -3435,37 +3434,6 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline, dwords += 4 + num_units * 4; } - for (uint32_t i = 0; i < state->num_enabled; i++) { - uint32_t size = state->range[i].end - state->range[i].start; - - size = MIN2(size, (16 * link->constlen) - state->range[i].offset); - - if (size == 0) - continue; - - if (!state->range[i].ubo.bindless) - continue; - - uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ? - descriptors_state->dynamic_descriptors : - descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr; - unsigned block = state->range[i].ubo.block; - uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS; - uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16; - desc_size = desc_size > state->range[i].start ? - desc_size - state->range[i].start : 0; - - if (desc_size < size) { - uint32_t zero_size = size - desc_size; - dwords += 4 + zero_size / 4; - size = desc_size; - } - - if (size > 0) { - dwords += 4; - } - } - return dwords; } @@ -3477,8 +3445,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, { const struct tu_program_descriptor_linkage *link = &pipeline->program.link[type]; - const struct ir3_const_state *const_state = &link->const_state; - const struct ir3_ubo_analysis_state *state = &const_state->ubo_state; if (link->push_consts.count > 0) { unsigned num_units = link->push_consts.count; @@ -3494,74 +3460,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, for (unsigned i = 0; i < num_units * 4; i++) tu_cs_emit(cs, push_constants[i + offset * 4]); } - - for (uint32_t i = 0; i < state->num_enabled; i++) { - uint32_t size = state->range[i].end - state->range[i].start; - uint32_t offset = state->range[i].start; - - /* and even if the start of the const buffer is before - * first_immediate, the end may not be: - */ - size = MIN2(size, (16 * link->constlen) - state->range[i].offset); - - if (size == 0) - continue; - - /* things should be aligned to vec4: */ - debug_assert((state->range[i].offset % 16) == 0); - debug_assert((size % 16) == 0); - debug_assert((offset % 16) == 0); - - /* Dig out the descriptor from the descriptor state and read the VA from - * it. All our UBOs are bindless with the exception of the NIR - * constant_data, which is uploaded once in the pipeline. - */ - if (!state->range[i].ubo.bindless) { - assert(state->range[i].ubo.block == const_state->constant_data_ubo); - continue; - } - - uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ? - descriptors_state->dynamic_descriptors : - descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr; - unsigned block = state->range[i].ubo.block; - uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS; - uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32); - uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16; - desc_size = desc_size > state->range[i].start ? - desc_size - state->range[i].start : 0; - - /* Handle null UBO descriptors and out-of-range UBO reads by filling the - * rest with 0, simulating what reading with ldc would do. This behavior - * is required by VK_EXT_robustness2. - */ - if (desc_size < size) { - uint32_t zero_size = size - desc_size; - uint32_t zero_offset = state->range[i].offset + desc_size; - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + zero_size / 4); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(zero_offset / 16) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(zero_size / 16)); - tu_cs_emit_qw(cs, 0); - for (unsigned i = 0; i < zero_size / 4; i++) { - tu_cs_emit(cs, 0); - } - size = desc_size; - } - - if (size > 0) { - assert(va); - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); - tu_cs_emit_qw(cs, va + offset); - } - } } static struct tu_draw_state diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index b64d0fd51af..6ec8fed500d 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1740,6 +1740,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, ir3_compiler_create(NULL, &physical_device->dev_id, &(struct ir3_compiler_options) { .robust_ubo_access = robust_buffer_access2, + .push_ubo_with_preamble = true, }); if (!device->compiler) { result = vk_startup_errorf(physical_device->instance,