ir3, turnip: Use ldc.k to push UBOs
This reuses the same UBO analysis to do the pushing in the shader
preamble via the ldc.k instruction instead of in the driver via
CP_LOAD_STATE6. The const_data UBO is exempted as it uses a different
codepath that isn't as critical.

Don't do this on gallium because there are some regressions. Aztec
Ruins in particular regresses a bit, and nothing I've benchmarked
benefits.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13148>
parent 221a912b8c
commit 9932ca8a3f
5 changed files with 77 additions and 105 deletions
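
The driver-facing surface of the change is a single new compiler option,
push_ubo_with_preamble. As a minimal sketch of how a driver opts in (the
wrapper function and its name are illustrative only, based on the
tu_device.c hunk below, and the "ir3/ir3_compiler.h" include path is
assumed):

#include "ir3/ir3_compiler.h"

/* Illustrative (hypothetical) wrapper: create an ir3 compiler that pushes
 * UBOs from the shader preamble via ldc.k. Turnip sets
 * push_ubo_with_preamble = true; gallium drivers leave it false because of
 * the regressions mentioned in the commit message. ir3_compiler_create()
 * asserts that preambles are supported when the flag is set.
 */
static struct ir3_compiler *
create_preamble_push_compiler(struct fd_device *dev,
                              const struct fd_dev_id *dev_id,
                              bool robust)
{
   return ir3_compiler_create(dev, dev_id,
                              &(struct ir3_compiler_options) {
                                 .robust_ubo_access = robust,
                                 .push_ubo_with_preamble = true,
                              });
}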

src/freedreno/ir3/ir3_compiler.c

@@ -314,6 +314,12 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
    compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
    compiler->has_shared_regfile = compiler->gen >= 5;
 
+   compiler->push_ubo_with_preamble = options->push_ubo_with_preamble;
+
+   /* The driver can't request this unless preambles are supported. */
+   if (options->push_ubo_with_preamble)
+      assert(compiler->has_preamble);
+
    if (compiler->gen >= 6) {
       compiler->nir_options = nir_options_a6xx;
       compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;
src/freedreno/ir3/ir3_compiler.h

@@ -182,6 +182,8 @@ struct ir3_compiler {
 
    /* True if preamble instructions (shps, shpe, etc.) are supported */
    bool has_preamble;
+
+   bool push_ubo_with_preamble;
 };
 
 struct ir3_compiler_options {
@@ -189,6 +191,13 @@ struct ir3_compiler_options {
     * VK_EXT_robustness2 and optimizations may have to be more conservative.
     */
    bool robust_ubo_access;
+
+   /* If true, promote UBOs (except for constant data) to constants using ldc.k
+    * in the preamble. The driver should ignore everything in ubo_state except
+    * for the constant data UBO, which is excluded because the command pushing
+    * constants for it can be pre-baked when compiling the shader.
+    */
+   bool push_ubo_with_preamble;
 };
 
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
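
The ir3_compiler_options comment above defines a compiler/driver contract.
A hedged sketch of the driver side of that contract, using only ubo_state
fields that appear in these diffs (emit_constant_data_range() is a
hypothetical stand-in for a driver's real const-emit path, and mesa's
ir3_shader.h definitions of the ir3 structs are assumed):

/* Hypothetical driver hook; not part of this commit. */
void emit_constant_data_range(const struct ir3_ubo_range *range);

/* Sketch of the push_ubo_with_preamble contract: skip every analyzed UBO
 * range except the constant-data UBO, since the preamble's ldc.k
 * instructions now push the rest.
 */
static void
emit_pushed_ubos(const struct ir3_const_state *const_state)
{
   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;

   for (uint32_t i = 0; i < state->num_enabled; i++) {
      const struct ir3_ubo_range *range = &state->range[i];

      /* Everything except constant data is pushed by ldc.k now. */
      if (range->ubo.bindless ||
          range->ubo.block != const_state->constant_data_ubo)
         continue;

      emit_constant_data_range(range);
   }
}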
src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c

@@ -342,6 +342,53 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
    return true;
 }
 
+static bool
+copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state)
+{
+   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
+
+   if (state->num_enabled == 0 ||
+       (state->num_enabled == 1 && !state->range[0].ubo.bindless &&
+        state->range[0].ubo.block == const_state->constant_data_ubo))
+      return false;
+
+   nir_function_impl *preamble = nir_shader_get_preamble(nir);
+   nir_builder _b, *b = &_b;
+   nir_builder_init(b, preamble);
+   b->cursor = nir_after_cf_list(&preamble->body);
+
+   for (unsigned i = 0; i < state->num_enabled; i++) {
+      const struct ir3_ubo_range *range = &state->range[i];
+
+      /* The constant_data UBO is pushed in a different path from normal
+       * uniforms, and the state is setup earlier so it makes more sense to
+       * let the CP do it for us.
+       */
+      if (!range->ubo.bindless &&
+          range->ubo.block == const_state->constant_data_ubo)
+         continue;
+
+      nir_ssa_def *ubo = nir_imm_int(b, range->ubo.block);
+      if (range->ubo.bindless) {
+         ubo = nir_bindless_resource_ir3(b, 32, ubo,
+                                         .desc_set = range->ubo.bindless_base);
+      }
+
+      /* ldc.k has a range of only 256, but there are 512 vec4 constants.
+       * Therefore we may have to split a large copy in two.
+       */
+      unsigned size = (range->end - range->start) / 16;
+      for (unsigned offset = 0; offset < size; offset += 256) {
+         nir_copy_ubo_to_uniform_ir3(b, ubo,
                                      nir_imm_int(b, range->start / 16 + offset),
+                                     .base = range->offset / 4 + offset * 4,
+                                     .range = MIN2(size - offset, 256));
+      }
+   }
+
+   return true;
+}
+
 static bool
 instr_is_load_ubo(nir_instr *instr)
 {
@@ -379,8 +426,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
    memset(state, 0, sizeof(*state));
 
    uint32_t upload_remaining = max_upload;
+   bool push_ubos = compiler->push_ubo_with_preamble;
    nir_foreach_function (function, nir) {
-      if (function->impl) {
+      if (function->impl && (!push_ubos || !function->is_preamble)) {
          nir_foreach_block (block, function->impl) {
            nir_foreach_instr (instr, block) {
               if (instr_is_load_ubo(instr))
@@ -426,8 +474,15 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
 
    int num_ubos = 0;
    bool progress = false;
+   bool has_preamble = false;
+   bool push_ubos = compiler->push_ubo_with_preamble;
    nir_foreach_function (function, nir) {
       if (function->impl) {
+         if (function->is_preamble && push_ubos) {
+            has_preamble = true;
+            nir_metadata_preserve(function->impl, nir_metadata_all);
+            continue;
+         }
          nir_builder builder;
          nir_builder_init(&builder, function->impl);
          nir_foreach_block (block, function->impl) {
@@ -448,9 +503,12 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
     * Vulkan's bindless, we don't use the num_ubos field, so we can leave it
     * incremented.
     */
-   if (nir->info.first_ubo_is_default_ubo)
+   if (nir->info.first_ubo_is_default_ubo && !push_ubos && !has_preamble)
       nir->info.num_ubos = num_ubos;
 
+   if (compiler->has_preamble && push_ubos)
+      progress |= copy_ubo_to_uniform(nir, const_state);
+
    return progress;
 }
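
To make the 256-vec4 split in copy_ubo_to_uniform() above concrete: a
self-contained sketch of just the loop arithmetic, with a hypothetical
300-vec4 range standing in for a real UBO (each printf line corresponds to
one nir_copy_ubo_to_uniform_ir3() call, and thus one ldc.k, in the real
code):

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   /* Hypothetical range: 300 vec4s, larger than the 256-vec4 limit of a
    * single ldc.k, so the copy is split in two.
    */
   unsigned size = 300;

   for (unsigned offset = 0; offset < size; offset += 256) {
      printf("ldc.k copy: source vec4 %u, count %u\n",
             offset, MIN2(size - offset, 256));
   }
   /* Output:
    *   ldc.k copy: source vec4 0, count 256
    *   ldc.k copy: source vec4 256, count 44
    */
   return 0;
}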
src/freedreno/vulkan/tu_cmd_buffer.c

@@ -1916,7 +1916,7 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
       hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);
 
       cmd->state.desc_sets = tu_cs_draw_state(&cmd->sub_cs, &state_cs, 24);
-      cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS;
+      cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD;
       cs = &state_cs;
    } else {
       assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);
@@ -3427,7 +3427,6 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline,
 {
    const struct tu_program_descriptor_linkage *link =
       &pipeline->program.link[type];
-   const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
    uint32_t dwords = 0;
 
    if (link->push_consts.count > 0) {
@@ -3435,37 +3434,6 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline,
       dwords += 4 + num_units * 4;
    }
 
-   for (uint32_t i = 0; i < state->num_enabled; i++) {
-      uint32_t size = state->range[i].end - state->range[i].start;
-
-      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
-
-      if (size == 0)
-         continue;
-
-      if (!state->range[i].ubo.bindless)
-         continue;
-
-      uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
-         descriptors_state->dynamic_descriptors :
-         descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
-      unsigned block = state->range[i].ubo.block;
-      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
-      uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
-      desc_size = desc_size > state->range[i].start ?
-         desc_size - state->range[i].start : 0;
-
-      if (desc_size < size) {
-         uint32_t zero_size = size - desc_size;
-         dwords += 4 + zero_size / 4;
-         size = desc_size;
-      }
-
-      if (size > 0) {
-         dwords += 4;
-      }
-   }
-
    return dwords;
 }
@@ -3477,8 +3445,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
 {
    const struct tu_program_descriptor_linkage *link =
       &pipeline->program.link[type];
-   const struct ir3_const_state *const_state = &link->const_state;
-   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
 
    if (link->push_consts.count > 0) {
       unsigned num_units = link->push_consts.count;
@@ -3494,74 +3460,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
       for (unsigned i = 0; i < num_units * 4; i++)
          tu_cs_emit(cs, push_constants[i + offset * 4]);
    }
 
-   for (uint32_t i = 0; i < state->num_enabled; i++) {
-      uint32_t size = state->range[i].end - state->range[i].start;
-      uint32_t offset = state->range[i].start;
-
-      /* and even if the start of the const buffer is before
-       * first_immediate, the end may not be:
-       */
-      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
-
-      if (size == 0)
-         continue;
-
-      /* things should be aligned to vec4: */
-      debug_assert((state->range[i].offset % 16) == 0);
-      debug_assert((size % 16) == 0);
-      debug_assert((offset % 16) == 0);
-
-      /* Dig out the descriptor from the descriptor state and read the VA from
-       * it. All our UBOs are bindless with the exception of the NIR
-       * constant_data, which is uploaded once in the pipeline.
-       */
-      if (!state->range[i].ubo.bindless) {
-         assert(state->range[i].ubo.block == const_state->constant_data_ubo);
-         continue;
-      }
-
-      uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
-         descriptors_state->dynamic_descriptors :
-         descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
-      unsigned block = state->range[i].ubo.block;
-      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
-      uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
-      uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
-      desc_size = desc_size > state->range[i].start ?
-         desc_size - state->range[i].start : 0;
-
-      /* Handle null UBO descriptors and out-of-range UBO reads by filling the
-       * rest with 0, simulating what reading with ldc would do. This behavior
-       * is required by VK_EXT_robustness2.
-       */
-      if (desc_size < size) {
-         uint32_t zero_size = size - desc_size;
-         uint32_t zero_offset = state->range[i].offset + desc_size;
-         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + zero_size / 4);
-         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(zero_offset / 16) |
-                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                    CP_LOAD_STATE6_0_NUM_UNIT(zero_size / 16));
-         tu_cs_emit_qw(cs, 0);
-         for (unsigned i = 0; i < zero_size / 4; i++) {
-            tu_cs_emit(cs, 0);
-         }
-         size = desc_size;
-      }
-
-      if (size > 0) {
-         assert(va);
-         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
-         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
-                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
-         tu_cs_emit_qw(cs, va + offset);
-      }
-   }
 }
 
 static struct tu_draw_state
src/freedreno/vulkan/tu_device.c

@@ -1740,6 +1740,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       ir3_compiler_create(NULL, &physical_device->dev_id,
                           &(struct ir3_compiler_options) {
                               .robust_ubo_access = robust_buffer_access2,
+                              .push_ubo_with_preamble = true,
                           });
    if (!device->compiler) {
       result = vk_startup_errorf(physical_device->instance,