ir3, turnip: Use ldc.k to push UBOs

This reuses the same UBO analysis to do the pushing in the shader
preamble via the ldc.k instruction instead of in the driver via
CP_LOAD_STATE6. The const_data UBO is exempted as it uses a different
codepath that isn't as critical.

Don't do this on gallium because there are some regressions. Aztec Ruins
in particular regresses a bit, and nothing I've benchmarked benefits.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13148>
Connor Abbott 2021-09-24 19:08:39 +02:00 committed by Marge Bot
parent 221a912b8c
commit 9932ca8a3f
5 changed files with 77 additions and 105 deletions
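
For orientation before the diffs: a driver opts in per compiler instance via ir3_compiler_options. The sketch below mirrors the turnip hunk at the end of this commit (the dev_id and robust variables are placeholders); gallium simply leaves the flag false and keeps pushing UBOs with CP_LOAD_STATE6.

   /* Sketch: requesting preamble-based UBO pushing at compiler creation. */
   struct ir3_compiler *compiler =
      ir3_compiler_create(NULL /* fd_device */, dev_id,
                          &(struct ir3_compiler_options) {
                             .robust_ubo_access = robust,
                             .push_ubo_with_preamble = true,
                          });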


@@ -314,6 +314,12 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
compiler->has_shared_regfile = compiler->gen >= 5;
compiler->push_ubo_with_preamble = options->push_ubo_with_preamble;
/* The driver can't request this unless preambles are supported. */
if (options->push_ubo_with_preamble)
assert(compiler->has_preamble);
if (compiler->gen >= 6) {
compiler->nir_options = nir_options_a6xx;
compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;


@@ -182,6 +182,8 @@ struct ir3_compiler {
/* True if preamble instructions (shps, shpe, etc.) are supported */
bool has_preamble;
bool push_ubo_with_preamble;
};
struct ir3_compiler_options {
@@ -189,6 +191,13 @@ struct ir3_compiler_options {
* VK_EXT_robustness2 and optimizations may have to be more conservative.
*/
bool robust_ubo_access;
/* If true, promote UBOs (except for constant data) to constants using ldc.k
* in the preamble. The driver should ignore everything in ubo_state except
* for the constant data UBO, which is excluded because the command pushing
* constants for it can be pre-baked when compiling the shader.
*/
bool push_ubo_with_preamble;
};
void ir3_compiler_destroy(struct ir3_compiler *compiler);
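
The comment above implies a division of labor on the driver side: when push_ubo_with_preamble is set, only the constant-data UBO range in ubo_state still needs a driver-side upload, since the preamble's ldc.k covers everything else. A minimal sketch of that consumer logic (not turnip's actual code; the function name and the emit placeholder are made up, and the include is the assumed header for ir3_const_state):

   #include <stdbool.h>
   #include <stdint.h>
   #include "ir3/ir3_shader.h" /* assumed: ir3_const_state, ir3_ubo_analysis_state */

   /* Walk the analyzed UBO ranges and upload only what the preamble cannot
    * cover. With push_ubo_with_preamble, ldc.k in the shader preamble pushes
    * every range except the constant-data UBO, whose upload is pre-baked when
    * the shader is compiled. */
   static void
   emit_driver_ubo_consts(const struct ir3_const_state *const_state,
                          bool push_ubo_with_preamble)
   {
      const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;

      for (uint32_t i = 0; i < state->num_enabled; i++) {
         const struct ir3_ubo_range *range = &state->range[i];

         bool is_const_data = !range->ubo.bindless &&
                              range->ubo.block == const_state->constant_data_ubo;

         if (push_ubo_with_preamble && !is_const_data)
            continue; /* the preamble's ldc.k already pushes this range */

         /* ... emit CP_LOAD_STATE6 (or equivalent) for range->start..end ... */
      }
   }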


@@ -342,6 +342,53 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
return true;
}
static bool
copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state)
{
const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
if (state->num_enabled == 0 ||
(state->num_enabled == 1 && !state->range[0].ubo.bindless &&
state->range[0].ubo.block == const_state->constant_data_ubo))
return false;
nir_function_impl *preamble = nir_shader_get_preamble(nir);
nir_builder _b, *b = &_b;
nir_builder_init(b, preamble);
b->cursor = nir_after_cf_list(&preamble->body);
for (unsigned i = 0; i < state->num_enabled; i++) {
const struct ir3_ubo_range *range = &state->range[i];
/* The constant_data UBO is pushed in a different path from normal
* uniforms, and the state is set up earlier so it makes more sense to let
* the CP do it for us.
*/
if (!range->ubo.bindless &&
range->ubo.block == const_state->constant_data_ubo)
continue;
nir_ssa_def *ubo = nir_imm_int(b, range->ubo.block);
if (range->ubo.bindless) {
ubo = nir_bindless_resource_ir3(b, 32, ubo,
.desc_set = range->ubo.bindless_base);
}
/* ldc.k has a range of only 256, but there are 512 vec4 constants.
* Therefore we may have to split a large copy in two.
*/
unsigned size = (range->end - range->start) / 16;
for (unsigned offset = 0; offset < size; offset += 256) {
nir_copy_ubo_to_uniform_ir3(b, ubo, nir_imm_int(b, range->start / 16 +
offset),
.base = range->offset / 4 + offset * 4,
.range = MIN2(size - offset, 256));
}
}
return true;
}
static bool
instr_is_load_ubo(nir_instr *instr)
{
@@ -379,8 +426,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
memset(state, 0, sizeof(*state));
uint32_t upload_remaining = max_upload;
bool push_ubos = compiler->push_ubo_with_preamble;
nir_foreach_function (function, nir) {
if (function->impl) {
if (function->impl && (!push_ubos || !function->is_preamble)) {
nir_foreach_block (block, function->impl) {
nir_foreach_instr (instr, block) {
if (instr_is_load_ubo(instr))
@@ -426,8 +474,15 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
int num_ubos = 0;
bool progress = false;
bool has_preamble = false;
bool push_ubos = compiler->push_ubo_with_preamble;
nir_foreach_function (function, nir) {
if (function->impl) {
if (function->is_preamble && push_ubos) {
has_preamble = true;
nir_metadata_preserve(function->impl, nir_metadata_all);
continue;
}
nir_builder builder;
nir_builder_init(&builder, function->impl);
nir_foreach_block (block, function->impl) {
@@ -448,9 +503,12 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
* Vulkan's bindless, we don't use the num_ubos field, so we can leave it
* incremented.
*/
if (nir->info.first_ubo_is_default_ubo)
if (nir->info.first_ubo_is_default_ubo && !push_ubos && !has_preamble)
nir->info.num_ubos = num_ubos;
if (compiler->has_preamble && push_ubos)
progress |= copy_ubo_to_uniform(nir, const_state);
return progress;
}
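
To make the split in copy_ubo_to_uniform() concrete: ldc.k copies at most 256 vec4s at a time, so a promoted range larger than that is emitted as two copies. The standalone sketch below works through the offset/base/range arithmetic for a hypothetical 300-vec4 range (the numbers are illustrative; MIN2 is redefined locally so it compiles on its own):

   #include <stdio.h>

   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   int main(void)
   {
      /* Hypothetical UBO range: 300 vec4s, starting at byte 0 of the UBO,
       * destined for byte offset 512 of the constant file. */
      unsigned range_start = 0;       /* range->start, in bytes  */
      unsigned range_end = 300 * 16;  /* range->end, in bytes    */
      unsigned range_offset = 512;    /* range->offset, in bytes */

      unsigned size = (range_end - range_start) / 16; /* size in vec4s */

      /* Mirrors the loop in copy_ubo_to_uniform(): each ldc.k copy covers at
       * most 256 vec4s, so 300 vec4s become copies of 256 and 44. */
      for (unsigned offset = 0; offset < size; offset += 256) {
         printf("copy: src vec4 %u -> dst dword %u, %u vec4s\n",
                range_start / 16 + offset,      /* UBO source offset   */
                range_offset / 4 + offset * 4,  /* const dest (dwords) */
                MIN2(size - offset, 256));
      }
      return 0;
   }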


@@ -1916,7 +1916,7 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);
cmd->state.desc_sets = tu_cs_draw_state(&cmd->sub_cs, &state_cs, 24);
cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS;
cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD;
cs = &state_cs;
} else {
assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);
@@ -3427,7 +3427,6 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline,
{
const struct tu_program_descriptor_linkage *link =
&pipeline->program.link[type];
const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
uint32_t dwords = 0;
if (link->push_consts.count > 0) {
@@ -3435,37 +3434,6 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline,
dwords += 4 + num_units * 4;
}
for (uint32_t i = 0; i < state->num_enabled; i++) {
uint32_t size = state->range[i].end - state->range[i].start;
size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
if (size == 0)
continue;
if (!state->range[i].ubo.bindless)
continue;
uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
descriptors_state->dynamic_descriptors :
descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
unsigned block = state->range[i].ubo.block;
uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
desc_size = desc_size > state->range[i].start ?
desc_size - state->range[i].start : 0;
if (desc_size < size) {
uint32_t zero_size = size - desc_size;
dwords += 4 + zero_size / 4;
size = desc_size;
}
if (size > 0) {
dwords += 4;
}
}
return dwords;
}
@@ -3477,8 +3445,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
{
const struct tu_program_descriptor_linkage *link =
&pipeline->program.link[type];
const struct ir3_const_state *const_state = &link->const_state;
const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
if (link->push_consts.count > 0) {
unsigned num_units = link->push_consts.count;
@@ -3494,74 +3460,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
for (unsigned i = 0; i < num_units * 4; i++)
tu_cs_emit(cs, push_constants[i + offset * 4]);
}
for (uint32_t i = 0; i < state->num_enabled; i++) {
uint32_t size = state->range[i].end - state->range[i].start;
uint32_t offset = state->range[i].start;
/* and even if the start of the const buffer is before
* first_immediate, the end may not be:
*/
size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
if (size == 0)
continue;
/* things should be aligned to vec4: */
debug_assert((state->range[i].offset % 16) == 0);
debug_assert((size % 16) == 0);
debug_assert((offset % 16) == 0);
/* Dig out the descriptor from the descriptor state and read the VA from
* it. All our UBOs are bindless with the exception of the NIR
* constant_data, which is uploaded once in the pipeline.
*/
if (!state->range[i].ubo.bindless) {
assert(state->range[i].ubo.block == const_state->constant_data_ubo);
continue;
}
uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
descriptors_state->dynamic_descriptors :
descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
unsigned block = state->range[i].ubo.block;
uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
desc_size = desc_size > state->range[i].start ?
desc_size - state->range[i].start : 0;
/* Handle null UBO descriptors and out-of-range UBO reads by filling the
* rest with 0, simulating what reading with ldc would do. This behavior
* is required by VK_EXT_robustness2.
*/
if (desc_size < size) {
uint32_t zero_size = size - desc_size;
uint32_t zero_offset = state->range[i].offset + desc_size;
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + zero_size / 4);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(zero_offset / 16) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
CP_LOAD_STATE6_0_NUM_UNIT(zero_size / 16));
tu_cs_emit_qw(cs, 0);
for (unsigned i = 0; i < zero_size / 4; i++) {
tu_cs_emit(cs, 0);
}
size = desc_size;
}
if (size > 0) {
assert(va);
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
tu_cs_emit_qw(cs, va + offset);
}
}
}
static struct tu_draw_state


@@ -1740,6 +1740,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
ir3_compiler_create(NULL, &physical_device->dev_id,
&(struct ir3_compiler_options) {
.robust_ubo_access = robust_buffer_access2,
.push_ubo_with_preamble = true,
});
if (!device->compiler) {
result = vk_startup_errorf(physical_device->instance,