mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 06:58:05 +02:00
tu: Rewrite inline uniform implementation
Now we always push the inline uniforms, using an indirect CP_LOAD_STATE. There is just enough space to be able to always push them if we expose the minimum possible per-stage limits. This aligns us with Qualcomm and removes the need to setup a uniform descriptor which will be problematic. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18703>
This commit is contained in:
parent
e402d2dbe9
commit
37cde2c634
6 changed files with 206 additions and 79 deletions
|
|
@ -2091,6 +2091,9 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
|
||||||
|
|
||||||
descriptors_state->sets[idx] = set;
|
descriptors_state->sets[idx] = set;
|
||||||
|
|
||||||
|
if (set->layout->has_inline_uniforms)
|
||||||
|
cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
|
||||||
|
|
||||||
if (!set->layout->dynamic_offset_size)
|
if (!set->layout->dynamic_offset_size)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
|
@ -4452,6 +4455,8 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline,
|
||||||
dwords += 4 + num_units;
|
dwords += 4 + num_units;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dwords += 4 * link->tu_const_state.num_inline_ubos;
|
||||||
|
|
||||||
return dwords;
|
return dwords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -4459,6 +4464,7 @@ static void
|
||||||
tu6_emit_user_consts(struct tu_cs *cs,
|
tu6_emit_user_consts(struct tu_cs *cs,
|
||||||
const struct tu_pipeline *pipeline,
|
const struct tu_pipeline *pipeline,
|
||||||
gl_shader_stage type,
|
gl_shader_stage type,
|
||||||
|
struct tu_descriptor_state *descriptors,
|
||||||
uint32_t *push_constants)
|
uint32_t *push_constants)
|
||||||
{
|
{
|
||||||
const struct tu_program_descriptor_linkage *link =
|
const struct tu_program_descriptor_linkage *link =
|
||||||
|
|
@ -4480,6 +4486,20 @@ tu6_emit_user_consts(struct tu_cs *cs,
|
||||||
for (unsigned i = 0; i < num_units; i++)
|
for (unsigned i = 0; i < num_units; i++)
|
||||||
tu_cs_emit(cs, push_constants[i + offset]);
|
tu_cs_emit(cs, push_constants[i + offset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Emit loads of inline uniforms. These load directly from the uniform's
|
||||||
|
* storage space inside the descriptor set.
|
||||||
|
*/
|
||||||
|
for (unsigned i = 0; i < link->tu_const_state.num_inline_ubos; i++) {
|
||||||
|
const struct tu_inline_ubo *ubo = &link->tu_const_state.ubos[i];
|
||||||
|
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
|
||||||
|
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo->const_offset_vec4) |
|
||||||
|
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
|
||||||
|
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
|
||||||
|
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
|
||||||
|
CP_LOAD_STATE6_0_NUM_UNIT(ubo->size_vec4));
|
||||||
|
tu_cs_emit_qw(cs, descriptors->sets[ubo->base]->va + ubo->offset);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
@ -4518,14 +4538,14 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
|
||||||
uint32_t dwords = 0;
|
uint32_t dwords = 0;
|
||||||
|
|
||||||
if (pipeline->shared_consts.dwords > 0) {
|
if (pipeline->shared_consts.dwords > 0) {
|
||||||
dwords = pipeline->shared_consts.dwords + 4;
|
dwords += pipeline->shared_consts.dwords + 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (compute) {
|
||||||
|
dwords += tu6_user_consts_size(pipeline, MESA_SHADER_COMPUTE);
|
||||||
} else {
|
} else {
|
||||||
if (compute) {
|
for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
|
||||||
dwords = tu6_user_consts_size(pipeline, MESA_SHADER_COMPUTE);
|
dwords += tu6_user_consts_size(pipeline, type);
|
||||||
} else {
|
|
||||||
for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
|
|
||||||
dwords += tu6_user_consts_size(pipeline, type);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return dwords;
|
return dwords;
|
||||||
|
|
@ -4554,13 +4574,17 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd,
|
||||||
&pipeline->program.link[i];
|
&pipeline->program.link[i];
|
||||||
assert(!link->tu_const_state.push_consts.dwords);
|
assert(!link->tu_const_state.push_consts.dwords);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (compute) {
|
||||||
|
tu6_emit_user_consts(&cs, pipeline, MESA_SHADER_COMPUTE,
|
||||||
|
tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE),
|
||||||
|
cmd->push_constants);
|
||||||
} else {
|
} else {
|
||||||
if (compute) {
|
struct tu_descriptor_state *descriptors =
|
||||||
tu6_emit_user_consts(&cs, pipeline, MESA_SHADER_COMPUTE, cmd->push_constants);
|
tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
|
||||||
} else {
|
for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
|
||||||
for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
|
tu6_emit_user_consts(&cs, pipeline, type, descriptors, cmd->push_constants);
|
||||||
tu6_emit_user_consts(&cs, pipeline, type, cmd->push_constants);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
|
return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
|
||||||
|
|
|
||||||
|
|
@ -109,6 +109,12 @@
|
||||||
*/
|
*/
|
||||||
#define MAX_UNIFORM_BUFFER_RANGE 0x10000
|
#define MAX_UNIFORM_BUFFER_RANGE 0x10000
|
||||||
|
|
||||||
|
/* Use the minimum maximum to guarantee that it can always fit in the safe
|
||||||
|
* const file size, even with maximum push constant usage and driver params.
|
||||||
|
*/
|
||||||
|
#define MAX_INLINE_UBO_RANGE 256
|
||||||
|
#define MAX_INLINE_UBOS 4
|
||||||
|
|
||||||
#define A6XX_TEX_CONST_DWORDS 16
|
#define A6XX_TEX_CONST_DWORDS 16
|
||||||
#define A6XX_TEX_SAMP_DWORDS 4
|
#define A6XX_TEX_SAMP_DWORDS 4
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -66,8 +66,7 @@ descriptor_size(struct tu_device *dev,
|
||||||
return A6XX_TEX_CONST_DWORDS * 4;
|
return A6XX_TEX_CONST_DWORDS * 4;
|
||||||
}
|
}
|
||||||
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
|
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
|
||||||
return A6XX_TEX_CONST_DWORDS * 4 +
|
return binding->descriptorCount;
|
||||||
ALIGN(binding->descriptorCount, A6XX_TEX_CONST_DWORDS * 4);
|
|
||||||
default:
|
default:
|
||||||
return A6XX_TEX_CONST_DWORDS * 4;
|
return A6XX_TEX_CONST_DWORDS * 4;
|
||||||
}
|
}
|
||||||
|
|
@ -557,8 +556,8 @@ tu_descriptor_set_create(struct tu_device *device,
|
||||||
struct tu_descriptor_set_binding_layout *binding =
|
struct tu_descriptor_set_binding_layout *binding =
|
||||||
&layout->binding[layout->binding_count - 1];
|
&layout->binding[layout->binding_count - 1];
|
||||||
if (binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
|
if (binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
|
||||||
layout_size = binding->offset + A6XX_TEX_CONST_DWORDS * 4 +
|
layout_size = binding->offset +
|
||||||
ALIGN(variable_count, A6XX_TEX_CONST_DWORDS * 4);
|
ALIGN(variable_count, 4 * A6XX_TEX_CONST_DWORDS);
|
||||||
} else {
|
} else {
|
||||||
uint32_t stride = binding->size;
|
uint32_t stride = binding->size;
|
||||||
layout_size = binding->offset + variable_count * stride;
|
layout_size = binding->offset + variable_count * stride;
|
||||||
|
|
@ -635,24 +634,6 @@ tu_descriptor_set_create(struct tu_device *device,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (layout->has_inline_uniforms) {
|
|
||||||
for (unsigned i = 0; i < layout->binding_count; i++) {
|
|
||||||
if (layout->binding[i].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
uint32_t *ptr = set->mapped_ptr + layout->binding[i].offset / 4;
|
|
||||||
uint64_t va = set->va + layout->binding[i].offset +
|
|
||||||
A6XX_TEX_CONST_DWORDS * 4;
|
|
||||||
uint32_t size =
|
|
||||||
(layout->has_variable_descriptors && i == layout->binding_count - 1) ?
|
|
||||||
variable_count : layout->binding[i].size - A6XX_TEX_CONST_DWORDS * 4;
|
|
||||||
size = ALIGN_POT(size, 16) / 16;
|
|
||||||
|
|
||||||
ptr[0] = A6XX_UBO_0_BASE_LO(va);
|
|
||||||
ptr[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(size);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
vk_descriptor_set_layout_ref(&layout->vk);
|
vk_descriptor_set_layout_ref(&layout->vk);
|
||||||
list_addtail(&set->pool_link, &pool->desc_sets);
|
list_addtail(&set->pool_link, &pool->desc_sets);
|
||||||
|
|
||||||
|
|
@ -705,12 +686,12 @@ tu_CreateDescriptorPool(VkDevice _device,
|
||||||
DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
|
DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
|
||||||
|
|
||||||
if (inline_info) {
|
if (inline_info) {
|
||||||
/* In addition to the size of the descriptors, we have to factor in the
|
/* We have to factor in the padding for each binding. The sizes are 4
|
||||||
* padding for each binding. The sizes are 4 aligned but we have to
|
* aligned but we have to align to 4 * A6XX_TEX_CONST_DWORDS bytes, and in
|
||||||
* align to a descriptor size, and in the worst case each inline
|
* the worst case each inline binding has a size of 4 bytes and we have
|
||||||
* binding has a size of 4 bytes and we have to pad each one out.
|
* to pad each one out.
|
||||||
*/
|
*/
|
||||||
bo_size += (2 * 4 * A6XX_TEX_CONST_DWORDS - 4) *
|
bo_size += (4 * A6XX_TEX_CONST_DWORDS - 4) *
|
||||||
inline_info->maxInlineUniformBlockBindings;
|
inline_info->maxInlineUniformBlockBindings;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1086,8 +1067,8 @@ tu_update_descriptor_sets(const struct tu_device *device,
|
||||||
* rule.
|
* rule.
|
||||||
*
|
*
|
||||||
* This means we can't just do a straight memcpy, because due to
|
* This means we can't just do a straight memcpy, because due to
|
||||||
* alignment padding and the descriptor itself there are gaps between
|
* alignment padding there are gaps between sequential bindings. We
|
||||||
* sequential bindings. We have to loop over each binding updated.
|
* have to loop over each binding updated.
|
||||||
*/
|
*/
|
||||||
const VkWriteDescriptorSetInlineUniformBlock *inline_write =
|
const VkWriteDescriptorSetInlineUniformBlock *inline_write =
|
||||||
vk_find_struct_const(writeset->pNext,
|
vk_find_struct_const(writeset->pNext,
|
||||||
|
|
@ -1096,9 +1077,8 @@ tu_update_descriptor_sets(const struct tu_device *device,
|
||||||
const uint8_t *src = inline_write->pData;
|
const uint8_t *src = inline_write->pData;
|
||||||
uint32_t dst_offset = writeset->dstArrayElement;
|
uint32_t dst_offset = writeset->dstArrayElement;
|
||||||
do {
|
do {
|
||||||
uint8_t *dst = (uint8_t *)(ptr + A6XX_TEX_CONST_DWORDS) + dst_offset;
|
uint8_t *dst = (uint8_t *)(ptr) + dst_offset;
|
||||||
uint32_t binding_size =
|
uint32_t binding_size = binding_layout->size - dst_offset;
|
||||||
binding_layout->size - A6XX_TEX_CONST_DWORDS * 4 - dst_offset;
|
|
||||||
uint32_t to_write = MIN2(remaining, binding_size);
|
uint32_t to_write = MIN2(remaining, binding_size);
|
||||||
memcpy(dst, src, to_write);
|
memcpy(dst, src, to_write);
|
||||||
|
|
||||||
|
|
@ -1188,12 +1168,12 @@ tu_update_descriptor_sets(const struct tu_device *device,
|
||||||
uint32_t remaining = copyset->descriptorCount;
|
uint32_t remaining = copyset->descriptorCount;
|
||||||
uint32_t src_start = copyset->srcArrayElement;
|
uint32_t src_start = copyset->srcArrayElement;
|
||||||
uint32_t dst_start = copyset->dstArrayElement;
|
uint32_t dst_start = copyset->dstArrayElement;
|
||||||
uint8_t *src = (uint8_t *)(src_ptr + A6XX_TEX_CONST_DWORDS) + src_start;
|
uint8_t *src = (uint8_t *)(src_ptr) + src_start;
|
||||||
uint8_t *dst = (uint8_t *)(dst_ptr + A6XX_TEX_CONST_DWORDS) + dst_start;
|
uint8_t *dst = (uint8_t *)(dst_ptr) + dst_start;
|
||||||
uint32_t src_remaining =
|
uint32_t src_remaining =
|
||||||
src_binding_layout->size - src_start - 4 * A6XX_TEX_CONST_DWORDS;
|
src_binding_layout->size - src_start;
|
||||||
uint32_t dst_remaining =
|
uint32_t dst_remaining =
|
||||||
dst_binding_layout->size - dst_start - 4 * A6XX_TEX_CONST_DWORDS;
|
dst_binding_layout->size - dst_start;
|
||||||
do {
|
do {
|
||||||
uint32_t to_write = MIN3(remaining, src_remaining, dst_remaining);
|
uint32_t to_write = MIN3(remaining, src_remaining, dst_remaining);
|
||||||
memcpy(dst, src, to_write);
|
memcpy(dst, src, to_write);
|
||||||
|
|
@ -1294,7 +1274,7 @@ tu_CreateDescriptorUpdateTemplate(
|
||||||
set_layout->binding + entry->dstBinding;
|
set_layout->binding + entry->dstBinding;
|
||||||
uint32_t dst_start = entry->dstArrayElement;
|
uint32_t dst_start = entry->dstArrayElement;
|
||||||
do {
|
do {
|
||||||
uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4;
|
uint32_t size = binding_layout->size;
|
||||||
uint32_t count = MIN2(remaining, size - dst_start);
|
uint32_t count = MIN2(remaining, size - dst_start);
|
||||||
remaining -= count;
|
remaining -= count;
|
||||||
binding_layout++;
|
binding_layout++;
|
||||||
|
|
@ -1343,8 +1323,8 @@ tu_CreateDescriptorUpdateTemplate(
|
||||||
/* See comment in update_descriptor_sets() */
|
/* See comment in update_descriptor_sets() */
|
||||||
do {
|
do {
|
||||||
dst_offset =
|
dst_offset =
|
||||||
binding_layout->offset + A6XX_TEX_CONST_DWORDS * 4 + dst_start;
|
binding_layout->offset + dst_start;
|
||||||
uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4;
|
uint32_t size = binding_layout->size;
|
||||||
uint32_t count = MIN2(remaining, size - dst_start);
|
uint32_t count = MIN2(remaining, size - dst_start);
|
||||||
templ->entry[j++] = (struct tu_descriptor_update_template_entry) {
|
templ->entry[j++] = (struct tu_descriptor_update_template_entry) {
|
||||||
.descriptor_type = entry->descriptorType,
|
.descriptor_type = entry->descriptorType,
|
||||||
|
|
|
||||||
|
|
@ -1097,19 +1097,12 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
|
||||||
p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
|
p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
|
||||||
p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
|
p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
|
||||||
|
|
||||||
/* Inline uniform buffers are just normal UBOs */
|
p->maxInlineUniformBlockSize = MAX_INLINE_UBO_RANGE;
|
||||||
p->maxInlineUniformBlockSize = MAX_UNIFORM_BUFFER_RANGE;
|
p->maxPerStageDescriptorInlineUniformBlocks = MAX_INLINE_UBOS;
|
||||||
|
p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UBOS;
|
||||||
/* Halve the normal limit on the number of descriptors, see below. */
|
p->maxDescriptorSetInlineUniformBlocks = MAX_INLINE_UBOS;
|
||||||
p->maxPerStageDescriptorInlineUniformBlocks = max_descriptor_set_size / 2;
|
p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UBOS;
|
||||||
p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2;
|
p->maxInlineUniformTotalSize = MAX_INLINE_UBOS * MAX_INLINE_UBO_RANGE;
|
||||||
p->maxDescriptorSetInlineUniformBlocks = max_descriptor_set_size / 2;
|
|
||||||
p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2;
|
|
||||||
/* Because we halve the normal limit on the number of descriptors, in the
|
|
||||||
* worst case each descriptor takes up half the space, leaving the rest for
|
|
||||||
* the actual data.
|
|
||||||
*/
|
|
||||||
p->maxInlineUniformTotalSize = MAX_SET_SIZE / 2;
|
|
||||||
|
|
||||||
p->integerDotProduct8BitUnsignedAccelerated = false;
|
p->integerDotProduct8BitUnsignedAccelerated = false;
|
||||||
p->integerDotProduct8BitSignedAccelerated = false;
|
p->integerDotProduct8BitSignedAccelerated = false;
|
||||||
|
|
|
||||||
|
|
@ -178,6 +178,9 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
|
||||||
&set_layout->binding[binding];
|
&set_layout->binding[binding];
|
||||||
nir_ssa_def *base;
|
nir_ssa_def *base;
|
||||||
|
|
||||||
|
if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
|
||||||
|
return;
|
||||||
|
|
||||||
shader->active_desc_sets |= 1u << set;
|
shader->active_desc_sets |= 1u << set;
|
||||||
|
|
||||||
switch (binding_layout->type) {
|
switch (binding_layout->type) {
|
||||||
|
|
@ -205,16 +208,9 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
nir_ssa_def *shift;
|
unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
|
||||||
|
assert(util_is_power_of_two_nonzero(stride));
|
||||||
if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
|
nir_ssa_def *shift = nir_imm_int(b, util_logbase2(stride));
|
||||||
/* Inline uniform blocks cannot have arrays so the stride is unused */
|
|
||||||
shift = nir_imm_int(b, 0);
|
|
||||||
} else {
|
|
||||||
unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
|
|
||||||
assert(util_is_power_of_two_nonzero(stride));
|
|
||||||
shift = nir_imm_int(b, util_logbase2(stride));
|
|
||||||
}
|
|
||||||
|
|
||||||
nir_ssa_def *def = nir_vec3(b, nir_imm_int(b, set),
|
nir_ssa_def *def = nir_vec3(b, nir_imm_int(b, set),
|
||||||
nir_iadd(b, base,
|
nir_iadd(b, base,
|
||||||
|
|
@ -608,6 +604,70 @@ lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Since we always push inline uniforms into constant memory, lower loads of
|
||||||
|
* them to load_uniform which turns into constant memory loads.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
lower_inline_ubo(nir_builder *b, nir_instr *instr, void *cb_data)
|
||||||
|
{
|
||||||
|
if (instr->type != nir_instr_type_intrinsic)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||||
|
if (intrin->intrinsic != nir_intrinsic_load_ubo)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
struct lower_instr_params *params = cb_data;
|
||||||
|
struct tu_shader *shader = params->shader;
|
||||||
|
const struct tu_pipeline_layout *layout = params->layout;
|
||||||
|
|
||||||
|
nir_binding binding = nir_chase_binding(intrin->src[0]);
|
||||||
|
|
||||||
|
if (!binding.success)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
struct tu_descriptor_set_layout *set_layout = layout->set[binding.desc_set].layout;
|
||||||
|
struct tu_descriptor_set_binding_layout *binding_layout =
|
||||||
|
&set_layout->binding[binding.binding];
|
||||||
|
|
||||||
|
if (binding_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* lookup the const offset of the inline UBO */
|
||||||
|
struct tu_const_state *const_state = &shader->const_state;
|
||||||
|
|
||||||
|
unsigned base = UINT_MAX;
|
||||||
|
for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
|
||||||
|
if (const_state->ubos[i].base == binding.desc_set &&
|
||||||
|
const_state->ubos[i].offset == binding_layout->offset) {
|
||||||
|
base = const_state->ubos[i].const_offset_vec4 * 4;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (base == UINT_MAX) {
|
||||||
|
/* Assume we're loading out-of-bounds from a 0-sized inline uniform
|
||||||
|
* filtered out below.
|
||||||
|
*/
|
||||||
|
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
|
||||||
|
nir_ssa_undef(b, intrin->num_components,
|
||||||
|
intrin->dest.ssa.bit_size));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_ssa_def *offset = intrin->src[1].ssa;
|
||||||
|
|
||||||
|
b->cursor = nir_before_instr(instr);
|
||||||
|
nir_ssa_def *uniform =
|
||||||
|
nir_load_uniform(b, intrin->num_components,
|
||||||
|
intrin->dest.ssa.bit_size,
|
||||||
|
nir_ishr_imm(b, offset, 2), .base = base);
|
||||||
|
|
||||||
|
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, uniform);
|
||||||
|
nir_instr_remove(instr);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/* Figure out the range of push constants that we're actually going to push to
|
/* Figure out the range of push constants that we're actually going to push to
|
||||||
* the shader, and tell the backend to reserve this range when pushing UBO
|
* the shader, and tell the backend to reserve this range when pushing UBO
|
||||||
* constants.
|
* constants.
|
||||||
|
|
@ -678,6 +738,47 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
|
||||||
const_state->dynamic_offset_loc = UINT32_MAX;
|
const_state->dynamic_offset_loc = UINT32_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Reserve space for inline uniforms, so we can always load them from
|
||||||
|
* constants and not setup a UBO descriptor for them.
|
||||||
|
*/
|
||||||
|
for (unsigned set = 0; set < layout->num_sets; set++) {
|
||||||
|
const struct tu_descriptor_set_layout *desc_layout =
|
||||||
|
layout->set[set].layout;
|
||||||
|
|
||||||
|
if (!desc_layout || !desc_layout->has_inline_uniforms)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
for (unsigned b = 0; b < desc_layout->binding_count; b++) {
|
||||||
|
const struct tu_descriptor_set_binding_layout *binding =
|
||||||
|
&desc_layout->binding[b];
|
||||||
|
|
||||||
|
if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
|
||||||
|
continue;
|
||||||
|
if (!(binding->shader_stages &
|
||||||
|
mesa_to_vk_shader_stage(shader->info.stage)))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Workaround a CTS bug by ignoring zero-sized inline uniform
|
||||||
|
* blocks that aren't being properly filtered out when creating the
|
||||||
|
* descriptor set layout, see
|
||||||
|
* https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4115
|
||||||
|
*/
|
||||||
|
if (binding->size == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
|
||||||
|
unsigned size_vec4 = DIV_ROUND_UP(binding->size, 16);
|
||||||
|
const_state->ubos[const_state->num_inline_ubos++] = (struct tu_inline_ubo) {
|
||||||
|
.base = set,
|
||||||
|
.offset = binding->offset,
|
||||||
|
.const_offset_vec4 = reserved_consts_vec4,
|
||||||
|
.size_vec4 = size_vec4,
|
||||||
|
};
|
||||||
|
|
||||||
|
reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
tu_shader->reserved_user_consts_vec4 = reserved_consts_vec4;
|
tu_shader->reserved_user_consts_vec4 = reserved_consts_vec4;
|
||||||
|
|
||||||
struct lower_instr_params params = {
|
struct lower_instr_params params = {
|
||||||
|
|
@ -686,10 +787,18 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
|
||||||
.layout = layout,
|
.layout = layout,
|
||||||
};
|
};
|
||||||
|
|
||||||
bool progress = nir_shader_instructions_pass(shader,
|
bool progress = false;
|
||||||
lower_instr,
|
if (const_state->num_inline_ubos) {
|
||||||
nir_metadata_none,
|
progress |= nir_shader_instructions_pass(shader,
|
||||||
¶ms);
|
lower_inline_ubo,
|
||||||
|
nir_metadata_none,
|
||||||
|
¶ms);
|
||||||
|
}
|
||||||
|
|
||||||
|
progress |= nir_shader_instructions_pass(shader,
|
||||||
|
lower_instr,
|
||||||
|
nir_metadata_none,
|
||||||
|
¶ms);
|
||||||
|
|
||||||
/* Remove now-unused variables so that when we gather the shader info later
|
/* Remove now-unused variables so that when we gather the shader info later
|
||||||
* they won't be counted.
|
* they won't be counted.
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,19 @@
|
||||||
|
|
||||||
#include "tu_common.h"
|
#include "tu_common.h"
|
||||||
|
|
||||||
|
struct tu_inline_ubo
|
||||||
|
{
|
||||||
|
/* Push the data at BINDLESS_BASE[base] + offset */
|
||||||
|
unsigned base;
|
||||||
|
unsigned offset;
|
||||||
|
|
||||||
|
/* Push it to this location in the const file, in vec4s */
|
||||||
|
unsigned const_offset_vec4;
|
||||||
|
|
||||||
|
/* How much to push */
|
||||||
|
unsigned size_vec4;
|
||||||
|
};
|
||||||
|
|
||||||
struct tu_push_constant_range
|
struct tu_push_constant_range
|
||||||
{
|
{
|
||||||
uint32_t lo;
|
uint32_t lo;
|
||||||
|
|
@ -22,6 +35,8 @@ struct tu_const_state
|
||||||
{
|
{
|
||||||
struct tu_push_constant_range push_consts;
|
struct tu_push_constant_range push_consts;
|
||||||
uint32_t dynamic_offset_loc;
|
uint32_t dynamic_offset_loc;
|
||||||
|
unsigned num_inline_ubos;
|
||||||
|
struct tu_inline_ubo ubos[MAX_INLINE_UBOS];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct tu_shader
|
struct tu_shader
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue