pco, pvr: push constants support

Signed-off-by: Simon Perretta <simon.perretta@imgtec.com>
Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36412>
This commit is contained in:
Simon Perretta 2024-12-09 12:58:28 +00:00 committed by Marge Bot
parent 3c321b77ec
commit fd31165c38
8 changed files with 244 additions and 74 deletions

View file

@ -37,7 +37,7 @@
#define PVR_MAX_VIEWPORTS 1U
#define PVR_MAX_NEG_OFFSCREEN_OFFSET 4096U
#define PVR_MAX_PUSH_CONSTANTS_SIZE 256U
#define PVR_MAX_PUSH_CONSTANTS_SIZE 128U
#define PVR_POINT_SIZE_RANGE_MIN 1.0f
#define PVR_POINT_SIZE_RANGE_MAX 511.0f

View file

@ -113,6 +113,13 @@ typedef struct _pco_descriptor_set_data {
bool used; /** Whether the descriptor set is used by the shader. */
} pco_descriptor_set_data;
/** PCO push constant data. */
typedef struct _pco_push_const_data {
pco_range range; /** Push constant range; start/count allocated out of shared registers. */
/* Amount of push-constant data the shader uses; ~0U is a sentinel meaning
 * "derive the size from the pipeline layout's push ranges" (see the
 * consumer in pvr_setup_descriptors, which divides by 4 to get dwords).
 * NOTE(review): units (bytes vs dwords) before the /4 conversion — confirm.
 */
unsigned used;
} pco_push_const_data;
/** PCO common data. */
typedef struct _pco_common_data {
/** System value mappings. */
@ -121,6 +128,9 @@ typedef struct _pco_common_data {
/** Descriptor set data. */
pco_descriptor_set_data desc_sets[PVR_MAX_DESCRIPTOR_SETS];
/** Push constant data. */
pco_push_const_data push_consts;
unsigned temps; /** Number of allocated temp registers. */
unsigned vtxins; /** Number of allocated vertex input registers. */
unsigned interns; /** Number of allocated internal registers. */

View file

@ -23,6 +23,7 @@ static const struct spirv_to_nir_options spirv_options = {
.ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
.ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
.push_const_addr_format = nir_address_format_32bit_offset,
.min_ubo_alignment = PVR_UNIFORM_BUFFER_OFFSET_ALIGNMENT,
.min_ssbo_alignment = PVR_STORAGE_BUFFER_OFFSET_ALIGNMENT,
@ -249,6 +250,12 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
nir_var_mem_ubo | nir_var_mem_ssbo,
nir_address_format_vec2_index_32bit_offset);
NIR_PASS(_,
nir,
nir_lower_explicit_io,
nir_var_mem_push_const,
spirv_options.push_const_addr_format);
NIR_PASS(_, nir, pco_nir_lower_vk, &data->common);
NIR_PASS(_,

View file

@ -457,6 +457,42 @@ static unsigned fetch_resource_base_reg_packed(const pco_common_data *common,
return fetch_resource_base_reg(common, desc_set, binding, elem, is_img_smp);
}
/** Translates a nir_intrinsic_load_push_constant into PCO instruction(s).
 *
 * Push constants live in a contiguous run of shared registers described by
 * common->push_consts.range. For a compile-time-constant offset, the source
 * is rewritten to address the shared register directly; for a dynamic
 * offset, the offset is first moved into index register 0 and an indexed
 * shared-register access is emitted.
 *
 * \param tctx Translation context.
 * \param intr The NIR intrinsic; src[0] is the push-constant offset.
 * \param dest Destination reference; components must be 32-bit.
 * \param src Translated offset source (only used in the dynamic case).
 * \return The emitted mov instruction that produces dest.
 */
static pco_instr *trans_load_push_constant(trans_ctx *tctx,
nir_intrinsic_instr *intr,
pco_ref dest,
pco_ref src)
{
const pco_common_data *common = &tctx->shader->data.common;
unsigned chans = pco_ref_get_chans(dest);
/* Only 32-bit push-constant loads are supported. */
ASSERTED unsigned bits = pco_ref_get_bits(dest);
assert(bits == 32);
/* The shader must have had a push-constant range allocated in shareds. */
assert(common->push_consts.range.count > 0);
if (nir_src_is_const(intr->src[0])) {
/* Constant offset: address the shared register directly.
 * NOTE(review): offset is compared against range.count with no unit
 * conversion — this assumes NIR lowering already produced the offset in
 * the same units as range.count (dwords); confirm against the lowering
 * passes in pco_lower_nir.
 */
unsigned offset = nir_src_as_uint(intr->src[0]);
assert(offset < common->push_consts.range.count);
unsigned reg_index = common->push_consts.range.start + offset;
src = pco_ref_hwreg_vec(reg_index, PCO_REG_CLASS_SHARED, chans);
return pco_mov(&tctx->b, dest, src, .rpt = chans);
}
/* Use the dynamic offset to set up the index register. */
pco_ref idx_reg = pco_ref_hwreg_idx(0, 0, PCO_REG_CLASS_INDEX);
pco_mov(&tctx->b, idx_reg, src);
/* Indexed access: base is the start of the push-constant shared range,
 * index register 0 supplies the dynamic offset.
 */
pco_ref idx_src = pco_ref_hwreg_idx_vec(0,
common->push_consts.range.start,
PCO_REG_CLASS_SHARED,
chans);
return pco_mov(&tctx->b, dest, idx_src, .rpt = chans);
}
static pco_instr *trans_load_buffer(trans_ctx *tctx,
nir_intrinsic_instr *intr,
pco_ref dest,
@ -889,6 +925,10 @@ static pco_instr *trans_intr(trans_ctx *tctx, nir_intrinsic_instr *intr)
UNREACHABLE("Unsupported stage for \"nir_intrinsic_store_output\".");
break;
case nir_intrinsic_load_push_constant:
instr = trans_load_push_constant(tctx, intr, dest, src[0]);
break;
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ssbo:
instr = trans_load_buffer(tctx, intr, dest, src[1]);

View file

@ -1517,7 +1517,12 @@ void pvr_pds_generate_descriptor_upload_program(
pvr_init_pds_const_map_entry_write_state(info, &entry_write_state);
assert(!input_program->buffer_count);
/* 1 DOUTD per compile time buffer: */
for (unsigned int index = 0; index < input_program->buffer_count; index++) {
num_consts32++;
num_consts64++;
total_dma_count++;
}
/* DOUTU for the secondary update program requires a 64-bit constant. */
if (input_program->secondary_program_present)
@ -1561,6 +1566,42 @@ void pvr_pds_generate_descriptor_upload_program(
next_const32++;
}
for (unsigned int index = 0; index < input_program->buffer_count; index++) {
struct pvr_pds_buffer *buffer = &input_program->buffers[index];
bool last_dma = (++running_dma_count == total_dma_count);
bool halt = last_dma && !input_program->secondary_program_present;
switch (buffer->type) {
case PVR_BUFFER_TYPE_PUSH_CONSTS: {
struct pvr_const_map_entry_special_buffer *special_buffer_entry;
special_buffer_entry =
pvr_prepare_next_pds_const_map_entry(&entry_write_state,
sizeof(*special_buffer_entry));
special_buffer_entry->type =
PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER;
special_buffer_entry->buffer_type = buffer->type;
break;
}
}
entry_write_state.entry->const_offset = next_const64 * 2;
PVR_PDS_MODE_TOGGLE(code_section,
instruction,
pvr_encode_burst_cs(&entry_write_state,
last_dma,
halt,
next_const32,
next_const64,
buffer->size_in_dwords,
buffer->destination));
next_const64++;
next_const32++;
}
if (total_dma_count != running_dma_count)
fprintf(stderr, "Mismatch in DMA count\n");

View file

@ -2690,28 +2690,43 @@ void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
state->dirty.index_buffer_binding = true;
}
void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
VkPipelineLayout layout,
VkShaderStageFlags stageFlags,
uint32_t offset,
uint32_t size,
const void *pValues)
/** Copies new push-constant data into a per-stage CPU-side shadow buffer.
 *
 * Updates the bytes_updated high-water mark (so later uploads only transfer
 * the written prefix) and marks the stage's push constants dirty so they get
 * re-uploaded before the next draw/dispatch that uses them.
 *
 * \param push_consts Per-stage push-constant state to update.
 * \param offset Byte offset into the push-constant block.
 * \param size Number of bytes to write.
 * \param data Source data (from VkPushConstantsInfoKHR::pValues).
 */
static void update_push_constants(struct pvr_push_constants *push_consts,
uint32_t offset,
uint32_t size,
const void *data)
{
#if MESA_DEBUG
/* NOTE(review): 'ending' is unused within this function as rendered — the
 * matching bounds check, pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE),
 * appears displaced further down in this diff rendering; it belongs here,
 * before the memcpy. Computed in u64 to avoid u32 overflow of offset+size.
 */
const uint64_t ending = (uint64_t)offset + (uint64_t)size;
#endif
memcpy(&push_consts->data[offset], data, size);
/* Track the furthest byte ever written so uploads can be sized to it. */
push_consts->bytes_updated = MAX2(push_consts->bytes_updated, offset + size);
push_consts->dirty = true;
}
/** Vulkan entry point: vkCmdPushConstants2KHR.
 *
 * Fans the incoming push-constant write out to the per-stage-allocation
 * shadow buffers (vertex/geometry, fragment, compute) according to
 * stageFlags, via update_push_constants().
 *
 * NOTE(review): this block is a diff rendering with the removed lines of the
 * old pvr_CmdPushConstants interleaved (the +/- markers were lost); the
 * stray lines are flagged inline below and are not part of the new function.
 */
void pvr_CmdPushConstants2KHR(VkCommandBuffer commandBuffer,
const VkPushConstantsInfoKHR *pPushConstantsInfo)
{
PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
/* Vertex-stage writes land in the vertex/geometry allocation.
 * NOTE(review): only VERTEX_BIT is tested here — geometry/tessellation
 * stage flags appear unhandled; confirm whether those stages are
 * unsupported by this driver.
 */
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_VERTEX_BIT) {
update_push_constants(
&state->push_consts[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY],
pPushConstantsInfo->offset,
pPushConstantsInfo->size,
pPushConstantsInfo->pValues);
}
/* NOTE(review): displaced removed line from the old pvr_CmdPushConstants —
 * 'ending' is not in scope here; this assert belongs in
 * update_push_constants().
 */
pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_FRAGMENT_BIT) {
update_push_constants(&state->push_consts[PVR_STAGE_ALLOCATION_FRAGMENT],
pPushConstantsInfo->offset,
pPushConstantsInfo->size,
pPushConstantsInfo->pValues);
}
/* NOTE(review): the next three lines are displaced removed lines from the
 * old single-buffer implementation (offset/pValues/size/stageFlags are not
 * in scope); the commit deletes them in favour of the per-stage calls.
 */
memcpy(&state->push_constants.data[offset], pValues, size);
state->push_constants.dirty_stages |= stageFlags;
state->push_constants.uploaded = false;
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
update_push_constants(&state->push_consts[PVR_STAGE_ALLOCATION_COMPUTE],
pPushConstantsInfo->offset,
pPushConstantsInfo->size,
pPushConstantsInfo->pValues);
}
}
static VkResult
@ -3505,6 +3520,9 @@ pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
return VK_SUCCESS;
}
static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer,
enum pvr_stage_allocation stage);
static VkResult pvr_setup_descriptor_mappings(
struct pvr_cmd_buffer *const cmd_buffer,
enum pvr_stage_allocation stage,
@ -3591,6 +3609,45 @@ static VkResult pvr_setup_descriptor_mappings(
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
const struct pvr_const_map_entry_special_buffer *special_buff_entry =
(struct pvr_const_map_entry_special_buffer *)entries;
switch (special_buff_entry->buffer_type) {
case PVR_BUFFER_TYPE_PUSH_CONSTS: {
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
/* Handle running with undefined push constants. */
if (!state->push_consts[stage].dev_addr.addr) {
state->push_consts[stage].dirty = true;
assert(!state->push_consts[stage].bytes_updated);
state->push_consts[stage].bytes_updated =
sizeof(state->push_consts[stage].data);
result = pvr_cmd_upload_push_consts(cmd_buffer, stage);
/* Reset. */
state->push_consts[stage].bytes_updated = 0;
if (result != VK_SUCCESS)
return result;
}
PVR_WRITE(qword_buffer,
state->push_consts[stage].dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
default:
UNREACHABLE("Unsupported special buffer type.");
}
entries += sizeof(*special_buff_entry);
break;
}
default:
UNREACHABLE("Unsupported map entry type.");
}
@ -3921,45 +3978,26 @@ static void pvr_compute_update_kernel(
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
/** Uploads a stage's dirty push constants to a fresh device buffer.
 *
 * No-op if the stage's shadow copy is clean. Otherwise uploads only the
 * written prefix (bytes_updated) to a new suballocation, records its device
 * address for the descriptor-mapping code, and clears the dirty flag.
 *
 * NOTE(review): diff rendering with +/- markers lost — the old single-buffer
 * version's lines are interleaved below and flagged inline; they are not
 * part of the new function.
 *
 * \param cmd_buffer Command buffer owning the push-constant state.
 * \param stage Stage allocation whose push constants to upload.
 * \return VK_SUCCESS, or the upload failure result.
 */
/* NOTE(review): old (removed) signature — superseded by the two-parameter
 * per-stage version on the next lines. */
static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer,
enum pvr_stage_allocation stage)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_push_constants *push_consts = &state->push_consts[stage];
struct pvr_suballoc_bo *suballoc_bo;
VkResult result;
/* TODO: Here are some possible optimizations/things to consider:
 *
 * - Currently we upload maxPushConstantsSize. The application might only
 * be using a portion of that so we might end up with unused memory.
 * Should we be smarter about this. If we intend to upload the push
 * consts into shareds, we definitely want to do avoid reserving unused
 * regs.
 *
 * - For now we have to upload to a new buffer each time since the shaders
 * access the push constants from memory. If we were to reuse the same
 * buffer we might update the contents out of sync with job submission
 * and the shaders will see the updated contents while the command
 * buffer was still being recorded and not yet submitted.
 * If we were to upload the push constants directly to shared regs we
 * could reuse the same buffer (avoiding extra allocation overhead)
 * since the contents will be DMAed only on job submission when the
 * control stream is processed and the PDS program is executed. This
 * approach would also allow us to avoid regenerating the PDS data
 * section in some cases since the buffer address will be constants.
 */
/* NOTE(review): old (removed) early-out — replaced by the dirty check. */
if (cmd_buffer->state.push_constants.uploaded)
if (!push_consts->dirty)
return VK_SUCCESS;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
/* NOTE(review): next two lines are the old (removed) arguments; the new
 * version uploads only the bytes actually written. */
state->push_constants.data,
sizeof(state->push_constants.data),
push_consts->data,
push_consts->bytes_updated,
&suballoc_bo);
if (result != VK_SUCCESS)
return result;
/* NOTE(review): next two lines are the old (removed) epilogue. */
cmd_buffer->state.push_constants.dev_addr = suballoc_bo->dev_addr;
cmd_buffer->state.push_constants.uploaded = true;
push_consts->dev_addr = suballoc_bo->dev_addr;
push_consts->dirty = false;
return VK_SUCCESS;
}
@ -3983,15 +4021,14 @@ static void pvr_cmd_dispatch(
sub_cmd->uses_atomic_ops |= cs_data->common.uses.atomics;
sub_cmd->uses_barrier |= cs_data->common.uses.barriers;
if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
result = pvr_cmd_upload_push_consts(cmd_buffer);
if (state->push_consts[PVR_STAGE_ALLOCATION_COMPUTE].dirty) {
result =
pvr_cmd_upload_push_consts(cmd_buffer, PVR_STAGE_ALLOCATION_COMPUTE);
if (result != VK_SUCCESS)
return;
/* Regenerate the PDS program to use the new push consts buffer. */
state->dirty.compute_desc_dirty = true;
state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
}
if (state->dirty.compute_desc_dirty ||
@ -5279,7 +5316,7 @@ pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer)
header->pres_varying_word2 || header->pres_stream_out_program ||
state->dirty.fragment_descriptors || state->dirty.vis_test ||
state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass ||
state->push_constants.dirty_stages & VK_SHADER_STAGE_FRAGMENT_BIT ||
state->push_consts[PVR_STAGE_ALLOCATION_FRAGMENT].dirty ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
@ -5658,15 +5695,23 @@ static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
}
if (state->push_constants.dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS) {
result = pvr_cmd_upload_push_consts(cmd_buffer);
if (result != VK_SUCCESS)
return result;
}
state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding;
state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
if (state->push_consts[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY].dirty) {
result = pvr_cmd_upload_push_consts(cmd_buffer,
PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY);
state->dirty.vertex_descriptors = true;
}
if (state->push_consts[PVR_STAGE_ALLOCATION_FRAGMENT].dirty) {
result =
pvr_cmd_upload_push_consts(cmd_buffer, PVR_STAGE_ALLOCATION_FRAGMENT);
state->dirty.fragment_descriptors = true;
}
/* Account for dirty descriptor set. */
/* TODO: It could be the case that there are no descriptors for a specific
* stage, or that the update descriptors aren't active for a particular
@ -5679,12 +5724,6 @@ static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
state->dirty.fragment_descriptors = true;
state->dirty.vertex_descriptors |=
state->push_constants.dirty_stages &
(VK_SHADER_STAGE_ALL_GRAPHICS & ~VK_SHADER_STAGE_FRAGMENT_BIT);
state->dirty.fragment_descriptors |= state->push_constants.dirty_stages &
VK_SHADER_STAGE_FRAGMENT_BIT;
if (state->dirty.fragment_descriptors) {
result = pvr_setup_descriptor_mappings(
cmd_buffer,
@ -5730,8 +5769,6 @@ static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
state->dirty.vertex_bindings = false;
state->dirty.vis_test = false;
state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
return VK_SUCCESS;
}

View file

@ -580,6 +580,14 @@ static VkResult pvr_pds_descriptor_program_create_and_upload(
goto err_free_static_consts;
}
if (data->common.push_consts.range.count > 0) {
program.buffers[program.buffer_count++] = (struct pvr_pds_buffer){
.type = PVR_BUFFER_TYPE_PUSH_CONSTS,
.size_in_dwords = data->common.push_consts.range.count,
.destination = data->common.push_consts.range.start,
};
}
pds_info->entries_size_in_bytes = const_entries_size_in_bytes;
pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);
@ -2003,6 +2011,32 @@ static void pvr_setup_descriptors(pco_data *data,
};
}
}
if (data->common.push_consts.used > 0) {
unsigned count = data->common.push_consts.used;
if (count == ~0U) {
count = 0;
for (unsigned u = 0; u < layout->push_range_count; ++u) {
VkPushConstantRange *range = &layout->push_ranges[u];
if (!(mesa_to_vk_shader_stage(stage) & range->stageFlags))
continue;
count = MAX2(count, range->offset + range->size);
}
assert(!(count % 4));
count = count / 4;
}
data->common.push_consts.range = (pco_range){
.start = data->common.shareds,
.count = count,
};
data->common.shareds += count;
}
assert(data->common.shareds < 256);
}
@ -2098,6 +2132,9 @@ static void pvr_postprocess_shader_data(pco_data *data,
pvr_setup_descriptors(data, nir, layout);
/* TODO: common things, like large constants being put into shareds. */
assert(data->common.shareds < 256);
assert(data->common.coeffs < 256);
}
/* Compiles and uploads shaders and PDS programs. */

View file

@ -690,6 +690,13 @@ struct pvr_cmd_buffer_draw_state {
bool draw_indexed;
};
/* Per-stage-allocation push-constant state (one instance per
 * pvr_stage_allocation in pvr_cmd_buffer_state::push_consts).
 */
struct pvr_push_constants {
/* CPU-side shadow of the push-constant values written via
 * vkCmdPushConstants*.
 */
uint8_t data[PVR_MAX_PUSH_CONSTANTS_SIZE];
/* High-water mark of bytes ever written into data[]; uploads transfer only
 * this prefix.
 */
unsigned bytes_updated;
/* Device address of the most recent upload of data[]. */
pvr_dev_addr_t dev_addr;
/* True when data[] has been modified since the last upload. */
bool dirty;
};
struct pvr_cmd_buffer_state {
/* Pipeline binding. */
const struct pvr_graphics_pipeline *gfx_pipeline;
@ -712,17 +719,6 @@ struct pvr_cmd_buffer_state {
VkIndexType type;
} index_buffer_binding;
struct {
uint8_t data[PVR_MAX_PUSH_CONSTANTS_SIZE];
VkShaderStageFlags dirty_stages;
/* Indicates if the whole push constants buffer was uploaded. This avoids
* having to upload the same stuff twice when the push constant range
* covers both gfx and compute.
*/
bool uploaded;
pvr_dev_addr_t dev_addr;
} push_constants;
/* Array size of barriers_needed is based on number of sync pipeline
* stages.
*/
@ -731,6 +727,8 @@ struct pvr_cmd_buffer_state {
struct pvr_descriptor_state gfx_desc_state;
struct pvr_descriptor_state compute_desc_state;
struct pvr_push_constants push_consts[PVR_STAGE_ALLOCATION_COUNT];
VkFormat depth_format;
struct {