diff --git a/src/imagination/vulkan/pds/pvr_pds.h b/src/imagination/vulkan/pds/pvr_pds.h
index 7f5396bda3a..ccf95e05bb1 100644
--- a/src/imagination/vulkan/pds/pvr_pds.h
+++ b/src/imagination/vulkan/pds/pvr_pds.h
@@ -96,6 +96,10 @@ enum pvr_pds_vertex_attrib_program_type {
    PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT
 };
 
+enum pvr_pds_addr_literal_type {
+   PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
+};
+
 /*****************************************************************************
  Structure definitions
 *****************************************************************************/
@@ -881,6 +885,11 @@ struct pvr_pds_descriptor_set {
     */
 };
 
+struct pvr_pds_addr_literal {
+   enum pvr_pds_addr_literal_type type;
+   unsigned int destination;
+};
+
 #define PVR_BUFFER_TYPE_UBO (0)
 #define PVR_BUFFER_TYPE_COMPILE_TIME (1)
 #define PVR_BUFFER_TYPE_BLEND_CONSTS (2)
@@ -914,6 +923,9 @@ struct pvr_pds_descriptor_program_input {
    unsigned int descriptor_set_count;
    struct pvr_pds_descriptor_set descriptor_sets[8];
 
+   unsigned int addr_literal_count;
+   struct pvr_pds_addr_literal addr_literals[8];
+
    /* "State" buffers, including:
    * compile-time constants
    * blend constants
@@ -1002,6 +1014,9 @@ struct pvr_pds_vertex_primary_program_input {
 #define PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_WORKGROUP (13)
 #define PVR_PDS_CONST_MAP_ENTRY_TYPE_COND_RENDER (14)
 
+#define PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER (15)
+#define PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL (16)
+
 /* We pack all the following structs tightly into a buffer using += sizeof(x)
  * offsets, this can lead to data that is not native aligned. Supplying the
  * packed attribute indicates that unaligned accesses may be required, and the
@@ -1135,6 +1150,18 @@ struct pvr_pds_const_map_entry_cond_render {
    uint32_t cond_render_pred_temp;
 } PVR_ALIGNED;
 
+struct pvr_pds_const_map_entry_addr_literal_buffer {
+   uint8_t type;
+   uint8_t const_offset;
+
+   uint32_t size;
+} PVR_ALIGNED;
+
+struct pvr_pds_const_map_entry_addr_literal {
+   uint8_t type;
+   enum pvr_pds_addr_literal_type addr_type;
+} PVR_ALIGNED;
+
 struct pvr_pds_info {
    uint32_t temps_required;
    uint32_t code_size_in_dwords;
diff --git a/src/imagination/vulkan/pds/pvr_xgl_pds.c b/src/imagination/vulkan/pds/pvr_xgl_pds.c
index e9360cf5b93..dbd18d6720e 100644
--- a/src/imagination/vulkan/pds/pvr_xgl_pds.c
+++ b/src/imagination/vulkan/pds/pvr_xgl_pds.c
@@ -1514,6 +1514,13 @@ void pvr_pds_generate_descriptor_upload_program(
    num_consts64 = input_program->descriptor_set_count;
    total_dma_count = input_program->descriptor_set_count;
 
+   /* 1 DOUTD for buffer containing address literals.
+    */
+   if (input_program->addr_literal_count > 0) {
+      num_consts32++;
+      num_consts64++;
+      total_dma_count++;
+   }
+
    pvr_init_pds_const_map_entry_write_state(info, &entry_write_state);
 
    for (unsigned int index = 0; index < input_program->buffer_count; index++) {
@@ -1543,6 +1550,67 @@
    next_const64 = 0;
    next_const32 = num_consts64 * 2;
 
+   if (input_program->addr_literal_count > 0) {
+      bool last_dma = (++running_dma_count == total_dma_count);
+      bool halt = last_dma && !input_program->secondary_program_present;
+
+      unsigned int size_in_dwords = input_program->addr_literal_count *
+                                    sizeof(uint64_t) / sizeof(uint32_t);
+      unsigned int destination = input_program->addr_literals[0].destination;
+
+      struct pvr_pds_const_map_entry_addr_literal_buffer
+         *addr_literal_buffer_entry;
+
+      addr_literal_buffer_entry = pvr_prepare_next_pds_const_map_entry(
+         &entry_write_state,
+         sizeof(*addr_literal_buffer_entry));
+
+      addr_literal_buffer_entry->type =
+         PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER;
+      addr_literal_buffer_entry->size = size_in_dwords * sizeof(uint32_t);
+      addr_literal_buffer_entry->const_offset = next_const64 * 2;
+
+      for (unsigned int i = 0; i < input_program->addr_literal_count; i++) {
+         struct pvr_pds_const_map_entry_addr_literal *addr_literal_entry;
+
+         /* Check that the destinations for the addr literals are contiguous.
+          * Non-contiguous ranges are not supported, as they would require
+          * either a single large buffer with wasted memory for the DMA, or
+          * multiple buffers to DMA.
+          */
+         if (i > 0) {
+            const uint32_t current_addr_literal_destination =
+               input_program->addr_literals[i].destination;
+            const uint32_t previous_addr_literal_destination =
+               input_program->addr_literals[i - 1].destination;
+
+            /* 2 regs to store a 64-bit address. */
+            assert(current_addr_literal_destination ==
+                   previous_addr_literal_destination + 2);
+         }
+
+         addr_literal_entry =
+            pvr_prepare_next_pds_const_map_entry(&entry_write_state,
+                                                 sizeof(*addr_literal_entry));
+
+         addr_literal_entry->type = PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL;
+         addr_literal_entry->addr_type = input_program->addr_literals[i].type;
+      }
+
+      PVR_PDS_MODE_TOGGLE(code_section,
+                          instruction,
+                          pvr_encode_burst_cs(&entry_write_state,
+                                              last_dma,
+                                              halt,
+                                              next_const32,
+                                              next_const64,
+                                              size_in_dwords,
+                                              destination));
+
+      next_const64++;
+      next_const32++;
+   }
+
    /* For each descriptor set perform a DOUTD.
     */
    for (unsigned int descriptor_index = 0;
         descriptor_index < input_program->descriptor_set_count;
diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c
index b19221bbedd..ed1ce3d1ea0 100644
--- a/src/imagination/vulkan/pvr_cmd_buffer.c
+++ b/src/imagination/vulkan/pvr_cmd_buffer.c
@@ -3207,13 +3207,208 @@ static VkResult pvr_setup_descriptor_mappings_old(
 }
 
 static VkResult
-pvr_setup_descriptor_mappings_new(uint32_t *const descriptor_data_offset_out)
+pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer *const cmd_buffer,
+                                     enum pvr_stage_allocation stage,
+                                     pvr_dev_addr_t *addr_out)
 {
-   *descriptor_data_offset_out = ~0;
+   uint64_t bound_desc_sets[PVR_MAX_DESCRIPTOR_SETS];
+   const struct pvr_descriptor_state *desc_state;
+   struct pvr_bo *bo;
+   VkResult result;
 
-   pvr_finishme("Implement new desc set path.");
+   switch (stage) {
+   case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
+   case PVR_STAGE_ALLOCATION_FRAGMENT:
+      desc_state = &cmd_buffer->state.gfx_desc_state;
+      break;
 
-   return VK_ERROR_UNKNOWN;
+   case PVR_STAGE_ALLOCATION_COMPUTE:
+      desc_state = &cmd_buffer->state.compute_desc_state;
+      break;
+
+   default:
+      unreachable("Unsupported stage.");
+      break;
+   }
+
+   for (uint32_t set = 0; set < ARRAY_SIZE(bound_desc_sets); set++) {
+      if (!(desc_state->valid_mask & BITFIELD_BIT(set))) {
+         bound_desc_sets[set] = PVR_DEV_ADDR_INVALID.addr;
+      } else {
+         bound_desc_sets[set] =
+            desc_state->descriptor_sets[set]->pvr_bo->vma->dev_addr.addr;
+      }
+   }
+
+   result = pvr_cmd_buffer_upload_general(cmd_buffer,
+                                          bound_desc_sets,
+                                          sizeof(bound_desc_sets),
+                                          &bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   *addr_out = bo->vma->dev_addr;
+   return VK_SUCCESS;
+}
+
+static VkResult
+pvr_process_addr_literal(struct pvr_cmd_buffer *cmd_buffer,
+                         enum pvr_pds_addr_literal_type addr_literal_type,
+                         enum pvr_stage_allocation stage,
+                         pvr_dev_addr_t *addr_out)
+{
+   VkResult result;
+
+   switch (addr_literal_type) {
+   case PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE: {
+      /* TODO: Maybe we want to free the pvr_bo here, and only link all the
+       * BOs to the command buffer once the data section has been written
+       * successfully.
+       */
+      result =
+         pvr_cmd_buffer_upload_desc_set_table(cmd_buffer, stage, addr_out);
+      if (result != VK_SUCCESS)
+         return result;
+
+      break;
+   }
+
+   default:
+      unreachable("Invalid addr literal type.");
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult pvr_setup_descriptor_mappings_new(
+   struct pvr_cmd_buffer *const cmd_buffer,
+   enum pvr_stage_allocation stage,
+   const struct pvr_stage_allocation_descriptor_state *descriptor_state,
+   uint32_t *const descriptor_data_offset_out)
+{
+   const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
+   const uint8_t *entries;
+   uint32_t *dword_buffer;
+   uint64_t *qword_buffer;
+   struct pvr_bo *pvr_bo;
+   VkResult result;
+
+   if (!pds_info->data_size_in_dwords)
+      return VK_SUCCESS;
+
+   result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
+                                     cmd_buffer->device->heaps.pds_heap,
+                                     pds_info->data_size_in_dwords << 2,
+                                     PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                                     &pvr_bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   dword_buffer = (uint32_t *)pvr_bo->bo->map;
+   qword_buffer = (uint64_t *)pvr_bo->bo->map;
+
+   entries = (uint8_t *)pds_info->entries;
+
+   switch (stage) {
+   case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
+   case PVR_STAGE_ALLOCATION_FRAGMENT:
+   case PVR_STAGE_ALLOCATION_COMPUTE:
+      break;
+
+   default:
+      unreachable("Unsupported stage.");
+      break;
+   }
+
+   for (uint32_t i = 0; i < pds_info->entry_count; i++) {
+      const struct pvr_const_map_entry *const entry_header =
+         (struct pvr_const_map_entry *)entries;
+
+      switch (entry_header->type) {
+      case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
+         const struct pvr_const_map_entry_literal32 *const literal =
+            (struct pvr_const_map_entry_literal32 *)entries;
+
+         PVR_WRITE(dword_buffer,
+                   literal->literal_value,
+                   literal->const_offset,
+                   pds_info->data_size_in_dwords);
+
+         entries += sizeof(*literal);
+         break;
+      }
+
+      case PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER: {
+         const struct pvr_pds_const_map_entry_addr_literal_buffer
+            *const addr_literal_buffer_entry =
+               (struct pvr_pds_const_map_entry_addr_literal_buffer *)entries;
+         struct pvr_device *device = cmd_buffer->device;
+         struct pvr_bo *addr_literal_buffer_bo;
+         uint32_t addr_literal_count = 0;
+         uint64_t *addr_literal_buffer;
+
+         result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
+                                           device->heaps.general_heap,
+                                           addr_literal_buffer_entry->size,
+                                           PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                                           &addr_literal_buffer_bo);
+         if (result != VK_SUCCESS)
+            return result;
+
+         addr_literal_buffer = (uint64_t *)addr_literal_buffer_bo->bo->map;
+
+         entries += sizeof(*addr_literal_buffer_entry);
+
+         PVR_WRITE(qword_buffer,
+                   addr_literal_buffer_bo->vma->dev_addr.addr,
+                   addr_literal_buffer_entry->const_offset,
+                   pds_info->data_size_in_dwords);
+
+         for (uint32_t j = i + 1; j < pds_info->entry_count; j++) {
+            const struct pvr_const_map_entry *const entry_header =
+               (struct pvr_const_map_entry *)entries;
+            const struct pvr_pds_const_map_entry_addr_literal *addr_literal;
+            pvr_dev_addr_t dev_addr;
+
+            if (entry_header->type != PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL)
+               break;
+
+            addr_literal =
+               (struct pvr_pds_const_map_entry_addr_literal *)entries;
+
+            result = pvr_process_addr_literal(cmd_buffer,
+                                              addr_literal->addr_type,
+                                              stage,
+                                              &dev_addr);
+            if (result != VK_SUCCESS)
+               return result;
+
+            addr_literal_buffer[addr_literal_count++] = dev_addr.addr;
+
+            entries += sizeof(*addr_literal);
+         }
+
+         assert(addr_literal_count * sizeof(uint64_t) ==
+                addr_literal_buffer_entry->size);
+
+         i += addr_literal_count;
+
+         pvr_bo_cpu_unmap(device, addr_literal_buffer_bo);
+         break;
+      }
+
+      default:
+         unreachable("Unsupported map entry type.");
+      }
+   }
+
+   pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);
+
+   *descriptor_data_offset_out =
+      pvr_bo->vma->dev_addr.addr -
+      cmd_buffer->device->heaps.pds_heap->base_addr.addr;
+
+   return VK_SUCCESS;
 }
 
 static VkResult pvr_setup_descriptor_mappings(
@@ -3234,7 +3429,10 @@ static VkResult pvr_setup_descriptor_mappings(
                                                descriptor_data_offset_out);
    }
 
-   return pvr_setup_descriptor_mappings_new(descriptor_data_offset_out);
+   return pvr_setup_descriptor_mappings_new(cmd_buffer,
+                                            stage,
+                                            descriptor_state,
+                                            descriptor_data_offset_out);
 }
 
 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
diff --git a/src/imagination/vulkan/pvr_pipeline.c b/src/imagination/vulkan/pvr_pipeline.c
index 9975a4a682e..0c88aa5bc9c 100644
--- a/src/imagination/vulkan/pvr_pipeline.c
+++ b/src/imagination/vulkan/pvr_pipeline.c
@@ -538,6 +538,15 @@ size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
  *    (pvr_const_map_entry_literal32)
  *
  * 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
+ *
+ * 4. Max. number of PDS address literals (8) * (
+ *       size of entry
+ *          (pvr_const_map_entry_descriptor_set_addrs_table)
+ *
+ * 5. Max. number of address literals with single buffer entry to DOUTD
+ *       size of entry (pvr_pds_const_map_entry_addr_literal_buffer)
+ *       8 * size of entry (pvr_pds_const_map_entry_addr_literal)
  */
 
 /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
@@ -549,7 +558,9 @@ size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
           PVR_PDS_MAX_BUFFERS *
              (sizeof(struct pvr_const_map_entry_constant_buffer) +
               sizeof(struct pvr_const_map_entry_literal32)) +
-          sizeof(struct pvr_const_map_entry_doutu_address));
+          sizeof(struct pvr_const_map_entry_doutu_address) +
+          sizeof(struct pvr_pds_const_map_entry_addr_literal_buffer) +
+          8 * sizeof(struct pvr_pds_const_map_entry_addr_literal));
 }
 
 /* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
@@ -636,6 +647,23 @@ static VkResult pvr_pds_descriptor_program_setup_buffers(
    return VK_SUCCESS;
 }
 
+/**
+ * \brief Indicates the layout of shared registers allocated by the driver.
+ *
+ * 'present' fields indicate whether space was allocated for a given resource,
+ * and thus whether it will be present in the shareds.
+ * 'offset' fields indicate the shared reg at which the resource starts.
+ */
+struct pvr_sh_reg_layout {
+   /* If present, this always takes up 2 sh regs in size and contains the
+    * device address of the descriptor set addrs table.
+    */
+   struct {
+      bool present;
+      uint32_t offset;
+   } descriptor_set_addrs_table;
+};
+
 static VkResult pvr_pds_descriptor_program_create_and_upload(
    struct pvr_device *const device,
    const VkAllocationCallbacks *const allocator,
@@ -644,6 +672,7 @@ static VkResult pvr_pds_descriptor_program_create_and_upload(
    const struct pvr_explicit_constant_usage *const explicit_const_usage,
    const struct pvr_pipeline_layout *const layout,
    enum pvr_stage_allocation stage,
+   const struct pvr_sh_reg_layout *sh_reg_layout,
    struct pvr_stage_allocation_descriptor_state *const descriptor_state)
 {
    const size_t const_entries_size_in_bytes =
@@ -708,8 +737,21 @@
          };
       }
    } else {
-      pvr_finishme("Implement new desc set path.");
-      return VK_ERROR_UNKNOWN;
+      uint32_t addr_literals = 0;
+
+      if (sh_reg_layout->descriptor_set_addrs_table.present) {
+         program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
+            .type = PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
+            .destination = sh_reg_layout->descriptor_set_addrs_table.offset,
+         };
+         addr_literals++;
+      }
+
+      /* TODO: Add support for other allocation types. E.g. blend constants
+       * and push constants.
+       */
+
+      program.addr_literal_count = addr_literals;
    }
 
    entries_buffer = vk_alloc2(&device->vk.alloc,
@@ -1047,6 +1089,55 @@ static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
    vk_object_base_finish(&pipeline->base);
 }
 
+/* How many shared regs it takes to store a pvr_dev_addr_t.
+ * Each shared reg is 32 bits.
+ */
+#define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
+   DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
+
+/**
+ * \brief Allocates shared registers.
+ *
+ * \return How many sh regs are required.
+ */
+static uint32_t
+pvr_pipeline_alloc_shareds(const struct pvr_device *device,
+                           const struct pvr_pipeline_layout *layout,
+                           enum pvr_stage_allocation stage,
+                           struct pvr_sh_reg_layout *const sh_reg_layout_out)
+{
+   ASSERTED const uint64_t reserved_shared_size =
+      device->pdevice->dev_runtime_info.reserved_shared_size;
+   ASSERTED const uint64_t max_coeff =
+      device->pdevice->dev_runtime_info.max_coeffs;
+
+   struct pvr_sh_reg_layout reg_layout = { 0 };
+   uint32_t next_free_sh_reg = 0;
+
+   reg_layout.descriptor_set_addrs_table.present =
+      !!(layout->shader_stage_mask & BITFIELD_BIT(stage));
+
+   if (reg_layout.descriptor_set_addrs_table.present) {
+      reg_layout.descriptor_set_addrs_table.offset = next_free_sh_reg;
+      next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
+   }
+
+   /* TODO: Add allocation for blend constants, push constants, and other
+    * buffer types.
+    */
+
+   *sh_reg_layout_out = reg_layout;
+
+   /* FIXME: We might need to take more things into consideration.
+    * See pvr_calc_fscommon_size_and_tiles_in_flight().
+    */
+   assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
+
+   return next_free_sh_reg;
+}
+
+#undef PVR_DEV_ADDR_SIZE_IN_SH_REGS
+
 /******************************************************************************
    Compute pipeline functions
  ******************************************************************************/
@@ -1063,6 +1154,7 @@ static VkResult pvr_compute_pipeline_compile(
    uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
    struct pvr_explicit_constant_usage explicit_const_usage;
    uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
+   struct pvr_sh_reg_layout sh_reg_layout;
    struct rogue_ubo_data ubo_data;
    uint32_t barrier_coefficient;
    uint32_t usc_temps;
@@ -1104,6 +1196,15 @@
       explicit_const_usage = build_info.explicit_conts_usage;
    } else {
+      uint32_t sh_count;
+
+      sh_count = pvr_pipeline_alloc_shareds(device,
+                                            compute_pipeline->base.layout,
+                                            PVR_STAGE_ALLOCATION_COMPUTE,
+                                            &sh_reg_layout);
+
+      compute_pipeline->shader_state.const_shared_reg_count = sh_count;
+
       /* FIXME: Compile and upload the shader. */
       /* FIXME: Initialize the shader state and setup build info. */
       abort();
@@ -1117,6 +1218,7 @@
       &explicit_const_usage,
       compute_pipeline->base.layout,
       PVR_STAGE_ALLOCATION_COMPUTE,
+      &sh_reg_layout,
       &compute_pipeline->descriptor_state);
    if (result != VK_SUCCESS)
       goto err_free_shader;
@@ -1429,6 +1531,31 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
    struct rogue_build_ctx *ctx;
    VkResult result;
 
+   const bool old_path =
+      pvr_hard_code_shader_required(&device->pdevice->dev_info);
+
+   /* Vars needed for the new path. */
+   /* TODO: These need to be passed into the compiler so that it knows which
+    * shared regs to use to access specific resources.
+    */
+   struct pvr_sh_reg_layout vert_sh_reg_layout;
+   struct pvr_sh_reg_layout frag_sh_reg_layout;
+   uint32_t vert_sh_count = 0;
+   uint32_t frag_sh_count = 0;
+
+   if (!old_path) {
+      vert_sh_count =
+         pvr_pipeline_alloc_shareds(device,
+                                    gfx_pipeline->base.layout,
+                                    PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
+                                    &vert_sh_reg_layout);
+
+      frag_sh_count = pvr_pipeline_alloc_shareds(device,
+                                                 gfx_pipeline->base.layout,
+                                                 PVR_STAGE_ALLOCATION_FRAGMENT,
+                                                 &frag_sh_reg_layout);
+   }
+
    /* Setup shared build context. */
    ctx = rogue_build_context_create(compiler);
    if (!ctx)
@@ -1531,6 +1658,17 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
       pvr_vertex_state_init(gfx_pipeline,
                            &ctx->common_data[MESA_SHADER_VERTEX],
                            &ctx->stage_data.vs);
+
+      if (!old_path) {
+         struct pvr_vertex_shader_state *vertex_state =
+            &gfx_pipeline->shader_state.vertex;
+
+         /* FIXME: For now we just overwrite it but the compiler shouldn't be
+          * returning the sh count since the driver is in charge of allocating
+          * them.
+          */
+         vertex_state->stage_state.const_shared_reg_count = vert_sh_count;
+      }
    }
 
    result = pvr_gpu_upload_usc(device,
@@ -1551,6 +1689,17 @@
    } else {
       pvr_fragment_state_init(gfx_pipeline,
                               &ctx->common_data[MESA_SHADER_FRAGMENT]);
+
+      if (!old_path) {
+         struct pvr_fragment_shader_state *fragment_state =
+            &gfx_pipeline->shader_state.fragment;
+
+         /* FIXME: For now we just overwrite it but the compiler shouldn't be
+          * returning the sh count since the driver is in charge of allocating
+          * them.
+          */
+         fragment_state->stage_state.const_shared_reg_count = frag_sh_count;
+      }
    }
 
    result = pvr_gpu_upload_usc(device,
@@ -1605,6 +1754,7 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
       &vert_explicit_const_usage,
       gfx_pipeline->base.layout,
       PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
+      &vert_sh_reg_layout,
       &gfx_pipeline->shader_state.vertex.descriptor_state);
    if (result != VK_SUCCESS)
       goto err_free_vertex_attrib_program;
@@ -1627,6 +1777,7 @@
       &frag_explicit_const_usage,
       gfx_pipeline->base.layout,
       PVR_STAGE_ALLOCATION_FRAGMENT,
+      &frag_sh_reg_layout,
       &gfx_pipeline->shader_state.fragment.descriptor_state);
    if (result != VK_SUCCESS)
       goto err_free_vertex_descriptor_program;