mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-25 19:08:12 +02:00
The new bitfield has a separate flag for each of the color attachments. Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27024>
2468 lines
89 KiB
C
2468 lines
89 KiB
C
/*
|
|
* Copyright © 2022 Imagination Technologies Ltd.
|
|
*
|
|
* based in part on v3dv driver which is:
|
|
* Copyright © 2019 Raspberry Pi
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
* in the Software without restriction, including without limitation the rights
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
#include <vulkan/vulkan.h>
|
|
|
|
#include "compiler/shader_enums.h"
|
|
#include "hwdef/rogue_hw_utils.h"
|
|
#include "nir/nir.h"
|
|
#include "pvr_bo.h"
|
|
#include "pvr_csb.h"
|
|
#include "pvr_csb_enum_helpers.h"
|
|
#include "pvr_hardcode.h"
|
|
#include "pvr_pds.h"
|
|
#include "pvr_private.h"
|
|
#include "pvr_robustness.h"
|
|
#include "pvr_shader.h"
|
|
#include "pvr_types.h"
|
|
#include "rogue/rogue.h"
|
|
#include "util/log.h"
|
|
#include "util/macros.h"
|
|
#include "util/ralloc.h"
|
|
#include "util/u_dynarray.h"
|
|
#include "util/u_math.h"
|
|
#include "vk_alloc.h"
|
|
#include "vk_format.h"
|
|
#include "vk_graphics_state.h"
|
|
#include "vk_log.h"
|
|
#include "vk_object.h"
|
|
#include "vk_pipeline_cache.h"
|
|
#include "vk_render_pass.h"
|
|
#include "vk_util.h"
|
|
|
|
/*****************************************************************************
|
|
PDS functions
|
|
*****************************************************************************/
|
|
|
|
/* If allocator == NULL, the internal one will be used. */
|
|
static VkResult pvr_pds_coeff_program_create_and_upload(
|
|
struct pvr_device *device,
|
|
const VkAllocationCallbacks *allocator,
|
|
const uint32_t *fpu_iterators,
|
|
uint32_t fpu_iterators_count,
|
|
const uint32_t *destinations,
|
|
struct pvr_pds_upload *const pds_upload_out,
|
|
uint32_t *const pds_temps_count_out)
|
|
{
|
|
struct pvr_pds_coeff_loading_program program = {
|
|
.num_fpu_iterators = fpu_iterators_count,
|
|
};
|
|
uint32_t staging_buffer_size;
|
|
uint32_t *staging_buffer;
|
|
VkResult result;
|
|
|
|
assert(fpu_iterators_count < PVR_MAXIMUM_ITERATIONS);
|
|
|
|
/* Get the size of the program and then allocate that much memory. */
|
|
pvr_pds_coefficient_loading(&program, NULL, PDS_GENERATE_SIZES);
|
|
|
|
if (!program.code_size) {
|
|
pds_upload_out->pvr_bo = NULL;
|
|
pds_upload_out->code_size = 0;
|
|
pds_upload_out->data_size = 0;
|
|
*pds_temps_count_out = 0;
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
|
|
|
|
staging_buffer = vk_alloc2(&device->vk.alloc,
|
|
allocator,
|
|
staging_buffer_size,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
|
if (!staging_buffer)
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
/* FIXME: Should we save pointers when we redesign the pds gen api ? */
|
|
typed_memcpy(program.FPU_iterators,
|
|
fpu_iterators,
|
|
program.num_fpu_iterators);
|
|
|
|
typed_memcpy(program.destination, destinations, program.num_fpu_iterators);
|
|
|
|
/* Generate the program into is the staging_buffer. */
|
|
pvr_pds_coefficient_loading(&program,
|
|
staging_buffer,
|
|
PDS_GENERATE_CODEDATA_SEGMENTS);
|
|
|
|
/* FIXME: Figure out the define for alignment of 16. */
|
|
result = pvr_gpu_upload_pds(device,
|
|
&staging_buffer[0],
|
|
program.data_size,
|
|
16,
|
|
&staging_buffer[program.data_size],
|
|
program.code_size,
|
|
16,
|
|
16,
|
|
pds_upload_out);
|
|
if (result != VK_SUCCESS) {
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
return result;
|
|
}
|
|
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
|
|
*pds_temps_count_out = program.temps_used;
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
/* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
|
|
/* If allocator == NULL, the internal one will be used. */
|
|
VkResult pvr_pds_fragment_program_create_and_upload(
|
|
struct pvr_device *device,
|
|
const VkAllocationCallbacks *allocator,
|
|
const struct pvr_suballoc_bo *fragment_shader_bo,
|
|
uint32_t fragment_temp_count,
|
|
enum rogue_msaa_mode msaa_mode,
|
|
bool has_phase_rate_change,
|
|
struct pvr_pds_upload *const pds_upload_out)
|
|
{
|
|
const enum PVRX(PDSINST_DOUTU_SAMPLE_RATE)
|
|
sample_rate = pvr_pdsinst_doutu_sample_rate_from_rogue(msaa_mode);
|
|
struct pvr_pds_kickusc_program program = { 0 };
|
|
uint32_t staging_buffer_size;
|
|
uint32_t *staging_buffer;
|
|
VkResult result;
|
|
|
|
/* FIXME: Should it be passing in the USC offset rather than address here?
|
|
*/
|
|
/* Note this is not strictly required to be done before calculating the
|
|
* staging_buffer_size in this particular case. It can also be done after
|
|
* allocating the buffer. The size from pvr_pds_kick_usc() is constant.
|
|
*/
|
|
pvr_pds_setup_doutu(&program.usc_task_control,
|
|
fragment_shader_bo->dev_addr.addr,
|
|
fragment_temp_count,
|
|
sample_rate,
|
|
has_phase_rate_change);
|
|
|
|
pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);
|
|
|
|
staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
|
|
|
|
staging_buffer = vk_alloc2(&device->vk.alloc,
|
|
allocator,
|
|
staging_buffer_size,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
|
if (!staging_buffer)
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
pvr_pds_kick_usc(&program,
|
|
staging_buffer,
|
|
0,
|
|
false,
|
|
PDS_GENERATE_CODEDATA_SEGMENTS);
|
|
|
|
/* FIXME: Figure out the define for alignment of 16. */
|
|
result = pvr_gpu_upload_pds(device,
|
|
&staging_buffer[0],
|
|
program.data_size,
|
|
16,
|
|
&staging_buffer[program.data_size],
|
|
program.code_size,
|
|
16,
|
|
16,
|
|
pds_upload_out);
|
|
if (result != VK_SUCCESS) {
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
return result;
|
|
}
|
|
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
|
|
const struct pvr_device_info *dev_info,
|
|
bool robust_buffer_access)
|
|
{
|
|
/* FIXME: Use more local variable to improve formatting. */
|
|
|
|
/* Maximum memory allocation needed for const map entries in
|
|
* pvr_pds_generate_vertex_primary_program().
|
|
* When robustBufferAccess is disabled, it must be >= 410.
|
|
* When robustBufferAccess is enabled, it must be >= 570.
|
|
*
|
|
* 1. Size of entry for base instance
|
|
* (pvr_const_map_entry_base_instance)
|
|
*
|
|
* 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
|
|
* if (!robustBufferAccess)
|
|
* size of vertex attribute entry
|
|
* (pvr_const_map_entry_vertex_attribute_address) +
|
|
* else
|
|
* size of robust vertex attribute entry
|
|
* (pvr_const_map_entry_robust_vertex_attribute_address) +
|
|
* size of entry for max attribute index
|
|
* (pvr_const_map_entry_vertex_attribute_max_index) +
|
|
* fi
|
|
* size of Unified Store burst entry
|
|
* (pvr_const_map_entry_literal32) +
|
|
* size of entry for vertex stride
|
|
* (pvr_const_map_entry_literal32) +
|
|
* size of entries for DDMAD control word
|
|
* (num_ddmad_literals * pvr_const_map_entry_literal32))
|
|
*
|
|
* 3. Size of entry for DOUTW vertex/instance control word
|
|
* (pvr_const_map_entry_literal32)
|
|
*
|
|
* 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
|
|
*/
|
|
|
|
const size_t attribute_size =
|
|
(!robust_buffer_access)
|
|
? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
|
|
: sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
|
|
sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);
|
|
|
|
/* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word
|
|
* and is increased by one DWORD to contain the data for the DDMADT's
|
|
* out-of-bounds check.
|
|
*/
|
|
const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
|
|
1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);
|
|
|
|
return (sizeof(struct pvr_const_map_entry_base_instance) +
|
|
PVR_MAX_VERTEX_INPUT_BINDINGS *
|
|
(attribute_size +
|
|
(2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
|
|
sizeof(struct pvr_const_map_entry_literal32)) +
|
|
sizeof(struct pvr_const_map_entry_literal32) +
|
|
sizeof(struct pvr_const_map_entry_doutu_address));
|
|
}
|
|
|
|
/* This is a const pointer to an array of pvr_pds_vertex_dma structs.
|
|
* The array being pointed to is of PVR_MAX_VERTEX_ATTRIB_DMAS size.
|
|
*/
|
|
typedef struct pvr_pds_vertex_dma (
|
|
*const
|
|
pvr_pds_attrib_dma_descriptions_array_ptr)[PVR_MAX_VERTEX_ATTRIB_DMAS];
|
|
|
|
/* dma_descriptions_out_ptr is a pointer to the array used as output.
|
|
* The whole array might not be filled so dma_count_out indicates how many
|
|
* elements were used.
|
|
*/
|
|
static void pvr_pds_vertex_attrib_init_dma_descriptions(
|
|
const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
|
|
const struct rogue_vs_build_data *vs_data,
|
|
pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
|
|
uint32_t *const dma_count_out)
|
|
{
|
|
struct pvr_pds_vertex_dma *const dma_descriptions =
|
|
*dma_descriptions_out_ptr;
|
|
uint32_t dma_count = 0;
|
|
|
|
if (!vertex_input_state) {
|
|
*dma_count_out = 0;
|
|
return;
|
|
}
|
|
|
|
for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
|
|
i++) {
|
|
const VkVertexInputAttributeDescription *const attrib_desc =
|
|
&vertex_input_state->pVertexAttributeDescriptions[i];
|
|
const VkVertexInputBindingDescription *binding_desc = NULL;
|
|
struct pvr_pds_vertex_dma *const dma_desc = &dma_descriptions[dma_count];
|
|
size_t location = attrib_desc->location;
|
|
|
|
assert(location < vs_data->inputs.num_input_vars);
|
|
|
|
/* Finding the matching binding description. */
|
|
for (uint32_t j = 0;
|
|
j < vertex_input_state->vertexBindingDescriptionCount;
|
|
j++) {
|
|
const VkVertexInputBindingDescription *const current_binding_desc =
|
|
&vertex_input_state->pVertexBindingDescriptions[j];
|
|
|
|
if (current_binding_desc->binding == attrib_desc->binding) {
|
|
binding_desc = current_binding_desc;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* From the Vulkan 1.2.195 spec for
|
|
* VkPipelineVertexInputStateCreateInfo:
|
|
*
|
|
* "For every binding specified by each element of
|
|
* pVertexAttributeDescriptions, a
|
|
* VkVertexInputBindingDescription must exist in
|
|
* pVertexBindingDescriptions with the same value of binding"
|
|
*/
|
|
assert(binding_desc);
|
|
|
|
dma_desc->offset = attrib_desc->offset;
|
|
dma_desc->stride = binding_desc->stride;
|
|
|
|
dma_desc->flags = 0;
|
|
|
|
if (binding_desc->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
|
|
dma_desc->flags |= PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
|
|
|
|
dma_desc->size_in_dwords = vs_data->inputs.components[location];
|
|
/* TODO: This will be different when other types are supported.
|
|
* Store in vs_data with base and components?
|
|
*/
|
|
/* TODO: Use attrib_desc->format. */
|
|
dma_desc->component_size_in_bytes = ROGUE_REG_SIZE_BYTES;
|
|
dma_desc->destination = vs_data->inputs.base[location];
|
|
dma_desc->binding_index = attrib_desc->binding;
|
|
dma_desc->divisor = 1;
|
|
|
|
dma_desc->robustness_buffer_offset =
|
|
pvr_get_robustness_buffer_format_offset(attrib_desc->format);
|
|
|
|
++dma_count;
|
|
}
|
|
|
|
*dma_count_out = dma_count;
|
|
}
|
|
|
|
static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
|
|
struct pvr_device *const device,
|
|
const VkAllocationCallbacks *const allocator,
|
|
struct pvr_pds_vertex_primary_program_input *const input,
|
|
struct pvr_pds_attrib_program *const program_out)
|
|
{
|
|
const size_t const_entries_size_in_bytes =
|
|
pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
|
|
&device->pdevice->dev_info,
|
|
device->vk.enabled_features.robustBufferAccess);
|
|
struct pvr_pds_upload *const program = &program_out->program;
|
|
struct pvr_pds_info *const info = &program_out->info;
|
|
struct pvr_const_map_entry *new_entries;
|
|
ASSERTED uint32_t code_size_in_dwords;
|
|
size_t staging_buffer_size;
|
|
uint32_t *staging_buffer;
|
|
VkResult result;
|
|
|
|
memset(info, 0, sizeof(*info));
|
|
|
|
info->entries = vk_alloc2(&device->vk.alloc,
|
|
allocator,
|
|
const_entries_size_in_bytes,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
|
if (!info->entries) {
|
|
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
goto err_out;
|
|
}
|
|
|
|
info->entries_size_in_bytes = const_entries_size_in_bytes;
|
|
|
|
pvr_pds_generate_vertex_primary_program(
|
|
input,
|
|
NULL,
|
|
info,
|
|
device->vk.enabled_features.robustBufferAccess,
|
|
&device->pdevice->dev_info);
|
|
|
|
code_size_in_dwords = info->code_size_in_dwords;
|
|
staging_buffer_size = PVR_DW_TO_BYTES(info->code_size_in_dwords);
|
|
|
|
staging_buffer = vk_alloc2(&device->vk.alloc,
|
|
allocator,
|
|
staging_buffer_size,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
|
if (!staging_buffer) {
|
|
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
goto err_free_entries;
|
|
}
|
|
|
|
/* This also fills in info->entries. */
|
|
pvr_pds_generate_vertex_primary_program(
|
|
input,
|
|
staging_buffer,
|
|
info,
|
|
device->vk.enabled_features.robustBufferAccess,
|
|
&device->pdevice->dev_info);
|
|
|
|
assert(info->code_size_in_dwords <= code_size_in_dwords);
|
|
|
|
/* FIXME: Add a vk_realloc2() ? */
|
|
new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
|
|
info->entries,
|
|
info->entries_written_size_in_bytes,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
|
if (!new_entries) {
|
|
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
goto err_free_staging_buffer;
|
|
}
|
|
|
|
info->entries = new_entries;
|
|
info->entries_size_in_bytes = info->entries_written_size_in_bytes;
|
|
|
|
/* FIXME: Figure out the define for alignment of 16. */
|
|
result = pvr_gpu_upload_pds(device,
|
|
NULL,
|
|
0,
|
|
0,
|
|
staging_buffer,
|
|
info->code_size_in_dwords,
|
|
16,
|
|
16,
|
|
program);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_staging_buffer;
|
|
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
|
|
return VK_SUCCESS;
|
|
|
|
err_free_staging_buffer:
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
|
|
err_free_entries:
|
|
vk_free2(&device->vk.alloc, allocator, info->entries);
|
|
|
|
err_out:
|
|
return result;
|
|
}
|
|
|
|
static inline void pvr_pds_vertex_attrib_program_destroy(
|
|
struct pvr_device *const device,
|
|
const struct VkAllocationCallbacks *const allocator,
|
|
struct pvr_pds_attrib_program *const program)
|
|
{
|
|
pvr_bo_suballoc_free(program->program.pvr_bo);
|
|
vk_free2(&device->vk.alloc, allocator, program->info.entries);
|
|
}
|
|
|
|
/* This is a const pointer to an array of pvr_pds_attrib_program structs.
|
|
* The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size.
|
|
*/
|
|
typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
|
|
[PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
|
|
|
|
/* Offset value meaning the special variable is unused and therefore has not
 * been allocated a register.
 */
#define PVR_VERTEX_SPECIAL_VAR_UNUSED (-1)
|
|
|
|
/* Register layout for the special vertex shader variables. Each one gets
 * allocated its own vtxin reg if used.
 */
struct pvr_vertex_special_vars {
   int16_t vertex_id_offset; /* VertexIndex built-in. */
   int16_t instance_id_offset; /* InstanceIndex built-in. */
};
|
|
|
|
/* Generate and uploads a PDS program for DMAing vertex attribs into USC vertex
|
|
* inputs. This will bake the code segment and create a template of the data
|
|
* segment for the command buffer to fill in.
|
|
*/
|
|
/* If allocator == NULL, the internal one will be used.
|
|
*
|
|
* programs_out_ptr is a pointer to the array where the outputs will be placed.
|
|
*/
|
|
static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
|
|
struct pvr_device *device,
|
|
const VkAllocationCallbacks *const allocator,
|
|
const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
|
|
uint32_t usc_temp_count,
|
|
const struct rogue_vs_build_data *vs_data,
|
|
|
|
/* Needed for the new path. */
|
|
/* TODO: Remove some of the above once the compiler is hooked up. */
|
|
const struct pvr_pds_vertex_dma
|
|
dma_descriptions[static const PVR_MAX_VERTEX_ATTRIB_DMAS],
|
|
uint32_t dma_count,
|
|
const struct pvr_vertex_special_vars *special_vars_layout,
|
|
|
|
pvr_pds_attrib_programs_array_ptr programs_out_ptr)
|
|
{
|
|
struct pvr_pds_vertex_dma dma_descriptions_old[PVR_MAX_VERTEX_ATTRIB_DMAS];
|
|
|
|
struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
|
|
struct pvr_pds_vertex_primary_program_input input = { 0 };
|
|
VkResult result;
|
|
|
|
const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);
|
|
|
|
if (old_path) {
|
|
pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
|
|
vs_data,
|
|
&dma_descriptions_old,
|
|
&input.dma_count);
|
|
|
|
input.dma_list = dma_descriptions_old;
|
|
} else {
|
|
input.dma_list = dma_descriptions;
|
|
input.dma_count = dma_count;
|
|
|
|
if (special_vars_layout->vertex_id_offset !=
|
|
PVR_VERTEX_SPECIAL_VAR_UNUSED) {
|
|
/* Gets filled by the HW and copied into the appropriate reg. */
|
|
input.flags |= PVR_PDS_VERTEX_FLAGS_VERTEX_ID_REQUIRED;
|
|
input.vertex_id_register = special_vars_layout->vertex_id_offset;
|
|
}
|
|
|
|
if (special_vars_layout->instance_id_offset !=
|
|
PVR_VERTEX_SPECIAL_VAR_UNUSED) {
|
|
/* Gets filled by the HW and copied into the appropriate reg. */
|
|
input.flags |= PVR_PDS_VERTEX_FLAGS_INSTANCE_ID_REQUIRED;
|
|
input.instance_id_register = special_vars_layout->instance_id_offset;
|
|
}
|
|
}
|
|
|
|
pvr_pds_setup_doutu(&input.usc_task_control,
|
|
0,
|
|
usc_temp_count,
|
|
PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
|
|
false);
|
|
|
|
/* Note: programs_out_ptr is a pointer to an array so this is fine. See the
|
|
* typedef.
|
|
*/
|
|
for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
|
|
uint32_t extra_flags;
|
|
|
|
switch (i) {
|
|
case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
|
|
extra_flags = 0;
|
|
break;
|
|
|
|
case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
|
|
extra_flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
|
|
break;
|
|
|
|
case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
|
|
extra_flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
|
|
break;
|
|
|
|
default:
|
|
unreachable("Invalid vertex attrib program type.");
|
|
}
|
|
|
|
input.flags |= extra_flags;
|
|
|
|
result =
|
|
pvr_pds_vertex_attrib_program_create_and_upload(device,
|
|
allocator,
|
|
&input,
|
|
&programs_out[i]);
|
|
if (result != VK_SUCCESS) {
|
|
for (uint32_t j = 0; j < i; j++) {
|
|
pvr_pds_vertex_attrib_program_destroy(device,
|
|
allocator,
|
|
&programs_out[j]);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
input.flags &= ~extra_flags;
|
|
}
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
|
|
{
|
|
/* Maximum memory allocation needed for const map entries in
|
|
* pvr_pds_generate_descriptor_upload_program().
|
|
* It must be >= 688 bytes. This size is calculated as the sum of:
|
|
*
|
|
* 1. Max. number of descriptor sets (8) * (
|
|
* size of descriptor entry
|
|
* (pvr_const_map_entry_descriptor_set) +
|
|
* size of Common Store burst entry
|
|
* (pvr_const_map_entry_literal32))
|
|
*
|
|
* 2. Max. number of PDS program buffers (24) * (
|
|
* size of the largest buffer structure
|
|
* (pvr_const_map_entry_constant_buffer) +
|
|
* size of Common Store burst entry
|
|
* (pvr_const_map_entry_literal32)
|
|
*
|
|
* 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
|
|
*
|
|
* 4. Max. number of PDS address literals (8) * (
|
|
* size of entry
|
|
* (pvr_const_map_entry_descriptor_set_addrs_table)
|
|
*
|
|
* 5. Max. number of address literals with single buffer entry to DOUTD
|
|
size of entry
|
|
(pvr_pds_const_map_entry_addr_literal_buffer) +
|
|
8 * size of entry (pvr_pds_const_map_entry_addr_literal)
|
|
*/
|
|
|
|
/* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
|
|
* say that it should be 8.
|
|
* Figure our a define for this or is the comment wrong?
|
|
*/
|
|
return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
|
|
sizeof(struct pvr_const_map_entry_literal32)) +
|
|
PVR_PDS_MAX_BUFFERS *
|
|
(sizeof(struct pvr_const_map_entry_constant_buffer) +
|
|
sizeof(struct pvr_const_map_entry_literal32)) +
|
|
sizeof(struct pvr_const_map_entry_doutu_address) +
|
|
sizeof(struct pvr_pds_const_map_entry_addr_literal_buffer) +
|
|
8 * sizeof(struct pvr_pds_const_map_entry_addr_literal));
|
|
}
|
|
|
|
/* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
|
|
* structs.
|
|
*/
|
|
typedef struct pvr_pds_buffer (
|
|
*const pvr_pds_descriptor_program_buffer_array_ptr)[PVR_PDS_MAX_BUFFERS];
|
|
|
|
/**
|
|
* \brief Setup buffers for the PDS descriptor program.
|
|
*
|
|
* Sets up buffers required by the PDS gen api based on compiler info.
|
|
*
|
|
* For compile time static constants that need DMAing it uploads them and
|
|
* returns the upload in \r static_consts_pvr_bo_out .
|
|
*/
|
|
static VkResult pvr_pds_descriptor_program_setup_buffers(
|
|
struct pvr_device *device,
|
|
bool robust_buffer_access,
|
|
const struct rogue_compile_time_consts_data *compile_time_consts_data,
|
|
const struct rogue_ubo_data *ubo_data,
|
|
pvr_pds_descriptor_program_buffer_array_ptr buffers_out_ptr,
|
|
uint32_t *const buffer_count_out,
|
|
struct pvr_suballoc_bo **const static_consts_pvr_bo_out)
|
|
{
|
|
struct pvr_pds_buffer *const buffers = *buffers_out_ptr;
|
|
uint32_t buffer_count = 0;
|
|
|
|
for (size_t i = 0; i < ubo_data->num_ubo_entries; i++) {
|
|
struct pvr_pds_buffer *current_buffer = &buffers[buffer_count];
|
|
|
|
/* This is fine since buffers_out_ptr is a pointer to an array. */
|
|
assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));
|
|
|
|
current_buffer->type = PVR_BUFFER_TYPE_UBO;
|
|
current_buffer->size_in_dwords = ubo_data->size[i];
|
|
current_buffer->destination = ubo_data->dest[i];
|
|
|
|
current_buffer->buffer_id = buffer_count;
|
|
current_buffer->desc_set = ubo_data->desc_set[i];
|
|
current_buffer->binding = ubo_data->binding[i];
|
|
/* TODO: Is this always the case?
|
|
* E.g. can multiple UBOs have the same base buffer?
|
|
*/
|
|
current_buffer->source_offset = 0;
|
|
|
|
buffer_count++;
|
|
}
|
|
|
|
if (compile_time_consts_data->static_consts.num > 0) {
|
|
VkResult result;
|
|
|
|
assert(compile_time_consts_data->static_consts.num <=
|
|
ARRAY_SIZE(compile_time_consts_data->static_consts.value));
|
|
|
|
/* This is fine since buffers_out_ptr is a pointer to an array. */
|
|
assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));
|
|
|
|
/* TODO: Is it possible to have multiple static consts buffer where the
|
|
* destination is not adjoining? If so we need to handle that.
|
|
* Currently we're only setting up a single buffer.
|
|
*/
|
|
buffers[buffer_count++] = (struct pvr_pds_buffer){
|
|
.type = PVR_BUFFER_TYPE_COMPILE_TIME,
|
|
.size_in_dwords = compile_time_consts_data->static_consts.num,
|
|
.destination = compile_time_consts_data->static_consts.dest,
|
|
};
|
|
|
|
result = pvr_gpu_upload(device,
|
|
device->heaps.general_heap,
|
|
compile_time_consts_data->static_consts.value,
|
|
compile_time_consts_data->static_consts.num *
|
|
ROGUE_REG_SIZE_BYTES,
|
|
ROGUE_REG_SIZE_BYTES,
|
|
static_consts_pvr_bo_out);
|
|
if (result != VK_SUCCESS)
|
|
return result;
|
|
} else {
|
|
*static_consts_pvr_bo_out = NULL;
|
|
}
|
|
|
|
*buffer_count_out = buffer_count;
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static VkResult pvr_pds_descriptor_program_create_and_upload(
|
|
struct pvr_device *const device,
|
|
const VkAllocationCallbacks *const allocator,
|
|
const struct rogue_compile_time_consts_data *const compile_time_consts_data,
|
|
const struct rogue_ubo_data *const ubo_data,
|
|
const struct pvr_explicit_constant_usage *const explicit_const_usage,
|
|
const struct pvr_pipeline_layout *const layout,
|
|
enum pvr_stage_allocation stage,
|
|
const struct pvr_sh_reg_layout *sh_reg_layout,
|
|
struct pvr_stage_allocation_descriptor_state *const descriptor_state)
|
|
{
|
|
const size_t const_entries_size_in_bytes =
|
|
pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
|
|
struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
|
|
struct pvr_pds_descriptor_program_input program = { 0 };
|
|
struct pvr_const_map_entry *new_entries;
|
|
ASSERTED uint32_t code_size_in_dwords;
|
|
uint32_t staging_buffer_size;
|
|
uint32_t *staging_buffer;
|
|
VkResult result;
|
|
|
|
const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);
|
|
|
|
assert(stage != PVR_STAGE_ALLOCATION_COUNT);
|
|
|
|
*pds_info = (struct pvr_pds_info){ 0 };
|
|
|
|
if (old_path) {
|
|
result = pvr_pds_descriptor_program_setup_buffers(
|
|
device,
|
|
device->vk.enabled_features.robustBufferAccess,
|
|
compile_time_consts_data,
|
|
ubo_data,
|
|
&program.buffers,
|
|
&program.buffer_count,
|
|
&descriptor_state->static_consts);
|
|
if (result != VK_SUCCESS)
|
|
return result;
|
|
|
|
if (layout->per_stage_reg_info[stage].primary_dynamic_size_in_dwords)
|
|
assert(!"Unimplemented");
|
|
|
|
for (uint32_t set_num = 0; set_num < layout->set_count; set_num++) {
|
|
const struct pvr_descriptor_set_layout_mem_layout *const reg_layout =
|
|
&layout->register_layout_in_dwords_per_stage[stage][set_num];
|
|
const uint32_t start_offset = explicit_const_usage->start_offset;
|
|
|
|
/* TODO: Use compiler usage info to optimize this? */
|
|
|
|
/* Only dma primaries if they are actually required. */
|
|
if (reg_layout->primary_size) {
|
|
program.descriptor_sets[program.descriptor_set_count++] =
|
|
(struct pvr_pds_descriptor_set){
|
|
.descriptor_set = set_num,
|
|
.size_in_dwords = reg_layout->primary_size,
|
|
.destination = reg_layout->primary_offset + start_offset,
|
|
.primary = true,
|
|
};
|
|
}
|
|
|
|
/* Only dma secondaries if they are actually required. */
|
|
if (!reg_layout->secondary_size)
|
|
continue;
|
|
|
|
program.descriptor_sets[program.descriptor_set_count++] =
|
|
(struct pvr_pds_descriptor_set){
|
|
.descriptor_set = set_num,
|
|
.size_in_dwords = reg_layout->secondary_size,
|
|
.destination = reg_layout->secondary_offset + start_offset,
|
|
};
|
|
}
|
|
} else {
|
|
uint32_t addr_literals = 0;
|
|
|
|
if (sh_reg_layout->descriptor_set_addrs_table.present) {
|
|
program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
|
|
.type = PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
|
|
.destination = sh_reg_layout->descriptor_set_addrs_table.offset,
|
|
};
|
|
addr_literals++;
|
|
}
|
|
|
|
if (sh_reg_layout->push_consts.present) {
|
|
program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
|
|
.type = PVR_PDS_ADDR_LITERAL_PUSH_CONSTS,
|
|
.destination = sh_reg_layout->push_consts.offset,
|
|
};
|
|
addr_literals++;
|
|
}
|
|
|
|
if (sh_reg_layout->blend_consts.present) {
|
|
program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
|
|
.type = PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS,
|
|
.destination = sh_reg_layout->blend_consts.offset,
|
|
};
|
|
addr_literals++;
|
|
}
|
|
|
|
program.addr_literal_count = addr_literals;
|
|
}
|
|
|
|
pds_info->entries = vk_alloc2(&device->vk.alloc,
|
|
allocator,
|
|
const_entries_size_in_bytes,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
|
if (!pds_info->entries) {
|
|
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
goto err_free_static_consts;
|
|
}
|
|
|
|
pds_info->entries_size_in_bytes = const_entries_size_in_bytes;
|
|
|
|
pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);
|
|
|
|
code_size_in_dwords = pds_info->code_size_in_dwords;
|
|
staging_buffer_size = PVR_DW_TO_BYTES(pds_info->code_size_in_dwords);
|
|
|
|
if (!staging_buffer_size) {
|
|
vk_free2(&device->vk.alloc, allocator, pds_info->entries);
|
|
|
|
*descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
staging_buffer = vk_alloc2(&device->vk.alloc,
|
|
allocator,
|
|
staging_buffer_size,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
|
if (!staging_buffer) {
|
|
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
goto err_free_entries;
|
|
}
|
|
|
|
pvr_pds_generate_descriptor_upload_program(&program,
|
|
staging_buffer,
|
|
pds_info);
|
|
|
|
assert(pds_info->code_size_in_dwords <= code_size_in_dwords);
|
|
|
|
/* FIXME: use vk_realloc2() ? */
|
|
new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
|
|
pds_info->entries,
|
|
pds_info->entries_written_size_in_bytes,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
|
if (!new_entries) {
|
|
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
goto err_free_staging_buffer;
|
|
}
|
|
|
|
pds_info->entries = new_entries;
|
|
pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;
|
|
|
|
/* FIXME: Figure out the define for alignment of 16. */
|
|
result = pvr_gpu_upload_pds(device,
|
|
NULL,
|
|
0,
|
|
0,
|
|
staging_buffer,
|
|
pds_info->code_size_in_dwords,
|
|
16,
|
|
16,
|
|
&descriptor_state->pds_code);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_staging_buffer;
|
|
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
|
|
return VK_SUCCESS;
|
|
|
|
err_free_staging_buffer:
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
|
|
err_free_entries:
|
|
vk_free2(&device->vk.alloc, allocator, pds_info->entries);
|
|
|
|
err_free_static_consts:
|
|
pvr_bo_suballoc_free(descriptor_state->static_consts);
|
|
|
|
return result;
|
|
}
|
|
|
|
static void pvr_pds_descriptor_program_destroy(
|
|
struct pvr_device *const device,
|
|
const struct VkAllocationCallbacks *const allocator,
|
|
struct pvr_stage_allocation_descriptor_state *const descriptor_state)
|
|
{
|
|
if (!descriptor_state)
|
|
return;
|
|
|
|
pvr_bo_suballoc_free(descriptor_state->pds_code.pvr_bo);
|
|
vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
|
|
pvr_bo_suballoc_free(descriptor_state->static_consts);
|
|
}
|
|
|
|
static void pvr_pds_compute_program_setup(
|
|
const struct pvr_device_info *dev_info,
|
|
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
|
|
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
|
|
uint32_t barrier_coefficient,
|
|
bool add_base_workgroup,
|
|
uint32_t usc_temps,
|
|
pvr_dev_addr_t usc_shader_dev_addr,
|
|
struct pvr_pds_compute_shader_program *const program)
|
|
{
|
|
pvr_pds_compute_shader_program_init(program);
|
|
program->local_input_regs[0] = local_input_regs[0];
|
|
program->local_input_regs[1] = local_input_regs[1];
|
|
program->local_input_regs[2] = local_input_regs[2];
|
|
program->work_group_input_regs[0] = work_group_input_regs[0];
|
|
program->work_group_input_regs[1] = work_group_input_regs[1];
|
|
program->work_group_input_regs[2] = work_group_input_regs[2];
|
|
program->barrier_coefficient = barrier_coefficient;
|
|
program->add_base_workgroup = add_base_workgroup;
|
|
program->flattened_work_groups = true;
|
|
program->kick_usc = true;
|
|
|
|
STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
|
|
PVR_WORKGROUP_DIMENSIONS);
|
|
STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
|
|
PVR_WORKGROUP_DIMENSIONS);
|
|
STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
|
|
PVR_WORKGROUP_DIMENSIONS);
|
|
|
|
pvr_pds_setup_doutu(&program->usc_task_control,
|
|
usc_shader_dev_addr.addr,
|
|
usc_temps,
|
|
PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
|
|
false);
|
|
|
|
pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
|
|
}
|
|
|
|
/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
|
|
*/
|
|
static VkResult pvr_pds_compute_program_create_and_upload(
|
|
struct pvr_device *const device,
|
|
const VkAllocationCallbacks *const allocator,
|
|
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
|
|
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
|
|
uint32_t barrier_coefficient,
|
|
uint32_t usc_temps,
|
|
pvr_dev_addr_t usc_shader_dev_addr,
|
|
struct pvr_pds_upload *const pds_upload_out,
|
|
struct pvr_pds_info *const pds_info_out)
|
|
{
|
|
struct pvr_device_info *dev_info = &device->pdevice->dev_info;
|
|
struct pvr_pds_compute_shader_program program;
|
|
uint32_t staging_buffer_size;
|
|
uint32_t *staging_buffer;
|
|
VkResult result;
|
|
|
|
pvr_pds_compute_program_setup(dev_info,
|
|
local_input_regs,
|
|
work_group_input_regs,
|
|
barrier_coefficient,
|
|
false,
|
|
usc_temps,
|
|
usc_shader_dev_addr,
|
|
&program);
|
|
|
|
/* FIXME: According to pvr_device_init_compute_pds_program() the code size
|
|
* is in bytes. Investigate this.
|
|
*/
|
|
staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
|
|
|
|
staging_buffer = vk_alloc2(&device->vk.alloc,
|
|
allocator,
|
|
staging_buffer_size,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
|
if (!staging_buffer)
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
/* FIXME: pvr_pds_compute_shader doesn't implement
|
|
* PDS_GENERATE_CODEDATA_SEGMENTS.
|
|
*/
|
|
pvr_pds_compute_shader(&program,
|
|
&staging_buffer[0],
|
|
PDS_GENERATE_CODE_SEGMENT,
|
|
dev_info);
|
|
|
|
pvr_pds_compute_shader(&program,
|
|
&staging_buffer[program.code_size],
|
|
PDS_GENERATE_DATA_SEGMENT,
|
|
dev_info);
|
|
|
|
/* FIXME: Figure out the define for alignment of 16. */
|
|
result = pvr_gpu_upload_pds(device,
|
|
&staging_buffer[program.code_size],
|
|
program.data_size,
|
|
16,
|
|
&staging_buffer[0],
|
|
program.code_size,
|
|
16,
|
|
16,
|
|
pds_upload_out);
|
|
if (result != VK_SUCCESS) {
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
return result;
|
|
}
|
|
|
|
*pds_info_out = (struct pvr_pds_info){
|
|
.temps_required = program.highest_temp,
|
|
.code_size_in_dwords = program.code_size,
|
|
.data_size_in_dwords = program.data_size,
|
|
};
|
|
|
|
vk_free2(&device->vk.alloc, allocator, staging_buffer);
|
|
|
|
return VK_SUCCESS;
|
|
};
|
|
|
|
static void pvr_pds_compute_program_destroy(
|
|
struct pvr_device *const device,
|
|
const struct VkAllocationCallbacks *const allocator,
|
|
struct pvr_pds_upload *const pds_program,
|
|
struct pvr_pds_info *const pds_info)
|
|
{
|
|
/* We don't allocate an entries buffer so we don't need to free it */
|
|
pvr_bo_suballoc_free(pds_program->pvr_bo);
|
|
}
|
|
|
|
/* This only uploads the code segment. The data segment will need to be patched
|
|
* with the base workgroup before uploading.
|
|
*/
|
|
static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
|
|
struct pvr_device *const device,
|
|
const VkAllocationCallbacks *const allocator,
|
|
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
|
|
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
|
|
uint32_t barrier_coefficient,
|
|
uint32_t usc_temps,
|
|
pvr_dev_addr_t usc_shader_dev_addr,
|
|
struct pvr_pds_base_workgroup_program *program_out)
|
|
{
|
|
struct pvr_device_info *dev_info = &device->pdevice->dev_info;
|
|
struct pvr_pds_compute_shader_program program;
|
|
uint32_t buffer_size;
|
|
uint32_t *buffer;
|
|
VkResult result;
|
|
|
|
pvr_pds_compute_program_setup(dev_info,
|
|
local_input_regs,
|
|
work_group_input_regs,
|
|
barrier_coefficient,
|
|
true,
|
|
usc_temps,
|
|
usc_shader_dev_addr,
|
|
&program);
|
|
|
|
/* FIXME: According to pvr_device_init_compute_pds_program() the code size
|
|
* is in bytes. Investigate this.
|
|
*/
|
|
buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size));
|
|
|
|
buffer = vk_alloc2(&device->vk.alloc,
|
|
allocator,
|
|
buffer_size,
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
|
if (!buffer)
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
pvr_pds_compute_shader(&program,
|
|
&buffer[0],
|
|
PDS_GENERATE_CODE_SEGMENT,
|
|
dev_info);
|
|
|
|
/* FIXME: Figure out the define for alignment of 16. */
|
|
result = pvr_gpu_upload_pds(device,
|
|
NULL,
|
|
0,
|
|
0,
|
|
buffer,
|
|
program.code_size,
|
|
16,
|
|
16,
|
|
&program_out->code_upload);
|
|
if (result != VK_SUCCESS) {
|
|
vk_free2(&device->vk.alloc, allocator, buffer);
|
|
return result;
|
|
}
|
|
|
|
pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
|
|
|
|
program_out->data_section = buffer;
|
|
|
|
/* We'll need to patch the base workgroup in the PDS data section before
|
|
* dispatch so we save the offsets at which to patch. We only need to save
|
|
* the offset for the first workgroup id since the workgroup ids are stored
|
|
* contiguously in the data segment.
|
|
*/
|
|
program_out->base_workgroup_data_patching_offset =
|
|
program.base_workgroup_constant_offset_in_dwords[0];
|
|
|
|
program_out->info = (struct pvr_pds_info){
|
|
.temps_required = program.highest_temp,
|
|
.code_size_in_dwords = program.code_size,
|
|
.data_size_in_dwords = program.data_size,
|
|
};
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static void pvr_pds_compute_base_workgroup_variant_program_finish(
|
|
struct pvr_device *device,
|
|
const VkAllocationCallbacks *const allocator,
|
|
struct pvr_pds_base_workgroup_program *const state)
|
|
{
|
|
pvr_bo_suballoc_free(state->code_upload.pvr_bo);
|
|
vk_free2(&device->vk.alloc, allocator, state->data_section);
|
|
}
|
|
|
|
/******************************************************************************
   Generic pipeline functions
 ******************************************************************************/
|
|
|
|
static void pvr_pipeline_init(struct pvr_device *device,
|
|
enum pvr_pipeline_type type,
|
|
struct pvr_pipeline *const pipeline)
|
|
{
|
|
assert(!pipeline->layout);
|
|
|
|
vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
|
|
|
|
pipeline->type = type;
|
|
}
|
|
|
|
static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
|
|
{
|
|
vk_object_base_finish(&pipeline->base);
|
|
}
|
|
|
|
/* Number of shared registers needed to hold one pvr_dev_addr_t, rounded up.
 * Each shared reg is 32 bits wide.
 */
#define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
   DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
|
|
|
|
/**
|
|
* \brief Allocates shared registers.
|
|
*
|
|
* \return How many sh regs are required.
|
|
*/
|
|
static uint32_t
|
|
pvr_pipeline_alloc_shareds(const struct pvr_device *device,
|
|
const struct pvr_pipeline_layout *layout,
|
|
enum pvr_stage_allocation stage,
|
|
struct pvr_sh_reg_layout *const sh_reg_layout_out)
|
|
{
|
|
ASSERTED const uint64_t reserved_shared_size =
|
|
device->pdevice->dev_runtime_info.reserved_shared_size;
|
|
ASSERTED const uint64_t max_coeff =
|
|
device->pdevice->dev_runtime_info.max_coeffs;
|
|
|
|
struct pvr_sh_reg_layout reg_layout = { 0 };
|
|
uint32_t next_free_sh_reg = 0;
|
|
|
|
reg_layout.descriptor_set_addrs_table.present =
|
|
!!(layout->shader_stage_mask & BITFIELD_BIT(stage));
|
|
|
|
if (reg_layout.descriptor_set_addrs_table.present) {
|
|
reg_layout.descriptor_set_addrs_table.offset = next_free_sh_reg;
|
|
next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
|
|
}
|
|
|
|
reg_layout.push_consts.present =
|
|
!!(layout->push_constants_shader_stages & BITFIELD_BIT(stage));
|
|
|
|
if (reg_layout.push_consts.present) {
|
|
reg_layout.push_consts.offset = next_free_sh_reg;
|
|
next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
|
|
}
|
|
|
|
*sh_reg_layout_out = reg_layout;
|
|
|
|
/* FIXME: We might need to take more things into consideration.
|
|
* See pvr_calc_fscommon_size_and_tiles_in_flight().
|
|
*/
|
|
assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
|
|
|
|
return next_free_sh_reg;
|
|
}
|
|
|
|
/******************************************************************************
   Compute pipeline functions
 ******************************************************************************/
|
|
|
|
/* Compiles and uploads shaders and PDS programs. */
|
|
static VkResult pvr_compute_pipeline_compile(
|
|
struct pvr_device *const device,
|
|
struct vk_pipeline_cache *cache,
|
|
const VkComputePipelineCreateInfo *pCreateInfo,
|
|
const VkAllocationCallbacks *const allocator,
|
|
struct pvr_compute_pipeline *const compute_pipeline)
|
|
{
|
|
struct pvr_pipeline_layout *layout = compute_pipeline->base.layout;
|
|
struct pvr_sh_reg_layout *sh_reg_layout =
|
|
&layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_COMPUTE];
|
|
struct rogue_compile_time_consts_data compile_time_consts_data;
|
|
uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
|
|
struct pvr_explicit_constant_usage explicit_const_usage;
|
|
uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
|
|
struct rogue_ubo_data ubo_data;
|
|
uint32_t barrier_coefficient;
|
|
uint32_t usc_temps;
|
|
VkResult result;
|
|
|
|
if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info)) {
|
|
struct pvr_hard_code_compute_build_info build_info;
|
|
|
|
result = pvr_hard_code_compute_pipeline(device,
|
|
&compute_pipeline->shader_state,
|
|
&build_info);
|
|
if (result != VK_SUCCESS)
|
|
return result;
|
|
|
|
ubo_data = build_info.ubo_data;
|
|
compile_time_consts_data = build_info.compile_time_consts_data;
|
|
|
|
/* We make sure that the compiler's unused reg value is compatible with
|
|
* the pds api.
|
|
*/
|
|
STATIC_ASSERT(ROGUE_REG_UNUSED == PVR_PDS_COMPUTE_INPUT_REG_UNUSED);
|
|
|
|
barrier_coefficient = build_info.barrier_reg;
|
|
|
|
/* TODO: Maybe change the pds api to use pointers so we avoid the copy. */
|
|
local_input_regs[0] = build_info.local_invocation_regs[0];
|
|
local_input_regs[1] = build_info.local_invocation_regs[1];
|
|
/* This is not a mistake. We want to assign element 1 to 2. */
|
|
local_input_regs[2] = build_info.local_invocation_regs[1];
|
|
|
|
STATIC_ASSERT(
|
|
__same_type(work_group_input_regs, build_info.work_group_regs));
|
|
typed_memcpy(work_group_input_regs,
|
|
build_info.work_group_regs,
|
|
PVR_WORKGROUP_DIMENSIONS);
|
|
|
|
usc_temps = build_info.usc_temps;
|
|
|
|
explicit_const_usage = build_info.explicit_conts_usage;
|
|
|
|
} else {
|
|
uint32_t sh_count;
|
|
sh_count = pvr_pipeline_alloc_shareds(device,
|
|
layout,
|
|
PVR_STAGE_ALLOCATION_COMPUTE,
|
|
sh_reg_layout);
|
|
|
|
compute_pipeline->shader_state.const_shared_reg_count = sh_count;
|
|
|
|
/* FIXME: Compile and upload the shader. */
|
|
/* FIXME: Initialize the shader state and setup build info. */
|
|
abort();
|
|
};
|
|
|
|
result = pvr_pds_descriptor_program_create_and_upload(
|
|
device,
|
|
allocator,
|
|
&compile_time_consts_data,
|
|
&ubo_data,
|
|
&explicit_const_usage,
|
|
layout,
|
|
PVR_STAGE_ALLOCATION_COMPUTE,
|
|
sh_reg_layout,
|
|
&compute_pipeline->descriptor_state);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_shader;
|
|
|
|
result = pvr_pds_compute_program_create_and_upload(
|
|
device,
|
|
allocator,
|
|
local_input_regs,
|
|
work_group_input_regs,
|
|
barrier_coefficient,
|
|
usc_temps,
|
|
compute_pipeline->shader_state.bo->dev_addr,
|
|
&compute_pipeline->primary_program,
|
|
&compute_pipeline->primary_program_info);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_descriptor_program;
|
|
|
|
/* If the workgroup ID is required, then we require the base workgroup
|
|
* variant of the PDS compute program as well.
|
|
*/
|
|
compute_pipeline->flags.base_workgroup =
|
|
work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
|
|
work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
|
|
work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED;
|
|
|
|
if (compute_pipeline->flags.base_workgroup) {
|
|
result = pvr_pds_compute_base_workgroup_variant_program_init(
|
|
device,
|
|
allocator,
|
|
local_input_regs,
|
|
work_group_input_regs,
|
|
barrier_coefficient,
|
|
usc_temps,
|
|
compute_pipeline->shader_state.bo->dev_addr,
|
|
&compute_pipeline->primary_base_workgroup_variant_program);
|
|
if (result != VK_SUCCESS)
|
|
goto err_destroy_compute_program;
|
|
}
|
|
|
|
return VK_SUCCESS;
|
|
|
|
err_destroy_compute_program:
|
|
pvr_pds_compute_program_destroy(device,
|
|
allocator,
|
|
&compute_pipeline->primary_program,
|
|
&compute_pipeline->primary_program_info);
|
|
|
|
err_free_descriptor_program:
|
|
pvr_pds_descriptor_program_destroy(device,
|
|
allocator,
|
|
&compute_pipeline->descriptor_state);
|
|
|
|
err_free_shader:
|
|
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
|
|
|
|
return result;
|
|
}
|
|
|
|
static VkResult
|
|
pvr_compute_pipeline_init(struct pvr_device *device,
|
|
struct vk_pipeline_cache *cache,
|
|
const VkComputePipelineCreateInfo *pCreateInfo,
|
|
const VkAllocationCallbacks *allocator,
|
|
struct pvr_compute_pipeline *compute_pipeline)
|
|
{
|
|
VkResult result;
|
|
|
|
pvr_pipeline_init(device,
|
|
PVR_PIPELINE_TYPE_COMPUTE,
|
|
&compute_pipeline->base);
|
|
|
|
compute_pipeline->base.layout =
|
|
pvr_pipeline_layout_from_handle(pCreateInfo->layout);
|
|
|
|
result = pvr_compute_pipeline_compile(device,
|
|
cache,
|
|
pCreateInfo,
|
|
allocator,
|
|
compute_pipeline);
|
|
if (result != VK_SUCCESS) {
|
|
pvr_pipeline_finish(&compute_pipeline->base);
|
|
return result;
|
|
}
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static VkResult
|
|
pvr_compute_pipeline_create(struct pvr_device *device,
|
|
struct vk_pipeline_cache *cache,
|
|
const VkComputePipelineCreateInfo *pCreateInfo,
|
|
const VkAllocationCallbacks *allocator,
|
|
VkPipeline *const pipeline_out)
|
|
{
|
|
struct pvr_compute_pipeline *compute_pipeline;
|
|
VkResult result;
|
|
|
|
compute_pipeline = vk_zalloc2(&device->vk.alloc,
|
|
allocator,
|
|
sizeof(*compute_pipeline),
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
|
|
if (!compute_pipeline)
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
/* Compiles and uploads shaders and PDS programs. */
|
|
result = pvr_compute_pipeline_init(device,
|
|
cache,
|
|
pCreateInfo,
|
|
allocator,
|
|
compute_pipeline);
|
|
if (result != VK_SUCCESS) {
|
|
vk_free2(&device->vk.alloc, allocator, compute_pipeline);
|
|
return result;
|
|
}
|
|
|
|
*pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static void pvr_compute_pipeline_destroy(
|
|
struct pvr_device *const device,
|
|
const VkAllocationCallbacks *const allocator,
|
|
struct pvr_compute_pipeline *const compute_pipeline)
|
|
{
|
|
if (compute_pipeline->flags.base_workgroup) {
|
|
pvr_pds_compute_base_workgroup_variant_program_finish(
|
|
device,
|
|
allocator,
|
|
&compute_pipeline->primary_base_workgroup_variant_program);
|
|
}
|
|
|
|
pvr_pds_compute_program_destroy(device,
|
|
allocator,
|
|
&compute_pipeline->primary_program,
|
|
&compute_pipeline->primary_program_info);
|
|
pvr_pds_descriptor_program_destroy(device,
|
|
allocator,
|
|
&compute_pipeline->descriptor_state);
|
|
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
|
|
|
|
pvr_pipeline_finish(&compute_pipeline->base);
|
|
|
|
vk_free2(&device->vk.alloc, allocator, compute_pipeline);
|
|
}
|
|
|
|
VkResult
|
|
pvr_CreateComputePipelines(VkDevice _device,
|
|
VkPipelineCache pipelineCache,
|
|
uint32_t createInfoCount,
|
|
const VkComputePipelineCreateInfo *pCreateInfos,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
VkPipeline *pPipelines)
|
|
{
|
|
VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
|
|
PVR_FROM_HANDLE(pvr_device, device, _device);
|
|
VkResult result = VK_SUCCESS;
|
|
|
|
for (uint32_t i = 0; i < createInfoCount; i++) {
|
|
const VkResult local_result =
|
|
pvr_compute_pipeline_create(device,
|
|
cache,
|
|
&pCreateInfos[i],
|
|
pAllocator,
|
|
&pPipelines[i]);
|
|
if (local_result != VK_SUCCESS) {
|
|
result = local_result;
|
|
pPipelines[i] = VK_NULL_HANDLE;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/******************************************************************************
   Graphics pipeline functions
 ******************************************************************************/
|
|
|
|
static void
|
|
pvr_graphics_pipeline_destroy(struct pvr_device *const device,
|
|
const VkAllocationCallbacks *const allocator,
|
|
struct pvr_graphics_pipeline *const gfx_pipeline)
|
|
{
|
|
const uint32_t num_vertex_attrib_programs =
|
|
ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);
|
|
|
|
pvr_pds_descriptor_program_destroy(
|
|
device,
|
|
allocator,
|
|
&gfx_pipeline->shader_state.fragment.descriptor_state);
|
|
|
|
pvr_pds_descriptor_program_destroy(
|
|
device,
|
|
allocator,
|
|
&gfx_pipeline->shader_state.vertex.descriptor_state);
|
|
|
|
for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
|
|
struct pvr_pds_attrib_program *const attrib_program =
|
|
&gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];
|
|
|
|
pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
|
|
}
|
|
|
|
pvr_bo_suballoc_free(
|
|
gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
|
|
pvr_bo_suballoc_free(
|
|
gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);
|
|
|
|
pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
|
|
pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);
|
|
|
|
pvr_pipeline_finish(&gfx_pipeline->base);
|
|
|
|
vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
|
|
}
|
|
|
|
static void
|
|
pvr_vertex_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
|
|
const struct rogue_common_build_data *common_data,
|
|
uint32_t vtxin_regs_used,
|
|
const struct rogue_vs_build_data *vs_data)
|
|
{
|
|
struct pvr_vertex_shader_state *vertex_state =
|
|
&gfx_pipeline->shader_state.vertex;
|
|
|
|
/* TODO: Hard coding these for now. These should be populated based on the
|
|
* information returned by the compiler.
|
|
*/
|
|
vertex_state->stage_state.const_shared_reg_count = common_data->shareds;
|
|
vertex_state->stage_state.const_shared_reg_offset = 0;
|
|
vertex_state->stage_state.coefficient_size = common_data->coeffs;
|
|
vertex_state->stage_state.uses_atomic_ops = false;
|
|
vertex_state->stage_state.uses_texture_rw = false;
|
|
vertex_state->stage_state.uses_barrier = false;
|
|
vertex_state->stage_state.has_side_effects = false;
|
|
vertex_state->stage_state.empty_program = false;
|
|
|
|
/* This ends up unused since we'll use the temp_usage for the PDS program we
|
|
* end up selecting, and the descriptor PDS program doesn't use any temps.
|
|
* Let's set it to ~0 in case it ever gets used.
|
|
*/
|
|
vertex_state->stage_state.pds_temps_count = ~0;
|
|
|
|
vertex_state->vertex_input_size = vtxin_regs_used;
|
|
vertex_state->vertex_output_size =
|
|
vs_data->num_vertex_outputs * ROGUE_REG_SIZE_BYTES;
|
|
vertex_state->user_clip_planes_mask = 0;
|
|
vertex_state->entry_offset = 0;
|
|
|
|
/* TODO: The number of varyings should be checked against the fragment
|
|
* shader inputs and assigned in the place where that happens.
|
|
* There will also be an opportunity to cull unused fs inputs/vs outputs.
|
|
*/
|
|
pvr_csb_pack (&gfx_pipeline->shader_state.vertex.varying[0],
|
|
TA_STATE_VARYING0,
|
|
varying0) {
|
|
varying0.f32_linear = vs_data->num_varyings;
|
|
varying0.f32_flat = 0;
|
|
varying0.f32_npc = 0;
|
|
}
|
|
|
|
pvr_csb_pack (&gfx_pipeline->shader_state.vertex.varying[1],
|
|
TA_STATE_VARYING1,
|
|
varying1) {
|
|
varying1.f16_linear = 0;
|
|
varying1.f16_flat = 0;
|
|
varying1.f16_npc = 0;
|
|
}
|
|
}
|
|
|
|
static void
|
|
pvr_fragment_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
|
|
const struct rogue_common_build_data *common_data)
|
|
{
|
|
struct pvr_fragment_shader_state *fragment_state =
|
|
&gfx_pipeline->shader_state.fragment;
|
|
|
|
/* TODO: Hard coding these for now. These should be populated based on the
|
|
* information returned by the compiler.
|
|
*/
|
|
fragment_state->stage_state.const_shared_reg_count = 0;
|
|
fragment_state->stage_state.const_shared_reg_offset = 0;
|
|
fragment_state->stage_state.coefficient_size = common_data->coeffs;
|
|
fragment_state->stage_state.uses_atomic_ops = false;
|
|
fragment_state->stage_state.uses_texture_rw = false;
|
|
fragment_state->stage_state.uses_barrier = false;
|
|
fragment_state->stage_state.has_side_effects = false;
|
|
fragment_state->stage_state.empty_program = false;
|
|
|
|
fragment_state->pass_type = PVRX(TA_PASSTYPE_OPAQUE);
|
|
fragment_state->entry_offset = 0;
|
|
|
|
/* We can't initialize it yet since we still need to generate the PDS
|
|
* programs so set it to `~0` to make sure that we set this up later on.
|
|
*/
|
|
fragment_state->stage_state.pds_temps_count = ~0;
|
|
}
|
|
|
|
/**
 * \brief Tells whether a blend factor reads the blend constants.
 *
 * \return true for the CONSTANT_COLOR, ONE_MINUS_CONSTANT_COLOR,
 *         CONSTANT_ALPHA and ONE_MINUS_CONSTANT_ALPHA factors, false for
 *         every other factor.
 */
static bool pvr_blend_factor_requires_consts(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_CONSTANT_COLOR ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR ||
          factor == VK_BLEND_FACTOR_CONSTANT_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA;
}
|
|
|
|
/**
|
|
* \brief Indicates whether dynamic blend constants are needed.
|
|
*
|
|
* If the user has specified the blend constants to be dynamic, they might not
|
|
* necessarily be using them. This function makes sure that they are being used
|
|
* in order to determine whether we need to upload them later on for the shader
|
|
* to access them.
|
|
*/
|
|
static bool pvr_graphics_pipeline_requires_dynamic_blend_consts(
|
|
const struct pvr_graphics_pipeline *gfx_pipeline)
|
|
{
|
|
const struct vk_dynamic_graphics_state *const state =
|
|
&gfx_pipeline->dynamic_state;
|
|
|
|
if (BITSET_TEST(state->set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
|
|
return false;
|
|
|
|
for (uint32_t i = 0; i < state->cb.attachment_count; i++) {
|
|
const struct vk_color_blend_attachment_state *attachment =
|
|
&state->cb.attachments[i];
|
|
|
|
const bool has_color_write =
|
|
attachment->write_mask &
|
|
(VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
|
|
VK_COLOR_COMPONENT_B_BIT);
|
|
const bool has_alpha_write = attachment->write_mask &
|
|
VK_COLOR_COMPONENT_A_BIT;
|
|
|
|
if (!attachment->blend_enable || attachment->write_mask == 0)
|
|
continue;
|
|
|
|
if (has_color_write) {
|
|
const uint8_t src_color_blend_factor =
|
|
attachment->src_color_blend_factor;
|
|
const uint8_t dst_color_blend_factor =
|
|
attachment->dst_color_blend_factor;
|
|
|
|
if (pvr_blend_factor_requires_consts(src_color_blend_factor) ||
|
|
pvr_blend_factor_requires_consts(dst_color_blend_factor)) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (has_alpha_write) {
|
|
const uint8_t src_alpha_blend_factor =
|
|
attachment->src_alpha_blend_factor;
|
|
const uint8_t dst_alpha_blend_factor =
|
|
attachment->dst_alpha_blend_factor;
|
|
|
|
if (pvr_blend_factor_requires_consts(src_alpha_blend_factor) ||
|
|
pvr_blend_factor_requires_consts(dst_alpha_blend_factor)) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static uint32_t pvr_graphics_pipeline_alloc_shareds(
|
|
const struct pvr_device *device,
|
|
const struct pvr_graphics_pipeline *gfx_pipeline,
|
|
enum pvr_stage_allocation stage,
|
|
struct pvr_sh_reg_layout *const sh_reg_layout_out)
|
|
{
|
|
ASSERTED const uint64_t reserved_shared_size =
|
|
device->pdevice->dev_runtime_info.reserved_shared_size;
|
|
ASSERTED const uint64_t max_coeff =
|
|
device->pdevice->dev_runtime_info.max_coeffs;
|
|
|
|
const struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
|
|
struct pvr_sh_reg_layout reg_layout = { 0 };
|
|
uint32_t next_free_sh_reg = 0;
|
|
|
|
next_free_sh_reg =
|
|
pvr_pipeline_alloc_shareds(device, layout, stage, ®_layout);
|
|
|
|
reg_layout.blend_consts.present =
|
|
(stage == PVR_STAGE_ALLOCATION_FRAGMENT &&
|
|
pvr_graphics_pipeline_requires_dynamic_blend_consts(gfx_pipeline));
|
|
if (reg_layout.blend_consts.present) {
|
|
reg_layout.blend_consts.offset = next_free_sh_reg;
|
|
next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
|
|
}
|
|
|
|
*sh_reg_layout_out = reg_layout;
|
|
|
|
/* FIXME: We might need to take more things into consideration.
|
|
* See pvr_calc_fscommon_size_and_tiles_in_flight().
|
|
*/
|
|
assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
|
|
|
|
return next_free_sh_reg;
|
|
}
|
|
|
|
#undef PVR_DEV_ADDR_SIZE_IN_SH_REGS
|
|
|
|
static void pvr_graphics_pipeline_alloc_vertex_inputs(
|
|
const VkPipelineVertexInputStateCreateInfo *const vs_data,
|
|
rogue_vertex_inputs *const vertex_input_layout_out,
|
|
unsigned *num_vertex_input_regs_out,
|
|
pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
|
|
uint32_t *const dma_count_out)
|
|
{
|
|
const VkVertexInputBindingDescription
|
|
*sorted_bindings[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
|
|
const VkVertexInputAttributeDescription
|
|
*sorted_attributes[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
|
|
|
|
rogue_vertex_inputs build_data = {
|
|
.num_input_vars = vs_data->vertexAttributeDescriptionCount,
|
|
};
|
|
uint32_t next_reg_offset = 0;
|
|
|
|
struct pvr_pds_vertex_dma *const dma_descriptions =
|
|
*dma_descriptions_out_ptr;
|
|
uint32_t dma_count = 0;
|
|
|
|
/* Vertex attributes map to the `layout(location = x)` annotation in the
|
|
* shader where `x` is the attribute's location.
|
|
* Vertex bindings have NO relation to the shader. They have nothing to do
|
|
* with the `layout(set = x, binding = y)` notation. They instead indicate
|
|
* where the data for a collection of vertex attributes comes from. The
|
|
* application binds a VkBuffer with vkCmdBindVertexBuffers() to a specific
|
|
* binding number and based on that we'll know which buffer to DMA the data
|
|
* from, to fill in the collection of vertex attributes.
|
|
*/
|
|
|
|
for (uint32_t i = 0; i < vs_data->vertexBindingDescriptionCount; i++) {
|
|
const VkVertexInputBindingDescription *binding_desc =
|
|
&vs_data->pVertexBindingDescriptions[i];
|
|
|
|
sorted_bindings[binding_desc->binding] = binding_desc;
|
|
}
|
|
|
|
for (uint32_t i = 0; i < vs_data->vertexAttributeDescriptionCount; i++) {
|
|
const VkVertexInputAttributeDescription *attribute_desc =
|
|
&vs_data->pVertexAttributeDescriptions[i];
|
|
|
|
sorted_attributes[attribute_desc->location] = attribute_desc;
|
|
}
|
|
|
|
for (uint32_t i = 0, j = 0; i < ARRAY_SIZE(sorted_attributes); i++) {
|
|
if (sorted_attributes[i])
|
|
sorted_attributes[j++] = sorted_attributes[i];
|
|
}
|
|
|
|
for (uint32_t i = 0; i < vs_data->vertexAttributeDescriptionCount; i++) {
|
|
const VkVertexInputAttributeDescription *attribute = sorted_attributes[i];
|
|
const VkVertexInputBindingDescription *binding =
|
|
sorted_bindings[attribute->binding];
|
|
const struct util_format_description *fmt_description =
|
|
vk_format_description(attribute->format);
|
|
struct pvr_pds_vertex_dma *dma_desc = &dma_descriptions[dma_count];
|
|
unsigned vtxin_reg_offset;
|
|
|
|
/* Reg allocation. */
|
|
|
|
vtxin_reg_offset = next_reg_offset;
|
|
build_data.base[i] = vtxin_reg_offset;
|
|
|
|
if (fmt_description->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
|
|
fmt_description->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
|
|
fmt_description->block.bits % 32 != 0 || !fmt_description->is_array) {
|
|
/* For now we only support formats with 32 bit components since we
|
|
* don't need to pack/unpack them.
|
|
*/
|
|
/* TODO: Support any other format with VERTEX_BUFFER_BIT set that
|
|
* doesn't have 32 bit components if we're advertising any.
|
|
*/
|
|
assert(false);
|
|
}
|
|
|
|
/* TODO: Check if this is fine with the compiler. Does it want the amount
|
|
* of components or does it want a size in dwords to figure out how many
|
|
* vtxin regs are covered. For formats with 32 bit components the
|
|
* distinction doesn't change anything.
|
|
*/
|
|
build_data.components[i] =
|
|
util_format_get_nr_components(fmt_description->format);
|
|
|
|
next_reg_offset += build_data.components[i];
|
|
|
|
/* DMA setup. */
|
|
|
|
/* The PDS program sets up DDMADs to DMA attributes into vtxin regs.
|
|
*
|
|
* DDMAD -> Multiply, add, and DOUTD (i.e. DMA from that address).
|
|
* DMA source addr = src0 * src1 + src2
|
|
* DMA params = src3
|
|
*
|
|
* In the PDS program we setup src0 with the binding's stride and src1
|
|
* with either the instance id or vertex id (both of which get filled by
|
|
* the hardware). We setup src2 later on once we know which VkBuffer to
|
|
* DMA the data from so it's saved for later when we patch the data
|
|
* section.
|
|
*/
|
|
|
|
/* TODO: Right now we're setting up a DMA per attribute. In a case where
|
|
* there are multiple attributes packed into a single binding with
|
|
* adjacent locations we'd still be DMAing them separately. This is not
|
|
* great so the DMA setup should be smarter and could do with some
|
|
* optimization.
|
|
*/
|
|
|
|
*dma_desc = (struct pvr_pds_vertex_dma){ 0 };
|
|
|
|
/* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
|
|
* this corresponds to `attribDesc.offset`.
|
|
* The PDS program doesn't do anything with it but just save it in the
|
|
* PDS program entry.
|
|
*/
|
|
dma_desc->offset = attribute->offset;
|
|
|
|
/* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
|
|
* this corresponds to `bindingDesc.stride`.
|
|
* The PDS program will calculate the `effectiveVertexOffset` with this
|
|
* and add it to the address provided in the patched data segment.
|
|
*/
|
|
dma_desc->stride = binding->stride;
|
|
|
|
if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
|
|
dma_desc->flags = PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
|
|
else
|
|
dma_desc->flags = 0;
|
|
|
|
/* Size to DMA per vertex attribute. Used to setup src3 in the DDMAD. */
|
|
assert(fmt_description->block.bits != 0); /* Likely an unsupported fmt. */
|
|
dma_desc->size_in_dwords = fmt_description->block.bits / 32;
|
|
|
|
/* Vtxin reg offset to start DMAing into. */
|
|
dma_desc->destination = vtxin_reg_offset;
|
|
|
|
/* Will be used by the driver to figure out buffer address to patch in the
|
|
* data section. I.e. which binding we should DMA from.
|
|
*/
|
|
dma_desc->binding_index = attribute->binding;
|
|
|
|
/* We don't currently support VK_EXT_vertex_attribute_divisor so no
|
|
* repeating of instance-rate vertex attributes needed. We should always
|
|
* move on to the next vertex attribute.
|
|
*/
|
|
dma_desc->divisor = 1;
|
|
|
|
/* Will be used to generate PDS code that takes care of robust buffer
|
|
* access, and later on by the driver to write the correct robustness
|
|
* buffer address to DMA the fallback values from.
|
|
*/
|
|
dma_desc->robustness_buffer_offset =
|
|
pvr_get_robustness_buffer_format_offset(attribute->format);
|
|
|
|
/* Used by later on by the driver to figure out if the buffer is being
|
|
* accessed out of bounds, for robust buffer access.
|
|
*/
|
|
dma_desc->component_size_in_bytes =
|
|
fmt_description->block.bits / fmt_description->nr_channels / 8;
|
|
|
|
dma_count++;
|
|
};
|
|
|
|
*vertex_input_layout_out = build_data;
|
|
*num_vertex_input_regs_out = next_reg_offset;
|
|
*dma_count_out = dma_count;
|
|
}
|
|
|
|
static void pvr_graphics_pipeline_alloc_vertex_special_vars(
|
|
unsigned *num_vertex_input_regs,
|
|
struct pvr_vertex_special_vars *special_vars_layout_out)
|
|
{
|
|
unsigned next_free_reg = *num_vertex_input_regs;
|
|
struct pvr_vertex_special_vars layout;
|
|
|
|
/* We don't support VK_KHR_shader_draw_parameters or Vulkan 1.1 so no
|
|
* BaseInstance, BaseVertex, DrawIndex.
|
|
*/
|
|
|
|
/* TODO: The shader might not necessarily be using this so we'd just be
|
|
* wasting regs. Get the info from the compiler about whether or not the
|
|
* shader uses them and allocate them accordingly. For now we'll set them up
|
|
* regardless.
|
|
*/
|
|
|
|
layout.vertex_id_offset = (int16_t)next_free_reg;
|
|
next_free_reg++;
|
|
|
|
layout.instance_id_offset = (int16_t)next_free_reg;
|
|
next_free_reg++;
|
|
|
|
*num_vertex_input_regs = next_free_reg;
|
|
*special_vars_layout_out = layout;
|
|
}
|
|
|
|
/* Compiles and uploads shaders and PDS programs. */
|
|
static VkResult
|
|
pvr_graphics_pipeline_compile(struct pvr_device *const device,
|
|
struct vk_pipeline_cache *cache,
|
|
const VkGraphicsPipelineCreateInfo *pCreateInfo,
|
|
const VkAllocationCallbacks *const allocator,
|
|
struct pvr_graphics_pipeline *const gfx_pipeline)
|
|
{
|
|
/* FIXME: Remove this hard coding. */
|
|
struct pvr_explicit_constant_usage vert_explicit_const_usage = {
|
|
.start_offset = 16,
|
|
};
|
|
struct pvr_explicit_constant_usage frag_explicit_const_usage = {
|
|
.start_offset = 0,
|
|
};
|
|
static uint32_t hard_code_pipeline_n = 0;
|
|
|
|
struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
|
|
struct pvr_sh_reg_layout *sh_reg_layout_vert =
|
|
&layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
|
|
struct pvr_sh_reg_layout *sh_reg_layout_frag =
|
|
&layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_FRAGMENT];
|
|
const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
|
|
pCreateInfo->pVertexInputState;
|
|
const uint32_t cache_line_size =
|
|
rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
|
|
struct rogue_compiler *compiler = device->pdevice->compiler;
|
|
struct rogue_build_ctx *ctx;
|
|
VkResult result;
|
|
|
|
const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);
|
|
|
|
/* Vars needed for the new path. */
|
|
struct pvr_pds_vertex_dma vtx_dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
|
|
uint32_t vtx_dma_count = 0;
|
|
rogue_vertex_inputs *vertex_input_layout;
|
|
unsigned *vertex_input_reg_count;
|
|
|
|
/* TODO: The compiler should be making use of this to determine where
|
|
* specific special variables are located in the vtxin reg set.
|
|
*/
|
|
struct pvr_vertex_special_vars special_vars_layout = { 0 };
|
|
|
|
uint32_t sh_count[PVR_STAGE_ALLOCATION_COUNT] = { 0 };
|
|
|
|
/* Setup shared build context. */
|
|
ctx = rogue_build_context_create(compiler, layout);
|
|
if (!ctx)
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
vertex_input_layout = &ctx->stage_data.vs.inputs;
|
|
vertex_input_reg_count = &ctx->stage_data.vs.num_vertex_input_regs;
|
|
|
|
if (!old_path) {
|
|
pvr_graphics_pipeline_alloc_vertex_inputs(vertex_input_state,
|
|
vertex_input_layout,
|
|
vertex_input_reg_count,
|
|
&vtx_dma_descriptions,
|
|
&vtx_dma_count);
|
|
|
|
pvr_graphics_pipeline_alloc_vertex_special_vars(vertex_input_reg_count,
|
|
&special_vars_layout);
|
|
|
|
for (enum pvr_stage_allocation pvr_stage =
|
|
PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY;
|
|
pvr_stage < PVR_STAGE_ALLOCATION_COMPUTE;
|
|
++pvr_stage)
|
|
sh_count[pvr_stage] = pvr_pipeline_alloc_shareds(
|
|
device,
|
|
layout,
|
|
pvr_stage,
|
|
&layout->sh_reg_layout_per_stage[pvr_stage]);
|
|
}
|
|
|
|
/* NIR middle-end translation. */
|
|
for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
|
|
stage--) {
|
|
const VkPipelineShaderStageCreateInfo *create_info;
|
|
size_t stage_index = gfx_pipeline->stage_indices[stage];
|
|
|
|
if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info)) {
|
|
if (pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
|
|
BITFIELD_BIT(stage)) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* Skip unused/inactive stages. */
|
|
if (stage_index == ~0)
|
|
continue;
|
|
|
|
create_info = &pCreateInfo->pStages[stage_index];
|
|
|
|
/* SPIR-V to NIR. */
|
|
ctx->nir[stage] = pvr_spirv_to_nir(ctx, stage, create_info);
|
|
if (!ctx->nir[stage]) {
|
|
ralloc_free(ctx);
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
}
|
|
}
|
|
|
|
/* Pre-back-end analysis and optimization, driver data extraction. */
|
|
/* TODO: Analyze and cull unused I/O between stages. */
|
|
/* TODO: Allocate UBOs between stages;
|
|
* pipeline->layout->set_{count,layout}.
|
|
*/
|
|
|
|
/* Back-end translation. */
|
|
for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
|
|
stage--) {
|
|
if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
|
|
pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
|
|
BITFIELD_BIT(stage)) {
|
|
const struct pvr_device_info *const dev_info =
|
|
&device->pdevice->dev_info;
|
|
struct pvr_explicit_constant_usage *explicit_const_usage;
|
|
|
|
switch (stage) {
|
|
case MESA_SHADER_VERTEX:
|
|
explicit_const_usage = &vert_explicit_const_usage;
|
|
break;
|
|
|
|
case MESA_SHADER_FRAGMENT:
|
|
explicit_const_usage = &frag_explicit_const_usage;
|
|
break;
|
|
|
|
default:
|
|
unreachable("Unsupported stage.");
|
|
}
|
|
|
|
pvr_hard_code_graphics_shader(dev_info,
|
|
hard_code_pipeline_n,
|
|
stage,
|
|
&ctx->binary[stage]);
|
|
|
|
pvr_hard_code_graphics_get_build_info(dev_info,
|
|
hard_code_pipeline_n,
|
|
stage,
|
|
&ctx->common_data[stage],
|
|
&ctx->stage_data,
|
|
explicit_const_usage);
|
|
|
|
continue;
|
|
}
|
|
|
|
if (!ctx->nir[stage])
|
|
continue;
|
|
|
|
ctx->rogue[stage] = pvr_nir_to_rogue(ctx, ctx->nir[stage]);
|
|
if (!ctx->rogue[stage]) {
|
|
ralloc_free(ctx);
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
}
|
|
|
|
pvr_rogue_to_binary(ctx, ctx->rogue[stage], &ctx->binary[stage]);
|
|
if (!ctx->binary[stage].size) {
|
|
ralloc_free(ctx);
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
}
|
|
}
|
|
|
|
if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
|
|
pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
|
|
BITFIELD_BIT(MESA_SHADER_VERTEX)) {
|
|
pvr_hard_code_graphics_vertex_state(&device->pdevice->dev_info,
|
|
hard_code_pipeline_n,
|
|
&gfx_pipeline->shader_state.vertex);
|
|
} else {
|
|
pvr_vertex_state_init(gfx_pipeline,
|
|
&ctx->common_data[MESA_SHADER_VERTEX],
|
|
*vertex_input_reg_count,
|
|
&ctx->stage_data.vs);
|
|
|
|
if (!old_path) {
|
|
struct pvr_vertex_shader_state *vertex_state =
|
|
&gfx_pipeline->shader_state.vertex;
|
|
|
|
/* FIXME: For now we just overwrite it but the compiler shouldn't be
|
|
* returning the sh count since the driver is in charge of allocating
|
|
* them.
|
|
*/
|
|
vertex_state->stage_state.const_shared_reg_count =
|
|
sh_count[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
|
|
|
|
gfx_pipeline->shader_state.vertex.vertex_input_size =
|
|
ctx->stage_data.vs.num_vertex_input_regs;
|
|
}
|
|
}
|
|
|
|
result =
|
|
pvr_gpu_upload_usc(device,
|
|
util_dynarray_begin(&ctx->binary[MESA_SHADER_VERTEX]),
|
|
ctx->binary[MESA_SHADER_VERTEX].size,
|
|
cache_line_size,
|
|
&gfx_pipeline->shader_state.vertex.bo);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_build_context;
|
|
|
|
if (ctx->nir[MESA_SHADER_FRAGMENT]) {
|
|
struct pvr_fragment_shader_state *fragment_state =
|
|
&gfx_pipeline->shader_state.fragment;
|
|
|
|
if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
|
|
pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
|
|
BITFIELD_BIT(MESA_SHADER_FRAGMENT)) {
|
|
pvr_hard_code_graphics_fragment_state(
|
|
&device->pdevice->dev_info,
|
|
hard_code_pipeline_n,
|
|
&gfx_pipeline->shader_state.fragment);
|
|
} else {
|
|
pvr_fragment_state_init(gfx_pipeline,
|
|
&ctx->common_data[MESA_SHADER_FRAGMENT]);
|
|
|
|
if (!old_path) {
|
|
/* FIXME: For now we just overwrite it but the compiler shouldn't be
|
|
* returning the sh count since the driver is in charge of
|
|
* allocating them.
|
|
*/
|
|
fragment_state->stage_state.const_shared_reg_count =
|
|
sh_count[PVR_STAGE_ALLOCATION_FRAGMENT];
|
|
}
|
|
}
|
|
|
|
result = pvr_gpu_upload_usc(
|
|
device,
|
|
util_dynarray_begin(&ctx->binary[MESA_SHADER_FRAGMENT]),
|
|
ctx->binary[MESA_SHADER_FRAGMENT].size,
|
|
cache_line_size,
|
|
&gfx_pipeline->shader_state.fragment.bo);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_vertex_bo;
|
|
|
|
/* TODO: powervr has an optimization where it attempts to recompile
|
|
* shaders. See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented
|
|
* since in our case the optimization doesn't happen.
|
|
*/
|
|
|
|
result = pvr_pds_coeff_program_create_and_upload(
|
|
device,
|
|
allocator,
|
|
ctx->stage_data.fs.iterator_args.fpu_iterators,
|
|
ctx->stage_data.fs.iterator_args.num_fpu_iterators,
|
|
ctx->stage_data.fs.iterator_args.destination,
|
|
&fragment_state->pds_coeff_program,
|
|
&fragment_state->stage_state.pds_temps_count);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_fragment_bo;
|
|
|
|
result = pvr_pds_fragment_program_create_and_upload(
|
|
device,
|
|
allocator,
|
|
gfx_pipeline->shader_state.fragment.bo,
|
|
ctx->common_data[MESA_SHADER_FRAGMENT].temps,
|
|
ctx->stage_data.fs.msaa_mode,
|
|
ctx->stage_data.fs.phas,
|
|
&fragment_state->pds_fragment_program);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_coeff_program;
|
|
|
|
/* FIXME: For now we pass in the same explicit_const_usage since it
|
|
* contains all invalid entries. Fix this by hooking it up to the
|
|
* compiler.
|
|
*/
|
|
result = pvr_pds_descriptor_program_create_and_upload(
|
|
device,
|
|
allocator,
|
|
&ctx->common_data[MESA_SHADER_FRAGMENT].compile_time_consts_data,
|
|
&ctx->common_data[MESA_SHADER_FRAGMENT].ubo_data,
|
|
&frag_explicit_const_usage,
|
|
layout,
|
|
PVR_STAGE_ALLOCATION_FRAGMENT,
|
|
sh_reg_layout_frag,
|
|
&fragment_state->descriptor_state);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_frag_program;
|
|
|
|
/* If not, we need to MAX2() and set
|
|
* `fragment_state->stage_state.pds_temps_count` appropriately.
|
|
*/
|
|
assert(fragment_state->descriptor_state.pds_info.temps_required == 0);
|
|
}
|
|
|
|
result = pvr_pds_vertex_attrib_programs_create_and_upload(
|
|
device,
|
|
allocator,
|
|
vertex_input_state,
|
|
ctx->common_data[MESA_SHADER_VERTEX].temps,
|
|
&ctx->stage_data.vs,
|
|
vtx_dma_descriptions,
|
|
vtx_dma_count,
|
|
&special_vars_layout,
|
|
&gfx_pipeline->shader_state.vertex.pds_attrib_programs);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_frag_descriptor_program;
|
|
|
|
result = pvr_pds_descriptor_program_create_and_upload(
|
|
device,
|
|
allocator,
|
|
&ctx->common_data[MESA_SHADER_VERTEX].compile_time_consts_data,
|
|
&ctx->common_data[MESA_SHADER_VERTEX].ubo_data,
|
|
&vert_explicit_const_usage,
|
|
layout,
|
|
PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
|
|
sh_reg_layout_vert,
|
|
&gfx_pipeline->shader_state.vertex.descriptor_state);
|
|
if (result != VK_SUCCESS)
|
|
goto err_free_vertex_attrib_program;
|
|
|
|
/* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
|
|
* scratch buffer for both vertex and fragment stage.
|
|
* Figure out the best place to do this.
|
|
*/
|
|
/* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
|
|
/* TODO: Implement spilling with the above. */
|
|
|
|
ralloc_free(ctx);
|
|
|
|
hard_code_pipeline_n++;
|
|
|
|
return VK_SUCCESS;
|
|
|
|
err_free_vertex_attrib_program:
|
|
for (uint32_t i = 0;
|
|
i < ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);
|
|
i++) {
|
|
struct pvr_pds_attrib_program *const attrib_program =
|
|
&gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];
|
|
|
|
pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
|
|
}
|
|
err_free_frag_descriptor_program:
|
|
pvr_pds_descriptor_program_destroy(
|
|
device,
|
|
allocator,
|
|
&gfx_pipeline->shader_state.fragment.descriptor_state);
|
|
err_free_frag_program:
|
|
pvr_bo_suballoc_free(
|
|
gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
|
|
err_free_coeff_program:
|
|
pvr_bo_suballoc_free(
|
|
gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);
|
|
err_free_fragment_bo:
|
|
pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
|
|
err_free_vertex_bo:
|
|
pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);
|
|
err_free_build_context:
|
|
ralloc_free(ctx);
|
|
return result;
|
|
}
|
|
|
|
static struct vk_render_pass_state
|
|
pvr_create_renderpass_state(const VkGraphicsPipelineCreateInfo *const info)
|
|
{
|
|
PVR_FROM_HANDLE(pvr_render_pass, pass, info->renderPass);
|
|
const struct pvr_render_subpass *const subpass =
|
|
&pass->subpasses[info->subpass];
|
|
|
|
enum vk_rp_attachment_flags attachments = 0;
|
|
|
|
assert(info->subpass < pass->subpass_count);
|
|
|
|
for (uint32_t i = 0; i < subpass->color_count; i++) {
|
|
if (pass->attachments[subpass->color_attachments[i]].aspects)
|
|
attachments |= MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << i;
|
|
}
|
|
|
|
if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
|
|
VkImageAspectFlags ds_aspects =
|
|
pass->attachments[subpass->depth_stencil_attachment].aspects;
|
|
if (ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
|
|
attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
|
|
if (ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
|
|
attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
|
|
}
|
|
|
|
return (struct vk_render_pass_state){
|
|
.attachments = attachments,
|
|
|
|
/* TODO: This is only needed for VK_KHR_create_renderpass2 (or core 1.2),
|
|
* which is not currently supported.
|
|
*/
|
|
.view_mask = 0,
|
|
};
|
|
}
|
|
|
|
static VkResult
|
|
pvr_graphics_pipeline_init(struct pvr_device *device,
|
|
struct vk_pipeline_cache *cache,
|
|
const VkGraphicsPipelineCreateInfo *pCreateInfo,
|
|
const VkAllocationCallbacks *allocator,
|
|
struct pvr_graphics_pipeline *gfx_pipeline)
|
|
{
|
|
struct vk_dynamic_graphics_state *const dynamic_state =
|
|
&gfx_pipeline->dynamic_state;
|
|
const struct vk_render_pass_state rp_state =
|
|
pvr_create_renderpass_state(pCreateInfo);
|
|
|
|
struct vk_graphics_pipeline_all_state all_state;
|
|
struct vk_graphics_pipeline_state state = { 0 };
|
|
|
|
VkResult result;
|
|
|
|
pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);
|
|
|
|
result = vk_graphics_pipeline_state_fill(&device->vk,
|
|
&state,
|
|
pCreateInfo,
|
|
&rp_state,
|
|
0,
|
|
&all_state,
|
|
NULL,
|
|
0,
|
|
NULL);
|
|
if (result != VK_SUCCESS)
|
|
goto err_pipeline_finish;
|
|
|
|
vk_dynamic_graphics_state_init(dynamic_state);
|
|
|
|
/* Load static state into base dynamic state holder. */
|
|
vk_dynamic_graphics_state_fill(dynamic_state, &state);
|
|
|
|
/* The value of ms.rasterization_samples is undefined when
|
|
* rasterizer_discard_enable is set, but we need a specific value.
|
|
* Fill that in here.
|
|
*/
|
|
if (state.rs->rasterizer_discard_enable)
|
|
dynamic_state->ms.rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
|
|
|
|
memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));
|
|
|
|
for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
|
|
VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
|
|
gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
|
|
/* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
|
|
*
|
|
* "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
|
|
* or VK_SHADER_STAGE_ALL."
|
|
*
|
|
* So we don't handle that.
|
|
*
|
|
* We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
|
|
* VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
|
|
* 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
|
|
* structure returned by the driver.
|
|
*/
|
|
switch (pCreateInfo->pStages[i].stage) {
|
|
case VK_SHADER_STAGE_VERTEX_BIT:
|
|
case VK_SHADER_STAGE_FRAGMENT_BIT:
|
|
gfx_pipeline->stage_indices[gl_stage] = i;
|
|
break;
|
|
default:
|
|
unreachable("Unsupported stage.");
|
|
}
|
|
}
|
|
|
|
gfx_pipeline->base.layout =
|
|
pvr_pipeline_layout_from_handle(pCreateInfo->layout);
|
|
|
|
/* Compiles and uploads shaders and PDS programs. */
|
|
result = pvr_graphics_pipeline_compile(device,
|
|
cache,
|
|
pCreateInfo,
|
|
allocator,
|
|
gfx_pipeline);
|
|
if (result != VK_SUCCESS)
|
|
goto err_pipeline_finish;
|
|
|
|
return VK_SUCCESS;
|
|
|
|
err_pipeline_finish:
|
|
pvr_pipeline_finish(&gfx_pipeline->base);
|
|
|
|
return result;
|
|
}
|
|
|
|
/* If allocator == NULL, the internal one will be used. */
|
|
static VkResult
|
|
pvr_graphics_pipeline_create(struct pvr_device *device,
|
|
struct vk_pipeline_cache *cache,
|
|
const VkGraphicsPipelineCreateInfo *pCreateInfo,
|
|
const VkAllocationCallbacks *allocator,
|
|
VkPipeline *const pipeline_out)
|
|
{
|
|
struct pvr_graphics_pipeline *gfx_pipeline;
|
|
VkResult result;
|
|
|
|
gfx_pipeline = vk_zalloc2(&device->vk.alloc,
|
|
allocator,
|
|
sizeof(*gfx_pipeline),
|
|
8,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
|
|
if (!gfx_pipeline)
|
|
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
/* Compiles and uploads shaders and PDS programs too. */
|
|
result = pvr_graphics_pipeline_init(device,
|
|
cache,
|
|
pCreateInfo,
|
|
allocator,
|
|
gfx_pipeline);
|
|
if (result != VK_SUCCESS) {
|
|
vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
|
|
return result;
|
|
}
|
|
|
|
*pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
VkResult
|
|
pvr_CreateGraphicsPipelines(VkDevice _device,
|
|
VkPipelineCache pipelineCache,
|
|
uint32_t createInfoCount,
|
|
const VkGraphicsPipelineCreateInfo *pCreateInfos,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
VkPipeline *pPipelines)
|
|
{
|
|
VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
|
|
PVR_FROM_HANDLE(pvr_device, device, _device);
|
|
VkResult result = VK_SUCCESS;
|
|
|
|
for (uint32_t i = 0; i < createInfoCount; i++) {
|
|
const VkResult local_result =
|
|
pvr_graphics_pipeline_create(device,
|
|
cache,
|
|
&pCreateInfos[i],
|
|
pAllocator,
|
|
&pPipelines[i]);
|
|
if (local_result != VK_SUCCESS) {
|
|
result = local_result;
|
|
pPipelines[i] = VK_NULL_HANDLE;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/*****************************************************************************
|
|
Other functions
|
|
*****************************************************************************/
|
|
|
|
void pvr_DestroyPipeline(VkDevice _device,
|
|
VkPipeline _pipeline,
|
|
const VkAllocationCallbacks *pAllocator)
|
|
{
|
|
PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
|
|
PVR_FROM_HANDLE(pvr_device, device, _device);
|
|
|
|
if (!pipeline)
|
|
return;
|
|
|
|
switch (pipeline->type) {
|
|
case PVR_PIPELINE_TYPE_GRAPHICS: {
|
|
struct pvr_graphics_pipeline *const gfx_pipeline =
|
|
to_pvr_graphics_pipeline(pipeline);
|
|
|
|
pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
|
|
break;
|
|
}
|
|
|
|
case PVR_PIPELINE_TYPE_COMPUTE: {
|
|
struct pvr_compute_pipeline *const compute_pipeline =
|
|
to_pvr_compute_pipeline(pipeline);
|
|
|
|
pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
|
|
break;
|
|
}
|
|
|
|
default:
|
|
unreachable("Unknown pipeline type.");
|
|
}
|
|
}
|