mesa/src/imagination/vulkan/pvr_arch_cmd_buffer.c
Erik Faye-Lund e762592bff pvr: build pvr_arch_*.c as multi-arch sources
This will allow us to build this multiple times for different
architectures. For now, it only defines a single architecture, because
that's what we currently support. But this makes room for future
architectures that will follow relatively soon.

Co-authored-by: Ashish Chauhan <ashish.chauhan@imgtec.com>
Acked-by: Frank Binns <frank.binns@imgtec.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38922>
2025-12-19 10:35:05 +01:00

/*
* Copyright © 2022 Imagination Technologies Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "pvr_cmd_buffer.h"
#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>
#include "hwdef/rogue_hw_defs.h"
#include "hwdef/rogue_hw_utils.h"
#include "pvr_blit.h"
#include "pvr_bo.h"
#include "pvr_buffer.h"
#include "pvr_clear.h"
#include "pvr_common.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_descriptor_set.h"
#include "pvr_device.h"
#include "pvr_device_info.h"
#include "pvr_entrypoints.h"
#include "pvr_formats.h"
#include "pvr_framebuffer.h"
#include "pvr_hw_pass.h"
#include "pvr_image.h"
#include "pvr_job_common.h"
#include "pvr_job_render.h"
#include "pvr_limits.h"
#include "pvr_macros.h"
#include "pvr_pass.h"
#include "pvr_pds.h"
#include "pvr_physical_device.h"
#include "pvr_pipeline.h"
#include "pvr_query.h"
#include "pvr_rt_dataset.h"
#include "pvr_tex_state.h"
#include "pvr_types.h"
#include "pvr_usc.h"
#include "pvr_winsys.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/compiler.h"
#include "util/list.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "util/u_pack_color.h"
#include "vk_alloc.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_common_entrypoints.h"
#include "vk_format.h"
#include "vk_graphics_state.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_pipeline_layout.h"
#include "vk_synchronization.h"
#include "vk_util.h"
/* Structure used to pass data into pvr_compute_generate_control_stream()
* function.
*/
struct pvr_compute_kernel_info {
pvr_dev_addr_t indirect_buffer_addr;
bool global_offsets_present;
uint32_t usc_common_size;
uint32_t usc_unified_size;
uint32_t pds_temp_size;
uint32_t pds_data_size;
enum ROGUE_CDMCTRL_USC_TARGET usc_target;
bool is_fence;
uint32_t pds_data_offset;
uint32_t pds_code_offset;
enum ROGUE_CDMCTRL_SD_TYPE sd_type;
bool usc_common_shared;
uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
uint32_t max_instances;
};
static void
pvr_dynamic_render_info_destroy(const struct pvr_device *device,
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_dynamic_render_info *dr_info);
static void pvr_cmd_buffer_clear_values_free(struct pvr_cmd_buffer *cmd_buffer);
static void pvr_cmd_buffer_attachments_free(struct pvr_cmd_buffer *cmd_buffer);
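/* For dynamic rendering the render pass info carries a single hw_render in
* dr_info, so that one is returned regardless of idx; otherwise the hw_render
* is looked up by index in the render pass' hw_setup.
*/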
struct pvr_renderpass_hwsetup_render *PVR_PER_ARCH(pass_info_get_hw_render)(
const struct pvr_render_pass_info *render_pass_info,
uint32_t idx)
{
if (render_pass_info->dr_info)
return &render_pass_info->dr_info->hw_render;
return &render_pass_info->pass->hw_setup->renders[idx];
}
static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd *sub_cmd)
{
bool secondary_cont;
if (sub_cmd->owned) {
switch (sub_cmd->type) {
case PVR_SUB_CMD_TYPE_GRAPHICS:
secondary_cont = cmd_buffer->vk.level ==
VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
(cmd_buffer->usage_flags &
VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
util_dynarray_fini(&sub_cmd->gfx.sec_query_indices);
pvr_csb_finish(&sub_cmd->gfx.control_stream);
pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream);
pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.multiview_ctrl_stream);
pvr_bo_suballoc_free(sub_cmd->gfx.depth_bias_bo);
pvr_bo_suballoc_free(sub_cmd->gfx.scissor_bo);
if (sub_cmd->is_dynamic_render && !secondary_cont) {
pvr_rstate_entry_remove(cmd_buffer->device, sub_cmd->gfx.rstate);
pvr_dynamic_render_info_destroy(cmd_buffer->device,
cmd_buffer,
sub_cmd->gfx.dr_info);
}
break;
case PVR_SUB_CMD_TYPE_COMPUTE:
case PVR_SUB_CMD_TYPE_QUERY:
pvr_csb_finish(&sub_cmd->compute.control_stream);
break;
case PVR_SUB_CMD_TYPE_TRANSFER:
list_for_each_entry_safe (struct pvr_transfer_cmd,
transfer_cmd,
sub_cmd->transfer.transfer_cmds,
link) {
list_del(&transfer_cmd->link);
vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
}
break;
case PVR_SUB_CMD_TYPE_EVENT:
if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT)
vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events);
break;
default:
UNREACHABLE("Unsupported sub-command type");
}
}
list_del(&sub_cmd->link);
vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
}
static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
{
list_for_each_entry_safe (struct pvr_sub_cmd,
sub_cmd,
&cmd_buffer->sub_cmds,
link) {
pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
}
}
static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer)
{
pvr_cmd_buffer_attachments_free(cmd_buffer);
pvr_cmd_buffer_clear_values_free(cmd_buffer);
util_dynarray_fini(&cmd_buffer->state.query_indices);
pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
list_for_each_entry_safe (struct pvr_suballoc_bo,
suballoc_bo,
&cmd_buffer->bo_list,
link) {
list_del(&suballoc_bo->link);
pvr_bo_suballoc_free(suballoc_bo);
}
/* At the end of any graphics job, all of these get moved elsewhere */
assert(list_is_empty(&cmd_buffer->deferred_clears));
util_dynarray_fini(&cmd_buffer->deferred_csb_commands);
util_dynarray_fini(&cmd_buffer->scissor_array);
util_dynarray_fini(&cmd_buffer->depth_bias_array);
}
static void pvr_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
VkCommandBufferResetFlags flags)
{
struct pvr_cmd_buffer *cmd_buffer =
container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
/* FIXME: For now we always free all resources as if
* VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
*/
pvr_cmd_buffer_free_resources(cmd_buffer);
vk_command_buffer_reset(&cmd_buffer->vk);
memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
memset(&cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words));
cmd_buffer->usage_flags = 0;
}
static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
struct pvr_cmd_buffer *cmd_buffer =
container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
pvr_cmd_buffer_free_resources(cmd_buffer);
vk_command_buffer_finish(&cmd_buffer->vk);
vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}
static const struct vk_command_buffer_ops cmd_buffer_ops = {
.reset = pvr_cmd_buffer_reset,
.destroy = pvr_cmd_buffer_destroy,
};
static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
struct vk_command_pool *pool,
VkCommandBufferLevel level,
VkCommandBuffer *pCommandBuffer)
{
struct pvr_cmd_buffer *cmd_buffer;
VkResult result;
cmd_buffer = vk_zalloc(&pool->alloc,
sizeof(*cmd_buffer),
8U,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!cmd_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
result =
vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level);
if (result != VK_SUCCESS) {
vk_free(&pool->alloc, cmd_buffer);
return result;
}
cmd_buffer->device = device;
cmd_buffer->depth_bias_array = UTIL_DYNARRAY_INIT;
cmd_buffer->scissor_array = UTIL_DYNARRAY_INIT;
cmd_buffer->deferred_csb_commands = UTIL_DYNARRAY_INIT;
list_inithead(&cmd_buffer->deferred_clears);
list_inithead(&cmd_buffer->sub_cmds);
list_inithead(&cmd_buffer->bo_list);
*pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);
return VK_SUCCESS;
}
VkResult PVR_PER_ARCH(AllocateCommandBuffers)(
VkDevice _device,
const VkCommandBufferAllocateInfo *pAllocateInfo,
VkCommandBuffer *pCommandBuffers)
{
VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
VK_FROM_HANDLE(pvr_device, device, _device);
VkResult result = VK_SUCCESS;
uint32_t i;
for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
result = pvr_cmd_buffer_create(device,
pool,
pAllocateInfo->level,
&pCommandBuffers[i]);
if (result != VK_SUCCESS)
break;
}
if (result != VK_SUCCESS) {
while (i--) {
VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
pvr_cmd_buffer_destroy(cmd_buffer);
}
for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
pCommandBuffers[i] = VK_NULL_HANDLE;
}
return result;
}
static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
enum pvr_sub_cmd_type type)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
uint32_t barriers;
switch (type) {
case PVR_SUB_CMD_TYPE_GRAPHICS:
barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
break;
case PVR_SUB_CMD_TYPE_COMPUTE:
barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
break;
case PVR_SUB_CMD_TYPE_QUERY:
case PVR_SUB_CMD_TYPE_TRANSFER:
/* Compute jobs are used for queries, but to copy the results we
* have to sync with transfer jobs because vkCmdCopyQueryPoolResults() is
* deemed a transfer operation by the spec.
*/
barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
break;
case PVR_SUB_CMD_TYPE_EVENT:
barriers = 0;
break;
default:
UNREACHABLE("Unsupported sub-command type");
}
for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
state->barriers_needed[i] |= barriers;
}
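/* Upload the depth bias and scissor tables accumulated on the CPU into
* GPU-visible buffers (aligned to the SLC cache line size) and attach them to
* the gfx sub-command. The dynarrays are cleared on success so the next
* sub-command starts with empty tables.
*/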
static VkResult
pvr_cmd_buffer_upload_tables(struct pvr_device *device,
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd)
{
const uint32_t cache_line_size =
pvr_get_slc_cache_line_size(&device->pdevice->dev_info);
VkResult result;
assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);
if (cmd_buffer->depth_bias_array.size > 0) {
result =
pvr_gpu_upload(device,
device->heaps.general_heap,
util_dynarray_begin(&cmd_buffer->depth_bias_array),
cmd_buffer->depth_bias_array.size,
cache_line_size,
&sub_cmd->depth_bias_bo);
if (result != VK_SUCCESS)
return result;
}
if (cmd_buffer->scissor_array.size > 0) {
result = pvr_gpu_upload(device,
device->heaps.general_heap,
util_dynarray_begin(&cmd_buffer->scissor_array),
cmd_buffer->scissor_array.size,
cache_line_size,
&sub_cmd->scissor_bo);
if (result != VK_SUCCESS)
goto err_free_depth_bias_bo;
}
util_dynarray_clear(&cmd_buffer->depth_bias_array);
util_dynarray_clear(&cmd_buffer->scissor_array);
return VK_SUCCESS;
err_free_depth_bias_bo:
pvr_bo_suballoc_free(sub_cmd->depth_bias_bo);
sub_cmd->depth_bias_bo = NULL;
return result;
}
static VkResult
pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_csb *const csb)
{
const struct pvr_render_state *const rstate =
cmd_buffer->state.render_pass_info.rstate;
assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS ||
csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED);
pvr_csb_set_relocation_mark(csb);
pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) {
state0.addrmsb = rstate->ppp_state_bo->dev_addr;
state0.word_count = rstate->ppp_state_size;
}
pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) {
state1.addrlsb = rstate->ppp_state_bo->dev_addr;
}
pvr_csb_clear_relocation_mark(csb);
return csb->status;
}
VkResult PVR_PER_ARCH(cmd_buffer_upload_general)(
struct pvr_cmd_buffer *const cmd_buffer,
const void *const data,
const size_t size,
struct pvr_suballoc_bo **const pvr_bo_out)
{
struct pvr_device *const device = cmd_buffer->device;
const uint32_t cache_line_size =
pvr_get_slc_cache_line_size(&device->pdevice->dev_info);
struct pvr_suballoc_bo *suballoc_bo;
VkResult result;
result = pvr_gpu_upload(device,
device->heaps.general_heap,
data,
size,
cache_line_size,
&suballoc_bo);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
*pvr_bo_out = suballoc_bo;
return VK_SUCCESS;
}
static VkResult
pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
const void *const code,
const size_t code_size,
uint64_t code_alignment,
struct pvr_suballoc_bo **const pvr_bo_out)
{
struct pvr_device *const device = cmd_buffer->device;
const uint32_t cache_line_size =
pvr_get_slc_cache_line_size(&device->pdevice->dev_info);
struct pvr_suballoc_bo *suballoc_bo;
VkResult result;
code_alignment = MAX2(code_alignment, cache_line_size);
result =
pvr_gpu_upload_usc(device, code, code_size, code_alignment, &suballoc_bo);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
*pvr_bo_out = suballoc_bo;
return VK_SUCCESS;
}
VkResult
PVR_PER_ARCH(cmd_buffer_upload_pds)(struct pvr_cmd_buffer *const cmd_buffer,
const uint32_t *data,
uint32_t data_size_dwords,
uint32_t data_alignment,
const uint32_t *code,
uint32_t code_size_dwords,
uint32_t code_alignment,
uint64_t min_alignment,
struct pvr_pds_upload *const pds_upload_out)
{
struct pvr_device *const device = cmd_buffer->device;
VkResult result;
result = pvr_gpu_upload_pds(device,
data,
data_size_dwords,
data_alignment,
code,
code_size_dwords,
code_alignment,
min_alignment,
pds_upload_out);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);
return VK_SUCCESS;
}
static inline VkResult
pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
const uint32_t *data,
uint32_t data_size_dwords,
uint32_t data_alignment,
struct pvr_pds_upload *const pds_upload_out)
{
return pvr_cmd_buffer_upload_pds(cmd_buffer,
data,
data_size_dwords,
data_alignment,
NULL,
0,
0,
data_alignment,
pds_upload_out);
}
/* pbe_cs_words must be an array of emit_count entries, each
* ROGUE_NUM_PBESTATE_STATE_WORDS dwords long.
*/
static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
struct pvr_cmd_buffer *const cmd_buffer,
const uint32_t emit_count,
const uint32_t *pbe_cs_words,
const unsigned *tile_buffer_ids,
unsigned pixel_output_width,
struct pvr_pds_upload *const pds_upload_out)
{
struct pvr_pds_event_program pixel_event_program = {
/* No data to DMA, just a DOUTU needed. */
.num_emit_word_pairs = 0,
};
const uint32_t staging_buffer_size =
PVR_DW_TO_BYTES(cmd_buffer->device->pixel_event_data_size_in_dwords);
const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
struct pvr_device *const device = cmd_buffer->device;
const struct pvr_device_tile_buffer_state *tile_buffer_state =
&device->tile_buffer_state;
struct pvr_suballoc_bo *usc_eot_program = NULL;
struct pvr_eot_props props = {
.emit_count = emit_count,
.shared_words = false,
.state_words = pbe_cs_words,
};
uint32_t *staging_buffer;
uint32_t usc_temp_count;
pco_shader *eot;
VkResult result;
bool has_tile_buffers = false;
for (unsigned u = 0; u < emit_count; ++u) {
unsigned tile_buffer_id = tile_buffer_ids[u];
if (tile_buffer_id == ~0)
continue;
assert(tile_buffer_id < tile_buffer_state->buffer_count);
props.tile_buffer_addrs[u] =
tile_buffer_state->buffers[tile_buffer_id]->vma->dev_addr.addr;
has_tile_buffers = true;
}
if (has_tile_buffers) {
props.num_output_regs = pixel_output_width;
props.msaa_samples =
cmd_buffer->vk.dynamic_graphics_state.ms.rasterization_samples;
if (!props.msaa_samples)
props.msaa_samples = 1;
}
eot =
pvr_usc_eot(device->pdevice->pco_ctx, &props, &device->pdevice->dev_info);
usc_temp_count = pco_shader_data(eot)->common.temps;
result = pvr_cmd_buffer_upload_usc(cmd_buffer,
pco_shader_binary_data(eot),
pco_shader_binary_size(eot),
4,
&usc_eot_program);
ralloc_free(eot);
if (result != VK_SUCCESS)
return result;
pvr_pds_setup_doutu(&pixel_event_program.task_control,
usc_eot_program->dev_addr.addr,
usc_temp_count,
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
false);
/* TODO: We could skip allocating this and generate directly into the device
* buffer, thus removing one allocation and memcpy() per job. Would this
* speed things up in a noticeable way?
*/
staging_buffer = vk_alloc(allocator,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_usc_pixel_program;
}
/* Generate the data segment. The code segment was uploaded earlier when
* setting up the PDS static heap data.
*/
pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
staging_buffer,
&device->pdevice->dev_info);
result = pvr_cmd_buffer_upload_pds_data(
cmd_buffer,
staging_buffer,
cmd_buffer->device->pixel_event_data_size_in_dwords,
4,
pds_upload_out);
vk_free(allocator, staging_buffer);
return result;
err_free_usc_pixel_program:
list_del(&usc_eot_program->link);
pvr_bo_suballoc_free(usc_eot_program);
return result;
}
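/* Build a minimal control stream that re-emits the current PPP state and then
* terminates. The stream is baked into a single bo which is kept on the gfx
* sub-command as terminate_ctrl_stream.
*/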
static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream(
struct pvr_device *const device,
const struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
{
struct list_head bo_list;
struct pvr_csb csb;
VkResult result;
pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb);
if (result != VK_SUCCESS)
goto err_csb_finish;
result = pvr_csb_emit_terminate(&csb);
if (result != VK_SUCCESS)
goto err_csb_finish;
result = pvr_csb_bake(&csb, &bo_list);
if (result != VK_SUCCESS)
goto err_csb_finish;
/* This is a trivial control stream, there's no reason it should ever require
* more memory than a single bo can provide.
*/
assert(list_is_singular(&bo_list));
gfx_sub_cmd->terminate_ctrl_stream =
list_first_entry(&bo_list, struct pvr_bo, link);
return VK_SUCCESS;
err_csb_finish:
pvr_csb_finish(&csb);
return result;
}
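/* Fill in a combined image + sampler descriptor for a load-op read of the
* given attachment view: point-sampled, clamp-to-edge, non-normalized
* coordinates, addressing the layer selected by view_index.
*/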
static VkResult pvr_setup_texture_state_words(
struct pvr_device *device,
struct pvr_combined_image_sampler_descriptor *descriptor,
const struct pvr_image_view *image_view,
uint32_t view_index)
{
const struct pvr_image *image = vk_to_pvr_image(image_view->vk.image);
struct pvr_texture_state_info info = {
.format = image_view->vk.format,
.mem_layout = image->memlayout,
.type = image_view->vk.view_type,
.is_cube = image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE ||
image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
.tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
.extent = image_view->vk.extent,
.mip_levels = 1,
.sample_count = image_view->vk.image->samples,
.stride = image->physical_extent.width,
.offset = image->layer_size * view_index,
.addr = image->dev_addr,
};
const uint8_t *const swizzle = pvr_get_format_swizzle(info.format);
VkResult result;
memcpy(&info.swizzle, swizzle, sizeof(info.swizzle));
/* TODO: Can we use image_view->texture_state instead of generating here? */
result = pvr_pack_tex_state(device, &info, &descriptor->image);
if (result != VK_SUCCESS)
return result;
pvr_csb_pack (&descriptor->sampler.words[0],
TEXSTATE_SAMPLER_WORD0,
sampler) {
sampler.non_normalized_coords = true;
sampler.addrmode_v = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
sampler.addrmode_u = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
sampler.minfilter = ROGUE_TEXSTATE_FILTER_POINT;
sampler.magfilter = ROGUE_TEXSTATE_FILTER_POINT;
sampler.dadjust = ROGUE_TEXSTATE_DADJUST_ZERO_UINT;
}
pvr_csb_pack (&descriptor->sampler.words[1],
TEXSTATE_SAMPLER_WORD1,
sampler) {}
return VK_SUCCESS;
}
static VkResult
pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_load_op *load_op,
uint32_t view_index,
pvr_dev_addr_t *const addr_out)
{
const struct pvr_render_pass_info *render_pass_info =
&cmd_buffer->state.render_pass_info;
const struct pvr_render_pass *pass = render_pass_info->pass;
struct pvr_suballoc_bo *clear_bo;
uint32_t attachment_count;
bool has_depth_clear;
bool has_depth_load;
VkResult result;
struct pvr_combined_image_sampler_descriptor
texture_states[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS];
uint32_t texture_count = 0;
uint32_t hw_clear_value[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS *
PVR_CLEAR_COLOR_ARRAY_SIZE];
uint32_t next_clear_consts = 0;
unsigned buffer_size;
uint8_t *buffer;
if (load_op->is_hw_object)
attachment_count = load_op->hw_render->color_init_count;
else
attachment_count = load_op->subpass->color_count;
for (uint32_t i = 0; i < attachment_count; i++) {
struct pvr_image_view *image_view;
uint32_t attachment_idx;
const VkClearValue *clear_value;
if (load_op->is_hw_object)
attachment_idx = load_op->hw_render->color_init[i].index;
else
attachment_idx = load_op->subpass->color_attachments[i];
image_view = render_pass_info->attachments[attachment_idx];
clear_value = &render_pass_info->clear_values[attachment_idx];
assert((load_op->clears_loads_state.rt_load_mask &
load_op->clears_loads_state.rt_clear_mask) == 0);
if (load_op->clears_loads_state.rt_load_mask & BITFIELD_BIT(i)) {
result = pvr_setup_texture_state_words(cmd_buffer->device,
&texture_states[texture_count],
image_view,
view_index);
if (result != VK_SUCCESS)
return result;
texture_count++;
} else if (load_op->clears_loads_state.rt_clear_mask & BITFIELD_BIT(i)) {
const uint32_t accum_fmt_size =
pvr_get_pbe_accum_format_size_in_bytes(image_view->vk.format);
assert(next_clear_consts +
vk_format_get_blocksize(image_view->vk.format) <=
ARRAY_SIZE(hw_clear_value));
/* FIXME: do this at the point we store the clear values? */
pvr_get_hw_clear_color(image_view->vk.format,
clear_value->color,
&hw_clear_value[next_clear_consts]);
next_clear_consts += DIV_ROUND_UP(accum_fmt_size, sizeof(uint32_t));
}
}
has_depth_load = false;
for (uint32_t i = 0;
i < ARRAY_SIZE(load_op->clears_loads_state.dest_vk_format);
i++) {
switch (load_op->clears_loads_state.dest_vk_format[i]) {
case VK_FORMAT_D16_UNORM:
case VK_FORMAT_X8_D24_UNORM_PACK32:
case VK_FORMAT_D32_SFLOAT:
case VK_FORMAT_D24_UNORM_S8_UINT:
case VK_FORMAT_D32_SFLOAT_S8_UINT:
has_depth_load = true;
break;
default:
break;
}
if (has_depth_load)
break;
}
has_depth_clear = load_op->clears_loads_state.depth_clear_to_reg != -1;
assert(!(has_depth_clear && has_depth_load));
if (has_depth_load) {
const struct pvr_render_pass_attachment *attachment;
const struct pvr_image_view *image_view;
assert(load_op->subpass->depth_stencil_attachment !=
VK_ATTACHMENT_UNUSED);
assert(!load_op->is_hw_object);
attachment =
&pass->attachments[load_op->subpass->depth_stencil_attachment];
image_view = render_pass_info->attachments[attachment->index];
result = pvr_setup_texture_state_words(cmd_buffer->device,
&texture_states[texture_count],
image_view,
view_index);
if (result != VK_SUCCESS)
return result;
texture_count++;
} else if (has_depth_clear) {
const struct pvr_render_pass_attachment *attachment;
VkClearValue clear_value;
assert(load_op->subpass->depth_stencil_attachment !=
VK_ATTACHMENT_UNUSED);
attachment =
&pass->attachments[load_op->subpass->depth_stencil_attachment];
clear_value = render_pass_info->clear_values[attachment->index];
assert(next_clear_consts < ARRAY_SIZE(hw_clear_value));
hw_clear_value[next_clear_consts++] = fui(clear_value.depthStencil.depth);
}
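/* The shareds buffer is laid out as: the packed clear constants, then (when
* any attachments are loaded) the combined image/sampler descriptors aligned
* to 16 bytes, then one 64-bit address per tile buffer split into lo/hi
* dwords. Its total size must match the shareds_count baked into the load op.
*/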
buffer_size = next_clear_consts * sizeof(hw_clear_value[0]);
if (texture_count > 0)
buffer_size = ALIGN_POT(buffer_size, 4 * sizeof(uint32_t));
unsigned words = buffer_size;
buffer_size +=
texture_count * sizeof(struct pvr_combined_image_sampler_descriptor);
unsigned tile_buffer_offset = buffer_size;
buffer_size += load_op->num_tile_buffers * sizeof(uint64_t);
assert(!(buffer_size % sizeof(uint32_t)));
assert(buffer_size / sizeof(uint32_t) == load_op->shareds_count);
result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
cmd_buffer->device->heaps.general_heap,
buffer_size,
&clear_bo);
if (result != VK_SUCCESS)
return result;
buffer = (uint8_t *)pvr_bo_suballoc_get_map_addr(clear_bo);
memcpy(&buffer[0],
hw_clear_value,
next_clear_consts * sizeof(hw_clear_value[0]));
memcpy(&buffer[words],
texture_states,
texture_count * sizeof(struct pvr_combined_image_sampler_descriptor));
struct pvr_device *const device = cmd_buffer->device;
const struct pvr_device_tile_buffer_state *tile_buffer_state =
&device->tile_buffer_state;
uint32_t *tile_buffers = (uint32_t *)&buffer[tile_buffer_offset];
for (unsigned u = 0; u < load_op->num_tile_buffers; ++u) {
assert(u < tile_buffer_state->buffer_count);
uint64_t tile_buffer_addr =
tile_buffer_state->buffers[u]->vma->dev_addr.addr;
tile_buffers[2 * u] = tile_buffer_addr & 0xffffffff;
tile_buffers[2 * u + 1] = tile_buffer_addr >> 32;
}
*addr_out = clear_bo->dev_addr;
return VK_SUCCESS;
}
static VkResult pvr_load_op_pds_data_create_and_upload(
struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_load_op *load_op,
pvr_dev_addr_t constants_addr,
struct pvr_pds_upload *const pds_upload_out)
{
struct pvr_device *device = cmd_buffer->device;
const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
struct pvr_pds_pixel_shader_sa_program program = { 0 };
uint32_t staging_buffer_size;
uint32_t *staging_buffer;
VkResult result;
program.num_texture_dma_kicks = 1;
pvr_csb_pack (&program.texture_dma_address[0],
PDSINST_DOUT_FIELDS_DOUTD_SRC0,
value) {
value.sbase = constants_addr;
}
pvr_csb_pack (&program.texture_dma_control[0],
PDSINST_DOUT_FIELDS_DOUTD_SRC1,
value) {
value.dest = ROGUE_PDSINST_DOUTD_DEST_COMMON_STORE;
value.bsize = load_op->shareds_count;
}
pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);
staging_buffer_size = PVR_DW_TO_BYTES(program.data_size);
staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
staging_buffer,
dev_info);
result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
staging_buffer,
program.data_size,
1,
pds_upload_out);
if (result != VK_SUCCESS) {
vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
return result;
}
vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
return VK_SUCCESS;
}
/* FIXME: Should this function be specific to the HW background object, in
* which case its name should be changed, or should it have the load op
* structure passed in?
*/
static VkResult
pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_load_op *load_op,
uint32_t view_index,
struct pvr_pds_upload *const pds_upload_out)
{
pvr_dev_addr_t constants_addr;
VkResult result;
result = pvr_load_op_constants_create_and_upload(cmd_buffer,
load_op,
view_index,
&constants_addr);
if (result != VK_SUCCESS)
return result;
return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
load_op,
constants_addr,
pds_upload_out);
}
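/* Pack the PDS background-object control register values (PDS_BGRND0/1/3)
* from the load op's PDS programs and the freshly uploaded texture data
* segment.
*/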
static void pvr_pds_bgnd_pack_state(
const struct pvr_load_op *load_op,
const struct pvr_pds_upload *load_op_program,
uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
{
pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
value.texunicode_addr =
PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
}
pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
}
pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
value.usc_sharedsize =
DIV_ROUND_UP(load_op->const_shareds_count,
ROGUE_CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE);
value.pds_texturestatesize = DIV_ROUND_UP(
load_op_program->data_size,
ROGUE_CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE);
value.pds_tempsize =
DIV_ROUND_UP(load_op->temps_count,
ROGUE_CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE);
}
}
static inline VkResult pvr_load_op_state_data_create_and_upload_for_view(
struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_load_op *load_op,
uint32_t view_index,
uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
{
struct pvr_pds_upload load_op_program;
VkResult result;
/* FIXME: Should we free the PDS pixel event data or let it be freed
* when the pool gets emptied?
*/
result = pvr_load_op_data_create_and_upload(cmd_buffer,
load_op,
view_index,
&load_op_program);
if (result != VK_SUCCESS)
return result;
pvr_pds_bgnd_pack_state(load_op, &load_op_program, pds_reg_values);
return VK_SUCCESS;
}
static VkResult pvr_load_op_state_data_create_and_upload(
struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_load_op_state *load_op_state,
struct pvr_view_state *view_state)
{
for (uint32_t i = 0; i < load_op_state->load_op_count; i++) {
const struct pvr_load_op *load_op = &load_op_state->load_ops[i];
uint32_t view_index = load_op->view_indices[0];
uint64_t *pds_reg_values;
VkResult result;
pds_reg_values = view_state->view[view_index].pds_bgnd_reg_values;
result =
pvr_load_op_state_data_create_and_upload_for_view(cmd_buffer,
load_op,
view_index,
pds_reg_values);
if (result != VK_SUCCESS)
return result;
pds_reg_values = view_state->view[view_index].pr_pds_bgnd_reg_values;
result =
pvr_load_op_state_data_create_and_upload_for_view(cmd_buffer,
load_op,
view_index,
pds_reg_values);
if (result != VK_SUCCESS)
return result;
}
return VK_SUCCESS;
}
/**
* \brief Calculates the stride in pixels based on the pitch in bytes and pixel
* format.
*
* \param[in] pitch Width pitch in bytes.
* \param[in] vk_format Vulkan image format.
* \return Stride in pixels.
*/
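/* For example, a 256 byte pitch with a 4 bytes-per-pixel format such as
* VK_FORMAT_R8G8B8A8_UNORM gives a stride of 256 / 4 = 64 pixels.
*/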
static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
{
const unsigned int cpp = vk_format_get_blocksize(vk_format);
assert(pitch % cpp == 0);
return pitch / cpp;
}
static void pvr_setup_pbe_state(
const struct pvr_device_info *dev_info,
const struct pvr_render_state *rstate,
uint32_t mrt_index,
const struct usc_mrt_resource *mrt_resource,
const struct pvr_image_view *const iview,
const VkRect2D *render_area,
const bool down_scale,
const uint32_t samples,
uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS],
uint32_t view_index)
{
const struct pvr_image *image = pvr_image_view_get_image(iview);
uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;
struct pvr_pbe_surf_params surface_params;
struct pvr_pbe_render_params render_params;
bool with_packed_usc_channel;
const uint8_t *swizzle;
uint32_t position;
/* down_scale should be true when performing a resolve, in which case there
* should be more than one sample.
*/
assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));
/* Setup surface parameters. */
if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
with_packed_usc_channel = vk_format_is_unorm(iview->vk.format) ||
vk_format_is_snorm(iview->vk.format);
} else {
with_packed_usc_channel = false;
}
swizzle = pvr_get_format_swizzle(iview->vk.format);
memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));
pvr_pbe_get_src_format_and_gamma(iview->vk.format,
PVR_PBE_GAMMA_NONE,
with_packed_usc_channel,
&surface_params.source_format,
&surface_params.gamma);
surface_params.is_normalized =
pvr_vk_format_is_fully_normalized(iview->vk.format);
surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);
/* FIXME: Should we have an inline function to return the address of a mip
* level?
*/
surface_params.addr = PVR_DEV_ADDR_OFFSET(
image->vma->dev_addr,
image->layer_size * view_index +
image->mip_levels[iview->vk.base_mip_level].offset);
if (!iview->vk.storage.z_slice_offset) {
surface_params.addr =
PVR_DEV_ADDR_OFFSET(surface_params.addr,
iview->vk.base_array_layer * image->layer_size);
}
surface_params.mem_layout = image->memlayout;
surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
surface_params.depth = iview->vk.extent.depth;
surface_params.width = iview->vk.extent.width;
surface_params.height = iview->vk.extent.height;
surface_params.z_only_render = false;
surface_params.down_scale = down_scale;
/* Setup render parameters. */
if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
position = mrt_resource->mem.offset_dw;
} else {
assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG);
assert(mrt_resource->reg.offset == 0);
position = mrt_resource->reg.output_reg;
}
assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
switch (position) {
case 0:
case 4:
render_params.source_start = PVR_PBE_STARTPOS_BIT0;
break;
case 1:
case 5:
render_params.source_start = PVR_PBE_STARTPOS_BIT32;
break;
case 2:
case 6:
render_params.source_start = PVR_PBE_STARTPOS_BIT64;
break;
case 3:
case 7:
render_params.source_start = PVR_PBE_STARTPOS_BIT96;
break;
default:
assert(!"Invalid output register");
break;
}
#define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v) - 1 : 0)
render_params.min_x_clip = MAX2(0, render_area->offset.x);
render_params.min_y_clip = MAX2(0, render_area->offset.y);
render_params.max_x_clip = MIN2(
rstate->width - 1,
PVR_DEC_IF_NOT_ZERO(render_area->offset.x + render_area->extent.width));
render_params.max_y_clip = MIN2(
rstate->height - 1,
PVR_DEC_IF_NOT_ZERO(render_area->offset.y + render_area->extent.height));
#undef PVR_DEC_IF_NOT_ZERO
render_params.slice = iview->vk.storage.z_slice_offset;
render_params.mrt_index = mrt_index;
pvr_pbe_pack_state(dev_info,
&surface_params,
&render_params,
pbe_cs_words,
pbe_reg_words);
}
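/* Render targets are stored per sample count, indexed by log2 of the count
* (1x -> 0, 2x -> 1, 4x -> 2, 8x -> 3).
*/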
static struct pvr_render_target *
pvr_get_render_target(const struct pvr_renderpass_hwsetup_render *hw_render,
const struct pvr_render_state *rstate)
{
uint32_t rt_idx = 0;
switch (hw_render->sample_count) {
case 1:
case 2:
case 4:
case 8:
rt_idx = util_logbase2(hw_render->sample_count);
break;
default:
UNREACHABLE("Unsupported sample count");
break;
}
return &rstate->render_targets[rt_idx];
}
static uint32_t pvr_get_pixel_output_width(
const struct pvr_renderpass_hwsetup_render *hw_render,
const struct pvr_device_info *dev_info)
{
/* Default value based on the maximum value found in all existing cores. The
* maximum is used as this is being treated as a lower bound, making it a
* "safer" choice than the minimum value found in all existing cores.
*/
const uint32_t min_output_regs =
PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
return util_next_power_of_two(width);
}
static inline bool
pvr_ds_attachment_requires_zls(const struct pvr_ds_attachment *attachment)
{
bool zls_used;
zls_used = attachment->load.d || attachment->load.s;
zls_used |= attachment->store.d || attachment->store.s;
return zls_used;
}
/**
* \brief If depth and/or stencil attachment dimensions are not tile-aligned,
* then we may need to insert some additional transfer subcommands.
*
* It's worth noting that we check whether the dimensions are smaller than a
* tile here, rather than checking whether they're tile-aligned - this relies
* on the assumption that we can safely use any attachment with dimensions
* larger than a tile. If the attachment is twiddled, it will be over-allocated
* to the nearest power-of-two (which will be tile-aligned). If the attachment
* is not twiddled, we don't need to worry about tile-alignment at all.
*/
static bool pvr_sub_cmd_gfx_requires_ds_subtile_alignment(
const struct pvr_device_info *dev_info,
const struct pvr_render_job *job)
{
const struct pvr_image *const ds_image =
pvr_image_view_get_image(job->ds.iview);
uint32_t zls_tile_size_x;
uint32_t zls_tile_size_y;
rogue_get_zls_tile_size_xy(dev_info, &zls_tile_size_x, &zls_tile_size_y);
if (ds_image->physical_extent.width >= zls_tile_size_x &&
ds_image->physical_extent.height >= zls_tile_size_y) {
return false;
}
/* If we have the zls_subtile feature, we can skip the alignment iff:
* - The attachment is not multisampled, and
* - The depth and stencil attachments are the same.
*/
if (PVR_HAS_FEATURE(dev_info, zls_subtile) &&
ds_image->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
job->has_stencil_attachment == job->has_depth_attachment) {
return false;
}
/* No ZLS functions enabled; nothing to do. */
if ((!job->has_depth_attachment && !job->has_stencil_attachment) ||
!pvr_ds_attachment_requires_zls(&job->ds)) {
return false;
}
return true;
}
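/* Handle depth/stencil surfaces smaller than a ZLS tile by routing ZLS
* through a tile-aligned scratch buffer: if the attachment is loaded, a
* transfer sub-command copying image -> buffer is inserted before the gfx
* sub-command; if it is stored, a buffer -> image copy is appended after it.
* The gfx job is then patched to use the scratch buffer's address and the
* rounded-up extent.
*/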
static VkResult
pvr_sub_cmd_gfx_align_ds_subtiles(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
{
struct pvr_sub_cmd *const prev_sub_cmd =
container_of(gfx_sub_cmd, struct pvr_sub_cmd, gfx);
struct pvr_ds_attachment *const ds = &gfx_sub_cmd->job.ds;
const struct pvr_image *const ds_image = pvr_image_view_get_image(ds->iview);
const VkFormat copy_format = pvr_get_raw_copy_format(ds_image->vk.format);
struct pvr_suballoc_bo *buffer;
uint32_t buffer_layer_size;
VkBufferImageCopy2 region;
VkExtent2D zls_tile_size;
VkExtent2D rounded_size;
uint32_t buffer_size;
VkExtent2D scale;
VkResult result;
/* The operations below assume the last command in the buffer was the target
* gfx subcommand. Assert that this is the case.
*/
assert(list_last_entry(&cmd_buffer->sub_cmds, struct pvr_sub_cmd, link) ==
prev_sub_cmd);
assert(prev_sub_cmd == cmd_buffer->state.current_sub_cmd);
if (!pvr_ds_attachment_requires_zls(ds))
return VK_SUCCESS;
rogue_get_zls_tile_size_xy(&cmd_buffer->device->pdevice->dev_info,
&zls_tile_size.width,
&zls_tile_size.height);
rogue_get_isp_scale_xy_from_samples(ds_image->vk.samples,
&scale.width,
&scale.height);
rounded_size = (VkExtent2D){
.width = ALIGN_POT(ds_image->physical_extent.width, zls_tile_size.width),
.height =
ALIGN_POT(ds_image->physical_extent.height, zls_tile_size.height),
};
buffer_layer_size = vk_format_get_blocksize(ds_image->vk.format) *
rounded_size.width * rounded_size.height * scale.width *
scale.height;
if (ds->iview->vk.layer_count > 1)
buffer_layer_size = ALIGN_POT(buffer_layer_size, ds_image->alignment);
buffer_size = buffer_layer_size * ds->iview->vk.layer_count;
result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
cmd_buffer->device->heaps.general_heap,
buffer_size,
&buffer);
if (result != VK_SUCCESS)
return result;
region = (VkBufferImageCopy2){
.sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
.pNext = NULL,
.bufferOffset = 0,
.bufferRowLength = rounded_size.width,
.bufferImageHeight = 0,
.imageSubresource = {
.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
.mipLevel = ds->iview->vk.base_mip_level,
.baseArrayLayer = ds->iview->vk.base_array_layer,
.layerCount = ds->iview->vk.layer_count,
},
.imageOffset = { 0 },
.imageExtent = {
.width = ds->iview->vk.extent.width,
.height = ds->iview->vk.extent.height,
.depth = 1,
},
};
if (ds->load.d || ds->load.s) {
struct pvr_sub_cmd *new_sub_cmd;
cmd_buffer->state.current_sub_cmd = NULL;
result =
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
if (result != VK_SUCCESS)
return result;
new_sub_cmd = cmd_buffer->state.current_sub_cmd;
result = pvr_copy_image_to_buffer_region_format(cmd_buffer,
ds_image,
buffer->dev_addr,
&region,
copy_format,
copy_format);
if (result != VK_SUCCESS)
return result;
new_sub_cmd->transfer.serialize_with_frag = true;
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
return result;
/* Now we have to fiddle with cmd_buffer to place this transfer command
* *before* the target gfx subcommand.
*
* Note the doc for list_move_to() is subtly wrong - item is placed
* directly *after* loc in the list, not "in front of".
*/
list_move_to(&new_sub_cmd->link, prev_sub_cmd->link.prev);
cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
}
if (ds->store.d || ds->store.s) {
cmd_buffer->state.current_sub_cmd = NULL;
result =
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
if (result != VK_SUCCESS)
return result;
result = pvr_copy_buffer_to_image_region_format(cmd_buffer,
buffer->dev_addr,
ds_image,
&region,
copy_format,
copy_format,
0);
if (result != VK_SUCCESS)
return result;
cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
return result;
cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
}
/* Finally, patch up the target graphics sub_cmd to use the correctly-strided
* buffer.
*/
ds->has_alignment_transfers = true;
ds->addr = buffer->dev_addr;
ds->physical_extent = rounded_size;
gfx_sub_cmd->wait_on_previous_transfer = true;
return VK_SUCCESS;
}
struct pvr_emit_state {
uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
[ROGUE_NUM_PBESTATE_STATE_WORDS];
uint64_t pbe_reg_words[PVR_MAX_COLOR_ATTACHMENTS]
[ROGUE_NUM_PBESTATE_REG_WORDS];
unsigned tile_buffer_ids[PVR_MAX_COLOR_ATTACHMENTS];
uint32_t emit_count;
};
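/* Gather the PBE state for every end-of-tile surface of this hw_render.
* Emits targeting output registers are packed before emits that go to memory
* (tile buffers), which the resource-type loop below relies on, and an
* empty-tile emit is generated when the render has no EOT surfaces at all.
*/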
static void
pvr_setup_emit_state(const struct pvr_device_info *dev_info,
const struct pvr_renderpass_hwsetup_render *hw_render,
struct pvr_render_pass_info *render_pass_info,
uint32_t view_index,
struct pvr_emit_state *emit_state)
{
assert(hw_render->pbe_emits <= PVR_NUM_PBE_EMIT_REGS);
if (hw_render->eot_surface_count == 0) {
emit_state->emit_count = 1;
pvr_csb_pack (&emit_state->pbe_cs_words[0][1],
PBESTATE_STATE_WORD1,
state) {
state.emptytile = true;
}
return;
}
static_assert(USC_MRT_RESOURCE_TYPE_OUTPUT_REG + 1 ==
USC_MRT_RESOURCE_TYPE_MEMORY,
"The loop below needs adjusting.");
emit_state->emit_count = 0;
for (uint32_t resource_type = USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
resource_type <= USC_MRT_RESOURCE_TYPE_MEMORY;
resource_type++) {
for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
const struct pvr_render_state *rstate = render_pass_info->rstate;
const struct pvr_renderpass_hwsetup_eot_surface *surface =
&hw_render->eot_surfaces[i];
const struct pvr_image_view *iview =
render_pass_info->attachments[surface->attachment_idx];
const struct usc_mrt_resource *mrt_resource =
&hw_render->eot_setup.mrt_resources[surface->mrt_idx];
uint32_t samples = 1;
if (mrt_resource->type != resource_type)
continue;
if (surface->need_resolve) {
const struct pvr_image_view *resolve_src =
render_pass_info->attachments[surface->src_attachment_idx];
/* Attachments that are the destination of resolve operations must
* be loaded before their next use.
*/
render_pass_info->enable_bg_tag = true;
render_pass_info->process_empty_tiles = true;
if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE)
continue;
samples = (uint32_t)resolve_src->vk.image->samples;
}
assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_cs_words));
assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_reg_words));
emit_state->tile_buffer_ids[emit_state->emit_count] =
mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY
? mrt_resource->mem.tile_buffer
: ~0;
pvr_setup_pbe_state(dev_info,
rstate,
emit_state->emit_count,
mrt_resource,
iview,
&render_pass_info->render_area,
surface->need_resolve,
samples,
emit_state->pbe_cs_words[emit_state->emit_count],
emit_state->pbe_reg_words[emit_state->emit_count],
view_index);
emit_state->emit_count += 1;
}
}
assert(emit_state->emit_count == hw_render->pbe_emits);
}
static inline bool
pvr_is_render_area_tile_aligned(const struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_image_view *iview)
{
const VkRect2D *render_area =
&cmd_buffer->state.render_pass_info.render_area;
return render_area->offset.x == 0 && render_area->offset.y == 0 &&
render_area->extent.height == iview->vk.extent.height &&
render_area->extent.width == iview->vk.extent.width;
}
static VkResult pvr_render_targets_dataset_init(
struct pvr_device *device,
struct pvr_render_state *rstate,
const struct pvr_renderpass_hwsetup_render *hw_render);
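/* Translate the accumulated render pass state into a pvr_render_job: per-view
* PBE/pixel-event programs (or the SPM versions for barrier stores/loads),
* background object programs, depth/stencil surface setup, sample count
* selection and the tiles-in-flight limit.
*/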
static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_gfx *sub_cmd)
{
static const VkClearDepthStencilValue default_ds_clear_value = {
.depth = 1.0f,
.stencil = 0xFFFFFFFF,
};
const struct vk_dynamic_graphics_state *dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
struct pvr_render_pass_info *render_pass_info =
&cmd_buffer->state.render_pass_info;
const struct pvr_renderpass_hwsetup_render *hw_render =
pvr_pass_info_get_hw_render(render_pass_info, sub_cmd->hw_render_idx);
struct pvr_render_job *job = &sub_cmd->job;
struct pvr_render_state *rstate = render_pass_info->rstate;
struct pvr_spm_bgobj_state *spm_bgobj_state =
&rstate->spm_bgobj_state_per_render[sub_cmd->hw_render_idx];
struct pvr_spm_eot_state *spm_eot_state =
&rstate->spm_eot_state_per_render[sub_cmd->hw_render_idx];
struct pvr_render_target *render_target;
VkResult result;
/* Except for barrier_{store,load}, where the index defaults to zero, the
* view index associated with a gfx job is known and set only at submission
* time.
*/
job->view_state.view_index = 0;
if (sub_cmd->barrier_store) {
/* Store to the SPM scratch buffer. */
/* The scratch buffer is always needed and allocated to avoid data loss in
* case SPM is hit, so set the flag unconditionally.
*/
job->requires_spm_scratch_buffer = true;
/* There can only ever be one frag job running on the hardware at any one
* time, and a context switch is not allowed mid-tile, so instead of
* allocating a new scratch buffer we can reuse the SPM scratch buffer to
* perform the store.
* So use the SPM EOT program with the SPM PBE reg words in order to store
* the render to the SPM scratch buffer.
*/
memcpy(job->pbe_reg_words,
&spm_eot_state->pbe_reg_words,
sizeof(job->pbe_reg_words));
/* Configure the job view state for a barrier store */
assert(!job->view_state.view_index);
job->view_state.view[0].pds_pixel_event_data_offset =
spm_eot_state->pixel_event_program_data_offset;
job->view_state.force_pds_pixel_event_data_offset_zero = true;
} else {
struct pvr_pds_upload pds_pixel_event_program;
struct pvr_emit_state emit_state = { 0 };
memset(emit_state.tile_buffer_ids,
~0,
sizeof(emit_state.tile_buffer_ids));
u_foreach_bit (view_idx, hw_render->view_mask) {
pvr_setup_emit_state(dev_info,
hw_render,
render_pass_info,
view_idx,
&emit_state);
unsigned pixel_output_width =
pvr_get_pixel_output_width(hw_render, dev_info);
result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
cmd_buffer,
emit_state.emit_count,
emit_state.pbe_cs_words[0],
emit_state.tile_buffer_ids,
pixel_output_width,
&pds_pixel_event_program);
if (result != VK_SUCCESS)
return result;
/* Configure the job view state */
job->view_state.view[view_idx].pds_pixel_event_data_offset =
pds_pixel_event_program.data_offset;
}
job->z_only_render = !hw_render->eot_surface_count &&
!sub_cmd->frag_has_side_effects &&
!sub_cmd->has_depth_feedback;
memcpy(job->pbe_reg_words,
emit_state.pbe_reg_words,
sizeof(job->pbe_reg_words));
}
if (sub_cmd->barrier_load) {
job->enable_bg_tag = true;
job->process_empty_tiles = true;
/* Load the previously stored render from the SPM scratch buffer. */
STATIC_ASSERT(ARRAY_SIZE(job->view_state.view[0].pds_bgnd_reg_values) ==
ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
typed_memcpy(job->view_state.view[0].pds_bgnd_reg_values,
spm_bgobj_state->pds_reg_values,
ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
STATIC_ASSERT(
ARRAY_SIZE(job->view_state.view[0].pr_pds_bgnd_reg_values) ==
ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
typed_memcpy(job->view_state.view[0].pr_pds_bgnd_reg_values,
spm_bgobj_state->pds_reg_values,
ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
/* Configure the job view state for a barrier load */
assert(!job->view_state.view_index);
job->view_state.force_pds_bgnd_reg_values_zero = true;
} else if (hw_render->load_op_state) {
const struct pvr_load_op_state *load_op_state = hw_render->load_op_state;
/* We always have at least 1 bit set in the view_mask */
assert(load_op_state->load_op_count);
/* Recalculate Background Object(s). */
result = pvr_load_op_state_data_create_and_upload(cmd_buffer,
load_op_state,
&job->view_state);
if (result != VK_SUCCESS)
return result;
job->enable_bg_tag = render_pass_info->enable_bg_tag;
job->process_empty_tiles = render_pass_info->process_empty_tiles;
}
if (!hw_render->requires_frag_pr) {
memcpy(job->pr_pbe_reg_words,
job->pbe_reg_words,
sizeof(job->pbe_reg_words));
job->view_state.use_pds_pixel_event_data_offset = true;
} else {
memcpy(job->pr_pbe_reg_words,
&spm_eot_state->pbe_reg_words,
sizeof(job->pbe_reg_words));
job->pr_pds_pixel_event_data_offset =
spm_eot_state->pixel_event_program_data_offset;
}
render_target = pvr_get_render_target(hw_render, rstate);
job->view_state.rt_datasets = &render_target->rt_dataset[0];
if (cmd_buffer->state.current_sub_cmd->is_dynamic_render) {
result =
pvr_render_targets_dataset_init(cmd_buffer->device, rstate, hw_render);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
return result;
}
}
job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
if (sub_cmd->depth_bias_bo)
job->depth_bias_table_addr = sub_cmd->depth_bias_bo->dev_addr;
else
job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;
if (sub_cmd->scissor_bo)
job->scissor_table_addr = sub_cmd->scissor_bo->dev_addr;
else
job->scissor_table_addr = PVR_DEV_ADDR_INVALID;
job->pixel_output_width = pvr_get_pixel_output_width(hw_render, dev_info);
/* Setup depth/stencil job information. */
if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
struct pvr_image_view *ds_iview =
render_pass_info->attachments[hw_render->ds_attach_idx];
const struct pvr_image *ds_image = pvr_image_view_get_image(ds_iview);
job->has_depth_attachment = vk_format_has_depth(ds_image->vk.format);
job->has_stencil_attachment = vk_format_has_stencil(ds_image->vk.format);
if (job->has_depth_attachment || job->has_stencil_attachment) {
uint32_t level_pitch =
ds_image->mip_levels[ds_iview->vk.base_mip_level].pitch;
const bool render_area_is_tile_aligned =
pvr_is_render_area_tile_aligned(cmd_buffer, ds_iview);
bool store_was_optimised_out = false;
bool d_store = false, s_store = false;
bool d_load = false, s_load = false;
job->ds.iview = ds_iview;
job->ds.addr = ds_image->dev_addr;
job->ds.stride =
pvr_stride_from_pitch(level_pitch, ds_iview->vk.format);
job->ds.height = ds_iview->vk.extent.height;
job->ds.physical_extent = (VkExtent2D){
.width = u_minify(ds_image->physical_extent.width,
ds_iview->vk.base_mip_level),
.height = u_minify(ds_image->physical_extent.height,
ds_iview->vk.base_mip_level),
};
job->ds.layer_size = ds_image->layer_size;
job->ds_clear_value = default_ds_clear_value;
if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) {
const VkClearDepthStencilValue *const clear_values =
&render_pass_info->clear_values[hw_render->ds_attach_idx]
.depthStencil;
if (job->has_depth_attachment)
job->ds_clear_value.depth = clear_values->depth;
if (job->has_stencil_attachment)
job->ds_clear_value.stencil = clear_values->stencil;
}
switch (ds_iview->vk.format) {
case VK_FORMAT_D16_UNORM:
job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_16BITINT;
break;
case VK_FORMAT_S8_UINT:
case VK_FORMAT_D32_SFLOAT:
job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_F32Z;
break;
case VK_FORMAT_D24_UNORM_S8_UINT:
case VK_FORMAT_X8_D24_UNORM_PACK32:
job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_24BITINT;
break;
case VK_FORMAT_D32_SFLOAT_S8_UINT:
job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_F64Z;
break;
default:
UNREACHABLE("Unsupported depth stencil format");
}
job->ds.memlayout = ds_image->memlayout;
if (job->has_depth_attachment) {
if (hw_render->depth_store || sub_cmd->barrier_store) {
const bool depth_init_is_clear = hw_render->depth_init ==
VK_ATTACHMENT_LOAD_OP_CLEAR;
d_store = true;
if (hw_render->depth_store && render_area_is_tile_aligned &&
!(sub_cmd->modifies_depth || depth_init_is_clear)) {
d_store = false;
store_was_optimised_out = true;
}
}
if (d_store && !render_area_is_tile_aligned) {
d_load = true;
} else if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
enum pvr_depth_stencil_usage depth_usage = sub_cmd->depth_usage;
d_load = (depth_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
} else {
d_load = sub_cmd->barrier_load;
}
}
if (job->has_stencil_attachment) {
if (hw_render->stencil_store || sub_cmd->barrier_store) {
const bool stencil_init_is_clear = hw_render->stencil_init ==
VK_ATTACHMENT_LOAD_OP_CLEAR;
s_store = true;
if (hw_render->stencil_store && render_area_is_tile_aligned &&
!(sub_cmd->modifies_stencil || stencil_init_is_clear)) {
s_store = false;
store_was_optimised_out = true;
}
}
if (s_store && !render_area_is_tile_aligned) {
s_load = true;
} else if (hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
enum pvr_depth_stencil_usage stencil_usage =
sub_cmd->stencil_usage;
s_load = (stencil_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
} else {
s_load = sub_cmd->barrier_load;
}
}
job->ds.load.d = d_load;
job->ds.load.s = s_load;
job->ds.store.d = d_store;
job->ds.store.s = s_store;
/* ZLS can't do masked writes for packed depth stencil formats so if
* we store anything, we have to store everything.
*/
if ((job->ds.store.d || job->ds.store.s) &&
pvr_zls_format_type_is_packed(job->ds.zls_format)) {
job->ds.store.d = true;
job->ds.store.s = true;
/* In case we are only operating on one aspect of the attachment we
* need to load the unused one in order to preserve its contents due
* to the forced store which might otherwise corrupt it.
*/
if (hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
job->ds.load.d = true;
if (hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
job->ds.load.s = true;
}
if (pvr_ds_attachment_requires_zls(&job->ds) ||
store_was_optimised_out) {
job->process_empty_tiles = true;
}
if (pvr_sub_cmd_gfx_requires_ds_subtile_alignment(dev_info, job)) {
result = pvr_sub_cmd_gfx_align_ds_subtiles(cmd_buffer, sub_cmd);
if (result != VK_SUCCESS)
return result;
}
}
} else {
job->has_depth_attachment = false;
job->has_stencil_attachment = false;
job->ds_clear_value = default_ds_clear_value;
}
if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
struct pvr_image_view *iview =
render_pass_info->attachments[hw_render->ds_attach_idx];
const struct pvr_image *image = pvr_image_view_get_image(iview);
/* If the HW render pass has a valid depth/stencil surface, determine the
* sample count from the attachment's image.
*/
job->samples = image->vk.samples;
} else if (hw_render->output_regs_count) {
/* If the HW render pass has output registers, we have color attachments
* to write to, so determine the sample count from the count specified for
* every color attachment in this render.
*/
job->samples = hw_render->sample_count;
} else if (cmd_buffer->state.gfx_pipeline) {
/* If the HW render pass has no color or depth/stencil attachments, we
* determine the sample count from the count given during pipeline
* creation.
*/
job->samples = dynamic_state->ms.rasterization_samples;
} else if (render_pass_info->pass->attachment_count > 0) {
/* If we get here, we have a render pass with subpasses containing no
* attachments. The next best thing is largest of the sample counts
* specified by the render pass attachment descriptions.
*/
job->samples = render_pass_info->pass->max_sample_count;
} else {
/* No appropriate framebuffer attachment is available. */
mesa_logw("Defaulting render job sample count to 1.");
job->samples = VK_SAMPLE_COUNT_1_BIT;
}
if (sub_cmd->max_tiles_in_flight ==
PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
/* Use the default limit based on the partition store. */
job->max_tiles_in_flight = 0U;
} else {
job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
}
job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
job->disable_compute_overlap = false;
job->max_shared_registers = cmd_buffer->state.max_shared_regs;
if (!cmd_buffer->state.current_sub_cmd->is_suspend) {
assert(cmd_buffer->state.current_sub_cmd->type ==
PVR_SUB_CMD_TYPE_GRAPHICS);
job->geometry_terminate = true;
job->run_frag = true;
}
/* TODO: Enable pixel merging when it's safe to do. */
job->disable_pixel_merging = true;
return VK_SUCCESS;
}
static void
pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *sub_cmd)
{
sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
cmd_buffer->state.max_shared_regs);
cmd_buffer->state.max_shared_regs = 0U;
}
#define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
(1024 / ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)
static uint32_t
pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
uint32_t coeff_regs_count,
bool use_barrier,
uint32_t total_workitems)
{
const struct pvr_device_runtime_info *dev_runtime_info =
&pdevice->dev_runtime_info;
const struct pvr_device_info *dev_info = &pdevice->dev_info;
uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
uint32_t max_avail_coeff_regs =
dev_runtime_info->cdm_max_local_mem_size_regs;
uint32_t localstore_chunks_count =
DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs_count),
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
/* Ensure that we cannot have more workgroups in a slot than the available
 * number of coefficients allows us to have.
 */
if (coeff_regs_count > 0U) {
/* If the geometry or fragment jobs can overlap with the compute job, or
 * if there is a vertex shader already running, then we need to take this
 * into account when calculating the maximum number of allowed work-groups.
 */
if (PVR_HAS_QUIRK(dev_info, 52354) &&
(PVR_HAS_FEATURE(dev_info, compute_overlap) ||
PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
/* Solve for n (number of work-groups per task). All values are in
* size of common store alloc blocks:
*
* n + (2n + 7) * (local_memory_size_max - 1) =
* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
* ==>
* n + 2n * (local_memory_size_max - 1) =
* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
* - (7 * (local_memory_size_max - 1))
* ==>
* n * (1 + 2 * (local_memory_size_max - 1)) =
* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
* - (7 * (local_memory_size_max - 1))
* ==>
 * n = ((coefficient_memory_pool_size) -
 *      (7 * pixel_allocation_size_max) -
 *      (7 * (local_memory_size_max - 1))) /
 *     (1 + 2 * (local_memory_size_max - 1))
*/
uint32_t max_common_store_blocks =
DIV_ROUND_UP(max_avail_coeff_regs * 4U,
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
/* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
*/
max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;
/* - (7 * (local_memory_size_max - 1)) */
max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
(localstore_chunks_count - 1U));
/* Divide by (1 + 2 * (local_memory_size_max - 1)) */
max_workgroups_per_task = max_common_store_blocks /
(1U + 2U * (localstore_chunks_count - 1U));
max_workgroups_per_task =
MIN2(max_workgroups_per_task,
ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);
} else {
max_workgroups_per_task =
MIN2((max_avail_coeff_regs / coeff_regs_count),
max_workgroups_per_task);
}
}
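/* Worked example of the quirk path above, with made-up block counts (these
 * are not real HW values): if max_common_store_blocks = 512,
 * PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS = 16 and localstore_chunks_count = 4,
 * then 512 - 7 * 16 - 7 * 3 = 379 blocks remain, and
 * 379 / (1 + 2 * 3) = 54 work-groups per task before the clamp to
 * ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK.
 */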
/* max_workgroups_per_task should at least be one. */
assert(max_workgroups_per_task >= 1U);
if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
/* In this case, the work group size will have been padded up to the
* next ROGUE_MAX_INSTANCES_PER_TASK so we just set max instances to be
* ROGUE_MAX_INSTANCES_PER_TASK.
*/
return ROGUE_MAX_INSTANCES_PER_TASK;
}
/* In this case, the number of instances in the slot must be clamped to
* accommodate whole work-groups only.
*/
if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
max_workgroups_per_task =
MIN2(max_workgroups_per_task,
ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
return total_workitems * max_workgroups_per_task;
}
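/* Illustrative numbers only: if ROGUE_MAX_INSTANCES_PER_TASK were 32 and the
 * work-group had 12 work-items with a barrier, max_workgroups_per_task would
 * be clamped to 32 / 12 = 2 and the slot size would be 24 instances, leaving
 * 8 instances unused rather than splitting a work-group across slots.
 */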
return MIN2(total_workitems * max_workgroups_per_task,
ROGUE_MAX_INSTANCES_PER_TASK);
}
static void
pvr_compute_generate_control_stream(struct pvr_csb *csb,
struct pvr_sub_cmd_compute *sub_cmd,
const struct pvr_compute_kernel_info *info)
{
pvr_csb_set_relocation_mark(csb);
/* Compute kernel 0. */
pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
kernel0.global_offsets_present = info->global_offsets_present;
kernel0.usc_common_size = info->usc_common_size;
kernel0.usc_unified_size = info->usc_unified_size;
kernel0.pds_temp_size = info->pds_temp_size;
kernel0.pds_data_size = info->pds_data_size;
kernel0.usc_target = info->usc_target;
kernel0.fence = info->is_fence;
}
/* Compute kernel 1. */
pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
kernel1.sd_type = info->sd_type;
kernel1.usc_common_shared = info->usc_common_shared;
}
/* Compute kernel 2. */
pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
}
if (info->indirect_buffer_addr.addr) {
/* Compute kernel 6. */
pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
kernel6.indirect_addrmsb = info->indirect_buffer_addr;
}
/* Compute kernel 7. */
pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
kernel7.indirect_addrlsb = info->indirect_buffer_addr;
}
} else {
/* Compute kernel 3. */
pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
assert(info->global_size[0U] > 0U);
kernel3.workgroup_x = info->global_size[0U] - 1U;
}
/* Compute kernel 4. */
pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
assert(info->global_size[1U] > 0U);
kernel4.workgroup_y = info->global_size[1U] - 1U;
}
/* Compute kernel 5. */
pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
assert(info->global_size[2U] > 0U);
kernel5.workgroup_z = info->global_size[2U] - 1U;
}
}
/* Compute kernel 8. */
pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
kernel8.max_instances = 0U;
else
kernel8.max_instances = info->max_instances;
assert(info->local_size[0U] > 0U);
kernel8.workgroup_size_x = info->local_size[0U] - 1U;
assert(info->local_size[1U] > 0U);
kernel8.workgroup_size_y = info->local_size[1U] - 1U;
assert(info->local_size[2U] > 0U);
kernel8.workgroup_size_z = info->local_size[2U] - 1U;
}
pvr_csb_clear_relocation_mark(csb);
/* Track the highest shared register usage in this dispatch. The FW uses
 * this for context switching, so it must be large enough to cover all the
 * shared registers that might be in use for this compute job. Coefficients
 * don't need to be included, as a context switch never happens within the
 * execution of a single workgroup, so nothing there needs to be preserved.
 */
if (info->usc_common_shared) {
sub_cmd->num_shared_regs =
MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
}
}
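/* For reference, the control stream emitted above is laid out as:
 * KERNEL0..KERNEL2 (sizes, flags and PDS data/code addresses), then either
 * KERNEL3..KERNEL5 carrying the explicit workgroup counts for a direct
 * dispatch or KERNEL6/KERNEL7 carrying the indirect buffer address, and
 * finally KERNEL8 with the max instance count and workgroup dimensions.
 */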
/* TODO: This can be pre-packed and uploaded directly. Would that provide any
* speed up?
*/
static void
pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
bool *const is_sw_barrier_required =
&state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
struct pvr_csb *csb = &sub_cmd->control_stream;
const struct pvr_pds_upload *program;
if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
*is_sw_barrier_required) {
*is_sw_barrier_required = false;
program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
} else {
program = &cmd_buffer->device->idfwdf_state.pds;
}
struct pvr_compute_kernel_info info = {
.indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
.global_offsets_present = false,
.usc_common_size = DIV_ROUND_UP(
PVR_DW_TO_BYTES(cmd_buffer->device->idfwdf_state.usc_shareds),
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
.usc_unified_size = 0U,
.pds_temp_size = 0U,
.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
.usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
.is_fence = false,
.pds_data_offset = program->data_offset,
.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC,
.usc_common_shared = true,
.pds_code_offset = program->code_offset,
.global_size = { 1U, 1U, 1U },
.local_size = { 1U, 1U, 1U },
};
/* We don't need to pad work-group size for this case. */
info.max_instances =
pvr_compute_flat_slot_size(pdevice,
cmd_buffer->device->idfwdf_state.usc_shareds,
false,
1U);
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
/* TODO: This can be pre-packed and uploaded directly. Would that provide any
* speed up?
*/
void PVR_PER_ARCH(compute_generate_fence)(
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd,
bool deallocate_shareds)
{
const struct pvr_pds_upload *program =
&cmd_buffer->device->pds_compute_fence_program;
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
struct pvr_csb *csb = &sub_cmd->control_stream;
struct pvr_compute_kernel_info info = {
.indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
.global_offsets_present = false,
.usc_common_size = 0U,
.usc_unified_size = 0U,
.pds_temp_size = 0U,
.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
.usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
.is_fence = true,
.pds_data_offset = program->data_offset,
.sd_type = ROGUE_CDMCTRL_SD_TYPE_PDS,
.usc_common_shared = deallocate_shareds,
.pds_code_offset = program->code_offset,
.global_size = { 1U, 1U, 1U },
.local_size = { 1U, 1U, 1U },
};
/* We don't need to pad work-group size for this case. */
/* Here we calculate the slot size. This can depend on the use of barriers,
 * local memory, BRNs or other factors.
 */
info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
static VkResult
pvr_cmd_buffer_process_deferred_clears(struct pvr_cmd_buffer *cmd_buffer)
{
list_for_each_entry_safe (struct pvr_transfer_cmd,
transfer_cmd,
&cmd_buffer->deferred_clears,
link) {
VkResult result;
list_del(&transfer_cmd->link);
result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd);
if (result != VK_SUCCESS)
return result;
cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
}
return VK_SUCCESS;
}
static VkResult
pvr_csb_gfx_build_view_index_ctrl_stream(struct pvr_device *const device,
pvr_dev_addr_t addr,
struct pvr_bo **bo,
uint32_t *stride)
{
struct list_head bo_list;
struct pvr_csb csb;
VkResult result;
pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
for (uint32_t i = 0; i < PVR_MAX_MULTIVIEW; ++i) {
struct pvr_pds_view_index_init_program *program =
&device->view_index_init_info[i];
pvr_csb_set_relocation_mark(&csb);
pvr_csb_emit (&csb, VDMCTRL_PDS_STATE0, state_update0) {
state_update0.block_type = ROGUE_VDMCTRL_BLOCK_TYPE_PDS_STATE_UPDATE;
state_update0.dm_target = ROGUE_VDMCTRL_DM_TARGET_VDM;
state_update0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ALL;
state_update0.usc_common_size = 0;
state_update0.usc_unified_size = 0;
state_update0.pds_temp_size = program->temps_used;
state_update0.pds_data_size = DIV_ROUND_UP(
PVR_DW_TO_BYTES(device->view_index_init_programs[i].data_size),
ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
}
pvr_csb_emit (&csb, VDMCTRL_PDS_STATE1, state_update1) {
state_update1.pds_data_addr =
device->view_index_init_programs[i].pvr_bo->dev_addr;
state_update1.sd_type = ROGUE_VDMCTRL_SD_TYPE_PDS;
state_update1.sd_next_type = ROGUE_VDMCTRL_SD_TYPE_USC;
}
pvr_csb_emit (&csb, VDMCTRL_PDS_STATE2, state_update2) {
state_update2.pds_code_addr.addr =
device->view_index_init_programs[i].pvr_bo->dev_addr.addr +
PVR_DW_TO_BYTES(device->view_index_init_programs[i].data_size);
}
pvr_csb_clear_relocation_mark(&csb);
pvr_csb_emit_link(&csb, addr, false);
}
result = pvr_csb_bake(&csb, &bo_list);
if (result != VK_SUCCESS)
goto err_csb_finish;
assert(list_is_singular(&bo_list));
*bo = list_first_entry(&bo_list, struct pvr_bo, link);
/* This needs to be kept in sync with the instructions emitted above. */
*stride = pvr_cmd_length(VDMCTRL_PDS_STATE0) +
pvr_cmd_length(VDMCTRL_PDS_STATE1) +
pvr_cmd_length(VDMCTRL_PDS_STATE2) +
pvr_cmd_length(VDMCTRL_STREAM_LINK0) +
pvr_cmd_length(VDMCTRL_STREAM_LINK1);
return VK_SUCCESS;
err_csb_finish:
pvr_csb_finish(&csb);
return result;
}
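/* Each of the PVR_MAX_MULTIVIEW records baked above consists of a PDS state
 * update pointing at that view's view_index_init program, followed by a
 * stream link back to `addr` (the sub-command's main control stream). The
 * returned stride is the size of one such record, so a consumer can
 * presumably jump to record view_idx * stride, initialize the view index and
 * then continue with the shared control stream.
 */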
VkResult PVR_PER_ARCH(cmd_buffer_end_sub_cmd)(struct pvr_cmd_buffer *cmd_buffer)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
struct pvr_device *device = cmd_buffer->device;
const struct pvr_query_pool *query_pool = NULL;
struct pvr_suballoc_bo *query_bo = NULL;
size_t query_indices_size = 0;
bool suspend, dynamic_render;
VkResult result;
/* FIXME: Is this NULL check required because this function is called from
* pvr_resolve_unemitted_resolve_attachments()? See comment about this
* function being called twice in a row in pvr_CmdEndRenderPass().
*/
if (!sub_cmd)
return VK_SUCCESS;
if (!sub_cmd->owned) {
state->current_sub_cmd = NULL;
return VK_SUCCESS;
}
switch (sub_cmd->type) {
case PVR_SUB_CMD_TYPE_GRAPHICS: {
struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
const bool secondary_cont =
cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
cmd_buffer->usage_flags &
VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
query_indices_size =
util_dynarray_num_elements(&state->query_indices, char);
if (query_indices_size > 0) {
assert(gfx_sub_cmd->query_pool);
if (secondary_cont) {
util_dynarray_append_dynarray(&state->query_indices,
&gfx_sub_cmd->sec_query_indices);
} else {
const void *data = util_dynarray_begin(&state->query_indices);
result = pvr_cmd_buffer_upload_general(cmd_buffer,
data,
query_indices_size,
&query_bo);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
query_pool = gfx_sub_cmd->query_pool;
}
dynamic_render = sub_cmd->is_dynamic_render;
suspend = sub_cmd->is_suspend;
gfx_sub_cmd->has_query = true;
util_dynarray_clear(&state->query_indices);
}
if (secondary_cont) {
result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
break;
}
/* TODO: Check if the sub_cmd can be skipped based on
* sub_cmd->gfx.empty_cmd flag and if dynamic rendering isn't enabled.
*/
/* TODO: Set the state in the functions called with the command buffer
* instead of here.
*/
result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer,
&gfx_sub_cmd->control_stream);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
if (gfx_sub_cmd->multiview_enabled) {
result = pvr_csb_gfx_build_view_index_ctrl_stream(
device,
pvr_csb_get_start_address(&gfx_sub_cmd->control_stream),
&gfx_sub_cmd->multiview_ctrl_stream,
&gfx_sub_cmd->multiview_ctrl_stream_stride);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
cmd_buffer,
gfx_sub_cmd);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) {
result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device,
cmd_buffer,
gfx_sub_cmd);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
break;
}
case PVR_SUB_CMD_TYPE_QUERY:
case PVR_SUB_CMD_TYPE_COMPUTE: {
struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
pvr_sub_cmd_compute_job_init(device->pdevice,
cmd_buffer,
compute_sub_cmd);
break;
}
case PVR_SUB_CMD_TYPE_TRANSFER:
break;
case PVR_SUB_CMD_TYPE_EVENT:
break;
default:
UNREACHABLE("Unsupported sub-command type");
}
state->current_sub_cmd = NULL;
/* pvr_cmd_buffer_process_deferred_clears() must be called with a NULL
* current_sub_cmd.
*
* We can start a sub_cmd of a different type from the current sub_cmd only
* after having ended the current sub_cmd. However, we can't end the current
* sub_cmd if this depends on starting sub_cmd(s) of a different type. Hence,
 * don't try to start transfer sub_cmd(s) with
 * pvr_cmd_buffer_process_deferred_clears() until the current one has ended.
 * Failing to do so would cause a circular dependency between
 * pvr_cmd_buffer_{end,start}_cmd and blow the stack.
*/
if (sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
result = pvr_cmd_buffer_process_deferred_clears(cmd_buffer);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
if (query_pool) {
struct pvr_query_info query_info;
assert(query_bo);
assert(query_indices_size);
query_info.type = PVR_QUERY_TYPE_AVAILABILITY_WRITE;
/* sizeof(uint32_t) is the size of a single query index. */
query_info.availability_write.num_query_indices =
query_indices_size / sizeof(uint32_t);
query_info.availability_write.index_bo = query_bo;
query_info.availability_write.num_queries = query_pool->query_count;
query_info.availability_write.availability_bo =
query_pool->availability_buffer;
query_info.is_dynamic_render = dynamic_render;
query_info.is_suspend = suspend;
/* Insert a barrier after the graphics sub command and before the
* query sub command so that the availability write program waits for the
* fragment shader to complete.
*/
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
if (result != VK_SUCCESS)
return result;
cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
.type = PVR_EVENT_TYPE_BARRIER,
.barrier = {
.wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_QUERY_BIT,
},
};
return pvr_add_query_program(cmd_buffer, &query_info);
}
return VK_SUCCESS;
}
void PVR_PER_ARCH(reset_graphics_dirty_state)(
struct pvr_cmd_buffer *const cmd_buffer,
bool start_geom)
{
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
if (start_geom) {
/*
* Initial geometry phase state.
* It's the driver's responsibility to ensure that the state of the
* hardware is correctly initialized at the start of every geometry
* phase. This is required to prevent stale state from a previous
* geometry phase erroneously affecting the next geometry phase.
*
* If a geometry phase does not contain any geometry, this restriction
* can be ignored. If the first draw call in a geometry phase will only
* update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
* in the ISP State Control Word, the PDS State Pointers
* (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
* be supplied, since they will never reach the PDS in the fragment
* phase.
*/
cmd_buffer->state.emit_header = (struct ROGUE_TA_STATE_HEADER){
.pres_stream_out_size = true,
.pres_ppp_ctrl = true,
.pres_varying_word2 = true,
.pres_varying_word1 = true,
.pres_varying_word0 = true,
.pres_outselects = true,
.pres_wclamp = true,
.pres_viewport = true,
.pres_region_clip = true,
.pres_pds_state_ptr0 = true,
.pres_ispctl_fb = true,
.pres_ispctl = true,
};
} else {
struct ROGUE_TA_STATE_HEADER *const emit_header =
&cmd_buffer->state.emit_header;
emit_header->pres_ppp_ctrl = true;
emit_header->pres_varying_word1 = true;
emit_header->pres_varying_word0 = true;
emit_header->pres_outselects = true;
emit_header->pres_viewport = true;
emit_header->pres_region_clip = true;
emit_header->pres_pds_state_ptr0 = true;
emit_header->pres_ispctl_fb = true;
emit_header->pres_ispctl = true;
}
memset(&cmd_buffer->state.ppp_state,
0U,
sizeof(cmd_buffer->state.ppp_state));
cmd_buffer->state.dirty.vertex_bindings = true;
cmd_buffer->state.dirty.gfx_pipeline_binding = true;
BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
}
static inline bool
pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer)
{
const VkCommandBufferUsageFlags deferred_control_stream_flags =
VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT |
VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
(cmd_buffer->usage_flags & deferred_control_stream_flags) ==
deferred_control_stream_flags;
}
static inline uint32_t
pvr_render_pass_info_get_view_mask(const struct pvr_render_pass_info *rp_info)
{
const uint32_t hw_render_idx = rp_info->current_hw_subpass;
const struct pvr_renderpass_hwsetup_render *hw_render =
pvr_pass_info_get_hw_render(rp_info, hw_render_idx);
return hw_render->view_mask;
}
VkResult
PVR_PER_ARCH(cmd_buffer_start_sub_cmd)(struct pvr_cmd_buffer *cmd_buffer,
enum pvr_sub_cmd_type type)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_device *device = cmd_buffer->device;
struct pvr_sub_cmd *sub_cmd;
VkResult result;
/* Check the current status of the buffer. */
if (vk_command_buffer_has_error(&cmd_buffer->vk))
return vk_command_buffer_get_record_result(&cmd_buffer->vk);
pvr_cmd_buffer_update_barriers(cmd_buffer, type);
/* TODO: Add proper support for joining consecutive event sub_cmd? */
if (state->current_sub_cmd) {
if (state->current_sub_cmd->type == type) {
/* Continue adding to the current sub command. */
return VK_SUCCESS;
}
/* End the current sub command. */
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
return result;
}
sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
sizeof(*sub_cmd),
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!sub_cmd) {
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
sub_cmd->type = type;
sub_cmd->owned = true;
switch (type) {
case PVR_SUB_CMD_TYPE_GRAPHICS:
sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
sub_cmd->gfx.modifies_depth = false;
sub_cmd->gfx.modifies_stencil = false;
sub_cmd->gfx.max_tiles_in_flight =
PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
isp_max_tiles_in_flight,
1);
sub_cmd->gfx.rstate = state->render_pass_info.rstate;
sub_cmd->gfx.empty_cmd = true;
sub_cmd->gfx.view_mask =
pvr_render_pass_info_get_view_mask(&state->render_pass_info);
if (state->render_pass_info.dynamic_render) {
sub_cmd->is_suspend = state->render_pass_info.suspend;
sub_cmd->is_resume = state->render_pass_info.resume;
sub_cmd->is_dynamic_render = true;
sub_cmd->gfx.hw_render = state->render_pass_info.hw_render;
sub_cmd->gfx.multiview_enabled =
state->render_pass_info.hw_render->multiview_enabled;
sub_cmd->gfx.hw_render_idx = 0;
sub_cmd->gfx.dr_info = state->render_pass_info.dr_info;
} else {
sub_cmd->gfx.hw_render_idx =
state->render_pass_info.current_hw_subpass;
sub_cmd->gfx.multiview_enabled =
state->render_pass_info.pass
? state->render_pass_info.pass->multiview_enabled
: false;
}
if (state->vis_test_enabled)
sub_cmd->gfx.query_pool = state->query_pool;
pvr_reset_graphics_dirty_state(cmd_buffer, true);
if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) {
pvr_csb_init(device,
PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED,
&sub_cmd->gfx.control_stream);
} else {
pvr_csb_init(device,
PVR_CMD_STREAM_TYPE_GRAPHICS,
&sub_cmd->gfx.control_stream);
}
sub_cmd->gfx.sec_query_indices = UTIL_DYNARRAY_INIT;
break;
case PVR_SUB_CMD_TYPE_QUERY:
case PVR_SUB_CMD_TYPE_COMPUTE:
pvr_csb_init(device,
PVR_CMD_STREAM_TYPE_COMPUTE,
&sub_cmd->compute.control_stream);
break;
case PVR_SUB_CMD_TYPE_TRANSFER:
sub_cmd->transfer.transfer_cmds = &sub_cmd->transfer.transfer_cmds_priv;
list_inithead(sub_cmd->transfer.transfer_cmds);
break;
case PVR_SUB_CMD_TYPE_EVENT:
break;
default:
UNREACHABLE("Unsupported sub-command type");
}
list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
state->current_sub_cmd = sub_cmd;
return VK_SUCCESS;
}
VkResult
PVR_PER_ARCH(cmd_buffer_alloc_mem)(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_winsys_heap *heap,
uint64_t size,
struct pvr_suballoc_bo **const pvr_bo_out)
{
const uint32_t cache_line_size =
pvr_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
struct pvr_suballoc_bo *suballoc_bo;
struct pvr_suballocator *allocator;
VkResult result;
if (heap == cmd_buffer->device->heaps.general_heap)
allocator = &cmd_buffer->device->suballoc_general;
else if (heap == cmd_buffer->device->heaps.pds_heap)
allocator = &cmd_buffer->device->suballoc_pds;
else if (heap == cmd_buffer->device->heaps.transfer_frag_heap)
allocator = &cmd_buffer->device->suballoc_transfer;
else if (heap == cmd_buffer->device->heaps.usc_heap)
allocator = &cmd_buffer->device->suballoc_usc;
else
UNREACHABLE("Unknown heap type");
result =
pvr_bo_suballoc(allocator, size, cache_line_size, false, &suballoc_bo);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
*pvr_bo_out = suballoc_bo;
return VK_SUCCESS;
}
static void pvr_cmd_bind_compute_pipeline(
const struct pvr_compute_pipeline *const compute_pipeline,
struct pvr_cmd_buffer *const cmd_buffer)
{
cmd_buffer->state.compute_pipeline = compute_pipeline;
cmd_buffer->state.dirty.compute_pipeline_binding = true;
}
static void pvr_cmd_bind_graphics_pipeline(
const struct pvr_graphics_pipeline *const gfx_pipeline,
struct pvr_cmd_buffer *const cmd_buffer)
{
cmd_buffer->state.gfx_pipeline = gfx_pipeline;
cmd_buffer->state.dirty.gfx_pipeline_binding = true;
vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
&gfx_pipeline->dynamic_state);
}
void PVR_PER_ARCH(CmdBindPipeline)(VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
VkPipeline _pipeline)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
switch (pipelineBindPoint) {
case VK_PIPELINE_BIND_POINT_COMPUTE:
pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
cmd_buffer);
break;
case VK_PIPELINE_BIND_POINT_GRAPHICS:
pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
cmd_buffer);
break;
default:
UNREACHABLE("Invalid bind point.");
break;
}
}
#if MESA_DEBUG
static void check_viewport_quirk_70165(const struct pvr_device *device,
const VkViewport *pViewport)
{
const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
float min_screen_space_value, max_screen_space_value;
float sign_to_unsigned_offset, fixed_point_max;
float guardband_width, guardband_height;
if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
/* Max representable value in 13.4 fixed point format.
* Round-down to avoid precision issues.
* Calculated as (2 ** 13) - 2*(2 ** -4)
*/
fixed_point_max = 8192.0f - 2.0f / 16.0f;
if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
if (pViewport->width <= 4096 && pViewport->height <= 4096) {
guardband_width = pViewport->width / 4.0f;
guardband_height = pViewport->height / 4.0f;
/* 2k of the range is negative */
sign_to_unsigned_offset = 2048.0f;
} else {
guardband_width = 0.0f;
guardband_height = 0.0f;
/* For > 4k renders, the entire range is positive */
sign_to_unsigned_offset = 0.0f;
}
} else {
guardband_width = pViewport->width / 4.0f;
guardband_height = pViewport->height / 4.0f;
/* 2k of the range is negative */
sign_to_unsigned_offset = 2048.0f;
}
} else {
/* Max representable value in 16.8 fixed point format
* Calculated as (2 ** 16) - (2 ** -8)
*/
fixed_point_max = 65535.99609375f;
guardband_width = pViewport->width / 4.0f;
guardband_height = pViewport->height / 4.0f;
/* 4k/20k of the range is negative */
sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
}
min_screen_space_value = -sign_to_unsigned_offset;
max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
min_vertex_x = pViewport->x - guardband_width;
max_vertex_x = pViewport->x + pViewport->width + guardband_width;
min_vertex_y = pViewport->y - guardband_height;
max_vertex_y = pViewport->y + pViewport->height + guardband_height;
if (min_vertex_x < min_screen_space_value ||
max_vertex_x > max_screen_space_value ||
min_vertex_y < min_screen_space_value ||
max_vertex_y > max_screen_space_value) {
mesa_logw("Viewport is affected by BRN70165, geometry outside "
"the viewport could be corrupted");
}
}
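/* Worked example for the simple_internal_parameter_format path above
 * (illustrative only): fixed_point_max = 8191.875 and
 * sign_to_unsigned_offset = 2048, so screen space spans [-2048, 6143.875].
 * A 4096-wide viewport at x = 0 gets a 1024 pixel guardband, so vertices may
 * reach [-1024, 5120], which is in range and does not trigger the warning.
 */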
#endif
void PVR_PER_ARCH(CmdSetViewport)(VkCommandBuffer commandBuffer,
uint32_t firstViewport,
uint32_t viewportCount,
const VkViewport *pViewports)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
const uint32_t total_count = firstViewport + viewportCount;
assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
#if MESA_DEBUG
if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
}
}
#endif
vk_common_CmdSetViewport(commandBuffer,
firstViewport,
viewportCount,
pViewports);
}
void PVR_PER_ARCH(CmdSetDepthBounds)(VkCommandBuffer commandBuffer,
float minDepthBounds,
float maxDepthBounds)
{
mesa_logd("No support for depth bounds testing.");
}
void PVR_PER_ARCH(CmdBindDescriptorSets2KHR)(
VkCommandBuffer commandBuffer,
const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(vk_pipeline_layout,
pipeline_layout,
pBindDescriptorSetsInfo->layout);
unsigned dyn_off = 0;
const bool has_dyn_offs = pBindDescriptorSetsInfo->dynamicOffsetCount > 0;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
struct pvr_descriptor_state *graphics_desc_state =
&cmd_buffer->state.gfx_desc_state;
struct pvr_descriptor_state *compute_desc_state =
&cmd_buffer->state.compute_desc_state;
for (unsigned u = 0; u < pBindDescriptorSetsInfo->descriptorSetCount; ++u) {
VK_FROM_HANDLE(pvr_descriptor_set,
set,
pBindDescriptorSetsInfo->pDescriptorSets[u]);
unsigned desc_set = u + pBindDescriptorSetsInfo->firstSet;
const struct pvr_descriptor_set_layout *set_layout =
vk_to_pvr_descriptor_set_layout(
pipeline_layout->set_layouts[desc_set]);
for (unsigned u = 0; u < set_layout->dynamic_buffer_count; ++u) {
set->dynamic_buffers[u].offset =
has_dyn_offs ? pBindDescriptorSetsInfo->pDynamicOffsets[dyn_off++]
: 0;
}
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
if (graphics_desc_state->sets[desc_set] != set) {
graphics_desc_state->sets[desc_set] = set;
graphics_desc_state->dirty_sets |= BITFIELD_BIT(desc_set);
}
}
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
if (compute_desc_state->sets[desc_set] != set) {
compute_desc_state->sets[desc_set] = set;
compute_desc_state->dirty_sets |= BITFIELD_BIT(desc_set);
}
}
}
assert(dyn_off == pBindDescriptorSetsInfo->dynamicOffsetCount);
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS)
cmd_buffer->state.dirty.gfx_desc_dirty = true;
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
cmd_buffer->state.dirty.compute_desc_dirty = true;
}
void PVR_PER_ARCH(CmdBindVertexBuffers2)(VkCommandBuffer commandBuffer,
uint32_t firstBinding,
uint32_t bindingCount,
const VkBuffer *pBuffers,
const VkDeviceSize *pOffsets,
const VkDeviceSize *pSizes,
const VkDeviceSize *pStrides)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
for (uint32_t i = 0; i < bindingCount; i++) {
VK_FROM_HANDLE(pvr_buffer, buffer, pBuffers[i]);
const uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
vb[firstBinding + i] = (struct pvr_vertex_binding){
.buffer = buffer,
.offset = pOffsets[i],
.size = vk_buffer_range(&buffer->vk, pOffsets[i], size),
};
}
if (pStrides != NULL) {
vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk,
firstBinding,
bindingCount,
pStrides);
}
cmd_buffer->state.dirty.vertex_bindings = true;
}
void PVR_PER_ARCH(CmdBindIndexBuffer)(VkCommandBuffer commandBuffer,
VkBuffer buffer,
VkDeviceSize offset,
VkIndexType indexType)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
assert(offset < index_buffer->vk.size);
assert(indexType == VK_INDEX_TYPE_UINT32 ||
indexType == VK_INDEX_TYPE_UINT16 ||
indexType == VK_INDEX_TYPE_UINT8_KHR);
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
state->index_buffer_binding.buffer = index_buffer;
state->index_buffer_binding.offset = offset;
state->index_buffer_binding.type = indexType;
state->dirty.index_buffer_binding = true;
}
static void update_push_constants(struct pvr_push_constants *push_consts,
uint32_t offset,
uint32_t size,
const void *data)
{
memcpy(&push_consts->data[offset], data, size);
push_consts->bytes_updated = MAX2(push_consts->bytes_updated, offset + size);
push_consts->dirty = true;
}
void PVR_PER_ARCH(CmdPushConstants2KHR)(
VkCommandBuffer commandBuffer,
const VkPushConstantsInfoKHR *pPushConstantsInfo)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_VERTEX_BIT) {
update_push_constants(
&state->push_consts[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY],
pPushConstantsInfo->offset,
pPushConstantsInfo->size,
pPushConstantsInfo->pValues);
}
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_FRAGMENT_BIT) {
update_push_constants(&state->push_consts[PVR_STAGE_ALLOCATION_FRAGMENT],
pPushConstantsInfo->offset,
pPushConstantsInfo->size,
pPushConstantsInfo->pValues);
}
if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
update_push_constants(&state->push_consts[PVR_STAGE_ALLOCATION_COMPUTE],
pPushConstantsInfo->offset,
pPushConstantsInfo->size,
pPushConstantsInfo->pValues);
}
}
static void pvr_cmd_buffer_attachments_free(struct pvr_cmd_buffer *cmd_buffer)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
if (!state->render_pass_info.attachments) {
assert(!state->render_pass_info.attachment_count);
return;
}
vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
state->render_pass_info.attachment_count = 0;
state->render_pass_info.attachments = NULL;
}
static VkResult
pvr_cmd_buffer_attachments_alloc(struct pvr_cmd_buffer *cmd_buffer,
uint32_t attachment_count)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_render_pass_info *info = &state->render_pass_info;
/* Free any previously allocated attachments. */
pvr_cmd_buffer_attachments_free(cmd_buffer);
state->render_pass_info.attachment_count = attachment_count;
if (attachment_count == 0) {
info->attachments = NULL;
return VK_SUCCESS;
}
const size_t size = attachment_count * sizeof(*info->attachments);
info->attachments = vk_zalloc(&cmd_buffer->vk.pool->alloc,
size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!info->attachments) {
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
return VK_SUCCESS;
}
static VkResult pvr_cmd_buffer_attachments_setup(
struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_render_pass *pass,
const struct pvr_framebuffer *framebuffer,
const VkRenderPassBeginInfo *pRenderPassBeginInfo)
{
const VkRenderPassAttachmentBeginInfo *pImageless =
vk_find_struct_const(pRenderPassBeginInfo->pNext,
RENDER_PASS_ATTACHMENT_BEGIN_INFO);
struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
VkResult result;
if (!pImageless)
assert(pass->attachment_count == framebuffer->attachment_count);
result =
pvr_cmd_buffer_attachments_alloc(cmd_buffer, pass->attachment_count);
if (result != VK_SUCCESS)
return result;
for (uint32_t i = 0; i < pass->attachment_count; i++) {
if (pImageless && pImageless->attachmentCount) {
assert(pImageless->attachmentCount == pass->attachment_count);
VK_FROM_HANDLE(pvr_image_view, img_view, pImageless->pAttachments[i]);
info->attachments[i] = img_view;
} else {
info->attachments[i] = framebuffer->attachments[i];
}
}
return result;
}
static inline VkResult pvr_render_targets_datasets_create(
struct pvr_device *device,
struct pvr_render_state *rstate,
const struct pvr_renderpass_hwsetup_render *hw_render,
struct pvr_render_target *render_target)
{
const struct pvr_device_info *const dev_info = &device->pdevice->dev_info;
const uint32_t layers =
PVR_HAS_FEATURE(dev_info, gs_rta_support) ? rstate->layers : 1;
pthread_mutex_lock(&render_target->mutex);
u_foreach_bit (view_idx, hw_render->view_mask) {
struct pvr_rt_dataset *rt_dataset;
VkResult result;
if (render_target->valid_mask & BITFIELD_BIT(view_idx))
continue;
result = pvr_render_target_dataset_create(device,
rstate->width,
rstate->height,
hw_render->sample_count,
layers,
&rt_dataset);
if (result != VK_SUCCESS) {
pvr_render_targets_datasets_destroy(render_target);
pthread_mutex_unlock(&render_target->mutex);
return result;
}
render_target->valid_mask |= BITFIELD_BIT(view_idx);
render_target->rt_dataset[view_idx] = rt_dataset;
}
pthread_mutex_unlock(&render_target->mutex);
return VK_SUCCESS;
}
static VkResult pvr_render_targets_dataset_init(
struct pvr_device *device,
struct pvr_render_state *rstate,
const struct pvr_renderpass_hwsetup_render *hw_render)
{
struct pvr_render_target *render_target =
pvr_get_render_target(hw_render, rstate);
return pvr_render_targets_datasets_create(device,
rstate,
hw_render,
render_target);
}
static VkResult
pvr_render_targets_init_for_render(struct pvr_device *device,
struct pvr_render_pass *pass,
struct pvr_framebuffer *framebuffer)
{
for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
const struct pvr_renderpass_hwsetup_render *hw_render =
&pass->hw_setup->renders[i];
VkResult result;
result = pvr_render_targets_dataset_init(device,
framebuffer->rstate,
hw_render);
if (result != VK_SUCCESS)
return result;
}
return VK_SUCCESS;
}
const struct pvr_renderpass_hwsetup_subpass *
PVR_PER_ARCH(get_hw_subpass)(const struct pvr_render_pass *pass,
const uint32_t subpass)
{
const struct pvr_renderpass_hw_map *map =
&pass->hw_setup->subpass_map[subpass];
return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
}
static void
pvr_perform_start_of_render_attachment_clear(struct pvr_cmd_buffer *cmd_buffer,
uint32_t layers,
uint32_t index,
bool is_depth_stencil,
uint32_t *index_list_clear_mask,
bool resume_render)
{
ASSERTED static const VkImageAspectFlags dsc_aspect_flags =
VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT |
VK_IMAGE_ASPECT_COLOR_BIT;
struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
const struct pvr_render_pass *pass = info->pass;
const struct pvr_renderpass_hwsetup *hw_setup = pass ? pass->hw_setup : NULL;
const uint32_t hw_render_idx =
hw_setup ? hw_setup->subpass_map[info->subpass_idx].render : 0;
const struct pvr_renderpass_hwsetup_render *hw_render =
pvr_pass_info_get_hw_render(info, hw_render_idx);
VkImageAspectFlags image_aspect;
const struct pvr_image *image;
struct pvr_image_view *iview;
VkFormat vk_format;
uint32_t iview_idx;
if (is_depth_stencil) {
assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED);
iview_idx = hw_render->ds_attach_idx;
} else {
assert(index != VK_ATTACHMENT_UNUSED);
iview_idx = hw_render->color_init[index].index;
}
assert(iview_idx != VK_ATTACHMENT_UNUSED);
iview = info->attachments[iview_idx];
image = pvr_image_view_get_image(iview);
vk_format = image->vk.format;
if (is_depth_stencil) {
bool stencil_clear;
bool depth_clear;
bool is_stencil;
bool is_depth;
assert(index == 0);
depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
is_stencil = vk_format_has_stencil(vk_format);
is_depth = vk_format_has_depth(vk_format);
/* Attempt to clear the ds attachment. Do not erroneously skip it when
 * there is no depth clear but the stencil aspect still needs clearing.
 */
/* i.e. return early unless (is_depth ∧ depth_clear) ∨ (is_stencil ∧ stencil_clear). */
if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
return;
} else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) {
return;
}
/* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
* were doing the same check (even if it's just an assert) to determine if a
* clear is needed.
*/
/* If this is single-layer fullscreen, we already do the clears in
* pvr_sub_cmd_gfx_job_init().
*/
if (pvr_is_render_area_tile_aligned(cmd_buffer, iview) && layers == 1) {
return;
}
image_aspect = vk_format_aspects(vk_format);
assert((image_aspect & ~dsc_aspect_flags) == 0);
if (image_aspect & VK_IMAGE_ASPECT_DEPTH_BIT &&
hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
image_aspect &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
}
if (image_aspect & VK_IMAGE_ASPECT_STENCIL_BIT &&
hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
image_aspect &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
}
if (image_aspect != VK_IMAGE_ASPECT_NONE && !resume_render) {
VkClearAttachment clear_attachment = {
.aspectMask = image_aspect,
.colorAttachment = index,
.clearValue = info->clear_values[iview_idx],
};
VkClearRect rect = {
.rect = info->render_area,
.baseArrayLayer = 0,
/* TODO:
*
* Verify where layers should come from, info->framebuffer
* vs layers (function argument). Aren't they the same?
* If they are, drop the argument altogether.
*/
.layerCount = info->rstate->layers,
};
assert(iview_idx < info->clear_value_count);
pvr_clear_attachments_render_init(cmd_buffer, &clear_attachment, &rect);
*index_list_clear_mask |= (1 << index);
}
}
static void
pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer,
bool resume_render)
{
struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
const struct pvr_renderpass_hwsetup_render *hw_render;
const struct pvr_render_pass *pass = info->pass;
if (pass) {
const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
hw_render =
&hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
} else {
hw_render = pvr_pass_info_get_hw_render(info, 0);
}
/* Mask of attachments whose clears are performed with index lists instead
 * of the background object.
 */
uint32_t index_list_clear_mask = 0;
for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
pvr_perform_start_of_render_attachment_clear(cmd_buffer,
info->rstate->layers,
i,
false,
&index_list_clear_mask,
resume_render);
}
info->enable_bg_tag = !!hw_render->color_init_count;
/* If we're not using index lists for all clears/loads, then we need to run
 * the background object on empty tiles.
 */
if (hw_render->color_init_count &&
index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
info->process_empty_tiles = true;
} else {
info->process_empty_tiles = false;
}
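/* Example of the check above: with three color inits, a clear mask of 0b010
 * (only attachment 1 cleared via an index list) differs from 0b111, so the
 * background object must still run on empty tiles to perform the remaining
 * loads/clears; only when every init is handled by an index list can empty
 * tile processing be skipped.
 */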
if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
uint32_t ds_index_list = 0;
pvr_perform_start_of_render_attachment_clear(cmd_buffer,
info->rstate->layers,
0,
true,
&ds_index_list,
resume_render);
}
}
static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
struct pvr_sub_cmd_gfx *const sub_cmd)
{
const struct pvr_renderpass_hwsetup_render *hw_render =
pvr_pass_info_get_hw_render(&state->render_pass_info,
sub_cmd->hw_render_idx);
if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
struct pvr_image_view **iviews = state->render_pass_info.attachments;
state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format;
}
}
static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
{
for (uint32_t i = 0; i < hw_setup->render_count; i++) {
struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
uint32_t render_targets_count = hw_render->init_setup.num_render_targets;
for (uint32_t j = 0;
j < (hw_render->color_init_count * render_targets_count);
j += render_targets_count) {
for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets;
k++) {
if (hw_render->color_init[j + k].op ==
VK_ATTACHMENT_LOAD_OP_CLEAR) {
return true;
}
}
}
if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR ||
hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) {
return true;
}
}
return false;
}
static void pvr_cmd_buffer_clear_values_free(struct pvr_cmd_buffer *cmd_buffer)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
if (!state->render_pass_info.clear_values) {
assert(!state->render_pass_info.clear_value_count);
return;
}
vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
state->render_pass_info.clear_value_count = 0;
state->render_pass_info.clear_values = NULL;
}
static VkResult
pvr_cmd_buffer_clear_values_alloc(struct pvr_cmd_buffer *cmd_buffer,
uint32_t clear_value_count)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
/* Free any previously allocated clear values. */
pvr_cmd_buffer_clear_values_free(cmd_buffer);
state->render_pass_info.clear_value_count = clear_value_count;
if (!clear_value_count) {
state->render_pass_info.clear_values = NULL;
return VK_SUCCESS;
}
const size_t size =
clear_value_count * sizeof(*state->render_pass_info.clear_values);
state->render_pass_info.clear_values =
vk_zalloc(&cmd_buffer->vk.pool->alloc,
size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!state->render_pass_info.clear_values) {
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
return VK_SUCCESS;
}
static VkResult
pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
const VkRenderPassBeginInfo *pRenderPassBegin)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
const size_t size = pRenderPassBegin->clearValueCount *
sizeof(*state->render_pass_info.clear_values);
VkResult result;
result =
pvr_cmd_buffer_clear_values_alloc(cmd_buffer,
pRenderPassBegin->clearValueCount);
if (result != VK_SUCCESS)
return result;
memcpy(state->render_pass_info.clear_values,
pRenderPassBegin->pClearValues,
size);
return result;
}
/**
* \brief Indicates whether to use the large or normal clear state words.
*
* If the current render area can fit within a quarter of the max framebuffer
 * that the device is capable of, we can use the normal clear state words;
 * otherwise the large clear state words are needed.
*
* The requirement of a quarter of the max framebuffer comes from the index
* count used in the normal clear state words and the vertices uploaded at
* device creation.
*
* \param[in] cmd_buffer The command buffer for the clear.
* \return true if large clear state words are required.
*/
static bool
pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer)
{
const struct pvr_device_info *const dev_info =
&cmd_buffer->device->pdevice->dev_info;
const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area;
const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info);
const uint32_t vf_max_y = rogue_get_param_vf_max_y(dev_info);
return (render_area.extent.width > (vf_max_x / 2) - 1) ||
(render_area.extent.height > (vf_max_y / 2) - 1);
}
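/* Illustrative only (the vf_max_* values differ per core): if vf_max_x and
 * vf_max_y were both 16384, render areas up to 8191x8191 would use the
 * normal clear state words and anything larger would need the large ones.
 */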
static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd)
{
struct pvr_device *device = cmd_buffer->device;
struct pvr_csb *csb = &sub_cmd->control_stream;
uint32_t vdm_state_size_in_dw;
const uint32_t *vdm_state;
uint32_t *stream;
vdm_state_size_in_dw =
pvr_clear_vdm_state_get_size_in_dw(&device->pdevice->dev_info, 1);
pvr_csb_set_relocation_mark(csb);
stream = pvr_csb_alloc_dwords(csb, vdm_state_size_in_dw);
if (!stream) {
pvr_cmd_buffer_set_error_unwarned(cmd_buffer, csb->status);
return;
}
if (pvr_is_large_clear_required(cmd_buffer))
vdm_state = device->static_clear_state->large_clear_vdm_words;
else
vdm_state = device->static_clear_state->vdm_words;
memcpy(stream, vdm_state, PVR_DW_TO_BYTES(vdm_state_size_in_dw));
pvr_csb_clear_relocation_mark(csb);
}
static VkResult pvr_cs_write_load_op_for_view(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_gfx *sub_cmd,
struct pvr_load_op *load_op,
uint32_t view_index,
uint32_t isp_userpass)
{
const struct pvr_device *device = cmd_buffer->device;
struct pvr_static_clear_ppp_template template =
device->static_clear_state->ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
uint32_t pds_state[PVR_PDS_STATE_LENGTH];
struct pvr_pds_upload shareds_update_program;
struct pvr_suballoc_bo *pvr_bo;
VkResult result;
result = pvr_load_op_data_create_and_upload(cmd_buffer,
load_op,
view_index,
&shareds_update_program);
if (result != VK_SUCCESS)
return result;
template.config.ispctl.upass = isp_userpass;
/* It might look odd that we aren't specifying the code segment's
 * address anywhere. This is because the hardware always assumes that the
 * data size is 2 128-bit words and the code segment starts after that.
 */
pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
TA_STATE_PDS_SHADERBASE,
shaderbase) {
shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
}
pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE],
TA_STATE_PDS_TEXUNICODEBASE,
texunicodebase) {
texunicodebase.addr =
PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
}
pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1],
TA_STATE_PDS_SIZEINFO1,
sizeinfo1) {
/* Dummy coefficient loading program. */
sizeinfo1.pds_varyingsize = 0;
sizeinfo1.pds_texturestatesize = DIV_ROUND_UP(
shareds_update_program.data_size,
ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE);
sizeinfo1.pds_tempsize =
DIV_ROUND_UP(load_op->temps_count,
ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE);
}
pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2],
TA_STATE_PDS_SIZEINFO2,
sizeinfo2) {
sizeinfo2.usc_sharedsize =
DIV_ROUND_UP(load_op->const_shareds_count,
ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE);
}
/* Dummy coefficient loading program. */
pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0;
pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE],
TA_STATE_PDS_TEXTUREDATABASE,
texturedatabase) {
texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset);
}
template.config.pds_state = &pds_state;
pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo);
list_add(&pvr_bo->link, &cmd_buffer->bo_list);
pvr_emit_clear_words(cmd_buffer, sub_cmd);
pvr_reset_graphics_dirty_state(cmd_buffer, false);
return VK_SUCCESS;
}
static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_gfx *sub_cmd,
struct pvr_load_op *load_op,
uint32_t isp_userpass)
{
assert(load_op->view_count);
for (uint32_t i = 0; i < load_op->view_count; i++) {
const uint32_t view_index = load_op->view_indices[i];
VkResult result;
result = pvr_cs_write_load_op_for_view(cmd_buffer,
sub_cmd,
load_op,
view_index,
isp_userpass);
if (result != VK_SUCCESS)
return result;
}
return VK_SUCCESS;
}
static inline enum pvr_resolve_op
pvr_resolve_mode_to_op(enum VkResolveModeFlagBits mode)
{
switch (mode) {
default:
UNREACHABLE("invalid resolve mode");
FALLTHROUGH;
case VK_RESOLVE_MODE_AVERAGE_BIT:
return PVR_RESOLVE_BLEND;
case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
return PVR_RESOLVE_SAMPLE0;
case VK_RESOLVE_MODE_MIN_BIT:
return PVR_RESOLVE_MIN;
case VK_RESOLVE_MODE_MAX_BIT:
return PVR_RESOLVE_MAX;
}
}
static inline void
pvr_resolve_subresource_layer_init(const struct pvr_image_view *const iview,
VkImageSubresourceLayers *subresource)
{
*subresource = (VkImageSubresourceLayers){
.mipLevel = iview->vk.base_mip_level,
.baseArrayLayer = iview->vk.base_array_layer,
.layerCount = iview->vk.layer_count,
};
}
static inline void
pvr_resolve_offset_init(const struct pvr_render_pass_info *const info,
VkOffset3D *offset)
{
*offset = (VkOffset3D){
.x = info->render_area.offset.x,
.y = info->render_area.offset.y,
};
}
/* clang-format off */
static inline void
pvr_resolve_image_copy_region_init(const struct pvr_image_view *const src_view,
const struct pvr_image_view *const dst_view,
const struct pvr_render_pass_info *const info,
VkImageCopy2 *region)
{
pvr_resolve_subresource_layer_init(src_view, &region->srcSubresource);
pvr_resolve_subresource_layer_init(dst_view, &region->dstSubresource);
pvr_resolve_offset_init(info, &region->srcOffset);
pvr_resolve_offset_init(info, &region->dstOffset);
region->extent = (VkExtent3D){
.width = info->render_area.extent.width,
.height = info->render_area.extent.height,
.depth = 1
};
}
/* clang-format on */
static VkResult
pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_render_pass_info *info)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
const struct pvr_renderpass_hwsetup_render *hw_render =
pvr_pass_info_get_hw_render(&state->render_pass_info,
info->current_hw_subpass);
for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) {
const struct pvr_renderpass_hwsetup_eot_surface *surface =
&hw_render->eot_surfaces[i];
const uint32_t color_attach_idx = surface->src_attachment_idx;
const uint32_t resolve_attach_idx = surface->attachment_idx;
struct pvr_image_view *dst_view;
struct pvr_image_view *src_view;
VkFormat src_format;
VkFormat dst_format;
VkImageCopy2 region;
VkResult result;
if (!surface->need_resolve ||
surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER) {
continue;
}
dst_view = info->attachments[resolve_attach_idx];
src_view = info->attachments[color_attach_idx];
pvr_resolve_image_copy_region_init(src_view, dst_view, info, &region);
region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
/* TODO: If ERN_46863 is supported, depth and stencil are sampled
 * separately from images with a combined depth+stencil format. Add logic
 * here to handle that case using the appropriate format from the image
 * view.
 */
src_format = src_view->vk.image->format;
dst_format = dst_view->vk.image->format;
src_view->vk.image->format = src_view->vk.format;
dst_view->vk.image->format = dst_view->vk.format;
result = pvr_copy_or_resolve_color_image_region(
cmd_buffer,
vk_to_pvr_image(src_view->vk.image),
vk_to_pvr_image(dst_view->vk.image),
&region);
src_view->vk.image->format = src_format;
dst_view->vk.image->format = dst_format;
state->current_sub_cmd->transfer.serialize_with_frag = true;
if (result != VK_SUCCESS)
return result;
}
if (hw_render->stencil_resolve_mode || hw_render->depth_resolve_mode) {
const struct pvr_image_view *dst_view =
info->attachments[hw_render->ds_attach_resolve_idx];
const struct pvr_image_view *src_view =
info->attachments[hw_render->ds_attach_idx];
const VkClearValue *resolve_clear_values =
&state->render_pass_info.clear_values[hw_render->ds_attach_resolve_idx];
const bool both_stencil = vk_format_has_stencil(dst_view->vk.format) &&
vk_format_has_stencil(src_view->vk.format);
const bool both_depth = vk_format_has_depth(dst_view->vk.format) &&
vk_format_has_depth(src_view->vk.format);
struct pvr_render_pass_attachment resolve_attachment;
bool clear_depth = false, clear_stencil = false;
VkClearDepthStencilValue clear_values;
VkImageCopy2 region;
pvr_resolve_image_copy_region_init(src_view, dst_view, info, &region);
if (!info->dr_info) {
resolve_attachment =
info->pass->attachments[hw_render->ds_attach_resolve_idx];
if (both_depth && !hw_render->depth_resolve_mode &&
resolve_attachment.store_op == VK_ATTACHMENT_STORE_OP_STORE &&
resolve_attachment.load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
clear_values.depth = resolve_clear_values->depthStencil.depth;
clear_depth = true;
}
if (both_stencil && !hw_render->stencil_resolve_mode &&
resolve_attachment.stencil_store_op ==
VK_ATTACHMENT_STORE_OP_STORE &&
resolve_attachment.stencil_load_op ==
VK_ATTACHMENT_LOAD_OP_CLEAR) {
clear_values.stencil = resolve_clear_values->depthStencil.stencil;
clear_stencil = true;
}
}
/* Resolve depth and, if possible, fold the stencil clear into the same operation. */
if (both_depth && hw_render->depth_resolve_mode) {
region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
pvr_copy_or_resolve_depth_stencil_region(
cmd_buffer,
vk_to_pvr_image(src_view->vk.image),
vk_to_pvr_image(dst_view->vk.image),
pvr_resolve_mode_to_op(hw_render->depth_resolve_mode),
clear_stencil,
&clear_values,
&region);
state->current_sub_cmd->transfer.serialize_with_frag = true;
clear_stencil = false;
}
/* Resolve stencil and, if possible, fold the depth clear into the same operation. */
if (both_stencil && hw_render->stencil_resolve_mode) {
region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
pvr_copy_or_resolve_depth_stencil_region(
cmd_buffer,
vk_to_pvr_image(src_view->vk.image),
vk_to_pvr_image(dst_view->vk.image),
pvr_resolve_mode_to_op(hw_render->stencil_resolve_mode),
clear_depth,
&clear_values,
&region);
state->current_sub_cmd->transfer.serialize_with_frag = true;
clear_depth = false;
}
/* Clear whatever could not be folded into a resolve above. */
if (clear_stencil || clear_depth) {
const VkImageAspectFlagBits aspect =
(clear_depth ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
(clear_stencil ? VK_IMAGE_ASPECT_STENCIL_BIT : 0);
VkImageSubresourceRange range = (VkImageSubresourceRange){
.aspectMask = aspect,
.baseMipLevel = dst_view->vk.base_mip_level,
.levelCount = 1,
.baseArrayLayer = dst_view->vk.base_array_layer,
.layerCount = dst_view->vk.layer_count,
};
pvr_clear_depth_stencil_image(cmd_buffer,
vk_to_pvr_image(dst_view->vk.image),
&clear_values,
1,
&range);
state->current_sub_cmd->transfer.serialize_with_frag = true;
}
}
return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}
void PVR_PER_ARCH(CmdBeginRenderPass2)(
VkCommandBuffer commandBuffer,
const VkRenderPassBeginInfo *pRenderPassBeginInfo,
const VkSubpassBeginInfo *pSubpassBeginInfo)
{
VK_FROM_HANDLE(pvr_framebuffer,
framebuffer,
pRenderPassBeginInfo->framebuffer);
VK_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
assert(!state->render_pass_info.pass);
assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
/* FIXME: Create a separate function for everything using pass->subpasses,
* look at cmd_buffer_begin_subpass() for example. */
state->render_pass_info.pass = pass;
state->render_pass_info.framebuffer = framebuffer;
state->render_pass_info.rstate = framebuffer->rstate;
state->render_pass_info.subpass_idx = 0;
state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
state->render_pass_info.current_hw_subpass = 0;
state->render_pass_info.pipeline_bind_point =
pass->subpasses[0].pipeline_bind_point;
state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass;
state->dirty.isp_userpass = true;
result = pvr_cmd_buffer_attachments_setup(cmd_buffer,
pass,
framebuffer,
pRenderPassBeginInfo);
if (result != VK_SUCCESS)
return;
result =
pvr_render_targets_init_for_render(cmd_buffer->device, pass, framebuffer);
if (result != VK_SUCCESS) {
pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
return;
}
result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
if (result != VK_SUCCESS)
return;
assert(pass->subpasses[0].pipeline_bind_point ==
VK_PIPELINE_BIND_POINT_GRAPHICS);
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
if (result != VK_SUCCESS)
return;
/* Run subpass 0 "soft" background object after the actual background
* object.
*/
hw_subpass = pvr_get_hw_subpass(pass, 0);
if (hw_subpass->load_op) {
result = pvr_cs_write_load_op(cmd_buffer,
&cmd_buffer->state.current_sub_cmd->gfx,
hw_subpass->load_op,
0);
if (result != VK_SUCCESS)
return;
}
pvr_perform_start_of_render_clears(cmd_buffer, false);
pvr_stash_depth_format(&cmd_buffer->state,
&cmd_buffer->state.current_sub_cmd->gfx);
}
static inline uint32_t
pvr_get_ds_component_bits(const VkFormat vk_format,
const VkImageAspectFlags ds_aspect)
{
uint32_t component;
if (ds_aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
assert(vk_format_has_stencil(vk_format));
component = 0;
} else {
assert(ds_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
assert(vk_format_has_depth(vk_format));
component = 1;
}
return vk_format_get_component_bits(vk_format,
UTIL_FORMAT_COLORSPACE_ZS,
component);
}
static inline bool
pvr_can_pbe_resolve_ds_attachment(const struct pvr_device_info *dev_info,
VkFormat vk_format,
uint32_t sample_count,
VkResolveModeFlagBits resolve_mode)
{
if (!PVR_HAS_FEATURE(dev_info, gs_rta_support))
return false;
if (resolve_mode != VK_RESOLVE_MODE_AVERAGE_BIT)
return false;
if (pvr_get_ds_component_bits(vk_format, VK_IMAGE_ASPECT_STENCIL_BIT))
return false;
return pvr_format_is_pbe_downscalable(dev_info, vk_format);
}
static inline VkResult
pvr_mrt_setup_partial_init(struct pvr_device *const device,
struct usc_mrt_setup *mrt_setup,
                           uint32_t num_render_targets,
uint32_t num_output_regs,
uint32_t num_tile_buffers)
{
struct usc_mrt_resource *mrt_resources = NULL;
   if (num_render_targets) {
      const uint32_t size =
         num_render_targets * sizeof(*mrt_setup->mrt_resources);
mrt_resources = vk_zalloc(&device->vk.alloc,
size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!mrt_resources)
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
*mrt_setup = (struct usc_mrt_setup){
      .num_render_targets = num_render_targets,
.num_output_regs = num_output_regs,
.num_tile_buffers = num_tile_buffers,
.mrt_resources = mrt_resources,
};
return VK_SUCCESS;
}
static void pvr_dynamic_rendering_output_attachments_cleanup(
const struct pvr_device *device,
const VkAllocationCallbacks *const allocator,
struct pvr_dynamic_render_info *dr_info)
{
if (!dr_info)
return;
pvr_mrt_load_op_state_cleanup(device,
allocator,
dr_info->hw_render.load_op_state);
pvr_destroy_mrt_setup(device, &dr_info->hw_render.eot_setup);
pvr_destroy_mrt_setup(device, &dr_info->hw_render.init_setup);
pvr_destroy_mrt_setup(device, dr_info->mrt_setup);
vk_free2(&device->vk.alloc, allocator, dr_info->hw_render.eot_surfaces);
vk_free2(&device->vk.alloc, allocator, dr_info->hw_render.color_init);
vk_free2(&device->vk.alloc, allocator, dr_info->color_attachments);
}
static VkResult pvr_dynamic_rendering_output_attachments_setup(
struct pvr_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo,
struct pvr_dynamic_render_info *dr_info,
uint32_t *attach_idx)
{
VkFormat attachment_formats[pRenderingInfo->colorAttachmentCount];
uint32_t mrt_attachment_map[pRenderingInfo->colorAttachmentCount];
uint32_t mrt_count = 0, eot_mrt_count = 0, eot_mrt_idx = 0,
eot_surface_idx = 0, color_idx = 0, pbe_emits = 0,
init_setup_idx = 0;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_device *device = cmd_buffer->device;
struct usc_mrt_setup mrt_setup;
VkResult result;
dr_info->color_attachment_count = pRenderingInfo->colorAttachmentCount;
if (dr_info->color_attachment_count) {
const uint32_t size =
sizeof(*dr_info->color_attachments) * dr_info->color_attachment_count;
dr_info->color_attachments = vk_zalloc(&device->vk.alloc,
size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!dr_info->color_attachments) {
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
}
for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
uint32_t cur_attach_idx = *attach_idx;
struct pvr_render_pass_attachment *attachment =
&dr_info->attachments[cur_attach_idx];
const VkRenderingAttachmentInfo *attachment_info =
&pRenderingInfo->pColorAttachments[i];
const struct pvr_image *image;
struct pvr_image_view *iview;
/* Start off clean */
dr_info->color_attachments[i].index_color = VK_ATTACHMENT_UNUSED;
dr_info->color_attachments[i].index_resolve = VK_ATTACHMENT_UNUSED;
attachment_formats[mrt_count] = VK_FORMAT_UNDEFINED;
mrt_attachment_map[i] = VK_ATTACHMENT_UNUSED;
if (attachment_info->imageView)
iview = pvr_image_view_from_handle(attachment_info->imageView);
else if (attachment_info->resolveImageView)
iview = pvr_image_view_from_handle(attachment_info->resolveImageView);
else
continue;
/* We have an output color attachment */
image = pvr_image_view_get_image(iview);
state->render_pass_info.attachments[cur_attach_idx] = iview;
state->render_pass_info.rstate->height =
MIN2(state->render_pass_info.rstate->height, image->vk.extent.height);
state->render_pass_info.rstate->width =
MIN2(state->render_pass_info.rstate->width, image->vk.extent.width);
mrt_attachment_map[i] = mrt_count;
attachment_formats[mrt_count++] = iview->vk.view_format;
attachment->index = cur_attach_idx;
attachment->vk_format = iview->vk.view_format;
attachment->sample_count = image->vk.samples;
attachment->resolve_target = VK_ATTACHMENT_UNUSED;
attachment->load_op = attachment_info->loadOp;
attachment->store_op = attachment_info->storeOp;
attachment->resolve_mode = attachment_info->resolveMode;
attachment->stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
attachment->stencil_store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
attachment->stencil_resolve_mode = VK_RESOLVE_MODE_NONE;
attachment->need_eot = true;
/* TODO: verify logic for is_pbe_downscalable */
attachment->is_pbe_downscalable = pvr_can_pbe_resolve_ds_attachment(
&cmd_buffer->device->pdevice->dev_info,
iview->vk.view_format,
image->vk.samples,
attachment_info->resolveMode);
dr_info->color_attachments[i].index_color = cur_attach_idx;
dr_info->hw_render.sample_count =
MAX2(dr_info->hw_render.sample_count, attachment->sample_count);
*attach_idx += 1;
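      /* The companion resolve attachment occupies the next attachment slot. */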
if (attachment->resolve_mode &&
cur_attach_idx < dr_info->attachment_count) {
const uint32_t resolve_attach_idx = *attach_idx;
struct pvr_render_pass_attachment *resolve_attach =
&dr_info->attachments[resolve_attach_idx];
struct pvr_image_view *resolve_iview =
pvr_image_view_from_handle(attachment_info->resolveImageView);
assert(resolve_iview &&
resolve_attach_idx <= dr_info->attachment_count);
state->render_pass_info.attachments[resolve_attach_idx] =
resolve_iview;
attachment->resolve_target = resolve_attach_idx;
dr_info->color_attachments[i].index_resolve = resolve_attach_idx;
dr_info->hw_render.has_side_effects = true;
dr_info->hw_render.requires_frag_pr = true;
dr_info->hw_render.eot_surface_count++;
resolve_attach->sample_count = image->vk.samples;
resolve_attach->index = resolve_attach_idx;
resolve_attach->vk_format = resolve_iview->vk.view_format;
resolve_attach->load_op = resolve_attach->stencil_load_op =
VK_ATTACHMENT_LOAD_OP_DONT_CARE;
resolve_attach->store_op = resolve_attach->stencil_store_op =
VK_ATTACHMENT_STORE_OP_DONT_CARE;
resolve_attach->resolve_mode = resolve_attach->stencil_resolve_mode =
VK_RESOLVE_MODE_NONE;
resolve_attach->resolve_target = VK_ATTACHMENT_UNUSED;
         /* If resolving with a transfer, then always store the source. */
if (!attachment->is_pbe_downscalable)
attachment->store_op = VK_ATTACHMENT_STORE_OP_STORE;
*attach_idx += 1;
}
if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
VkClearColorValue *clear_value =
&state->render_pass_info.clear_values[cur_attach_idx].color;
*clear_value = attachment_info->clearValue.color;
if (attachment->store_op == VK_ATTACHMENT_STORE_OP_STORE)
dr_info->hw_render.has_side_effects = true;
}
if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR ||
attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
dr_info->hw_render.color_init_count++;
}
if (attachment->store_op == VK_ATTACHMENT_STORE_OP_STORE)
dr_info->hw_render.eot_surface_count++;
else
dr_info->hw_render.requires_frag_pr = true;
if (attachment->store_op == VK_ATTACHMENT_STORE_OP_STORE ||
attachment->resolve_mode) {
eot_mrt_count++;
}
}
if (dr_info->hw_render.color_init_count) {
const uint32_t size = dr_info->hw_render.color_init_count *
sizeof(*dr_info->hw_render.color_init);
dr_info->hw_render.color_init =
vk_zalloc(&device->vk.alloc,
size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!dr_info->hw_render.color_init) {
result = vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_color_attachments;
}
}
if (dr_info->hw_render.eot_surface_count) {
const uint32_t size = dr_info->hw_render.eot_surface_count *
sizeof(*dr_info->hw_render.eot_surfaces);
dr_info->hw_render.eot_surfaces =
vk_zalloc(&device->vk.alloc,
size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!dr_info->hw_render.eot_surfaces) {
result = vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_hw_render_color_init;
}
}
result =
pvr_init_usc_mrt_setup(device, mrt_count, attachment_formats, &mrt_setup);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_free_hw_render_eot_surface;
}
dr_info->hw_render.tile_buffers_count = mrt_setup.num_tile_buffers;
dr_info->hw_render.output_regs_count = mrt_setup.num_output_regs;
result = pvr_mrt_setup_partial_init(cmd_buffer->device,
dr_info->mrt_setup,
pRenderingInfo->colorAttachmentCount,
dr_info->hw_render.output_regs_count,
dr_info->hw_render.tile_buffers_count);
if (result != VK_SUCCESS) {
pvr_destroy_mrt_setup(device, &mrt_setup);
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_free_hw_render_eot_surface;
}
   /* We fully initialised only mrt_count render targets; now scatter them
    * across the full set of attachments. This is necessary because render
    * targets are attachment-indexed throughout the driver.
    */
for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
if (mrt_attachment_map[i] != VK_ATTACHMENT_UNUSED) {
memcpy(&dr_info->mrt_setup->mrt_resources[i],
&mrt_setup.mrt_resources[mrt_attachment_map[i]],
sizeof(mrt_setup.mrt_resources[0]));
}
}
result = pvr_mrt_setup_partial_init(cmd_buffer->device,
&dr_info->hw_render.init_setup,
dr_info->hw_render.color_init_count,
dr_info->hw_render.output_regs_count,
dr_info->hw_render.tile_buffers_count);
if (result != VK_SUCCESS) {
pvr_destroy_mrt_setup(device, &mrt_setup);
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_finish_mrt_setup;
}
result = pvr_mrt_setup_partial_init(cmd_buffer->device,
&dr_info->hw_render.eot_setup,
eot_mrt_count,
dr_info->hw_render.output_regs_count,
dr_info->hw_render.tile_buffers_count);
if (result != VK_SUCCESS) {
pvr_destroy_mrt_setup(device, &mrt_setup);
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_finish_mrt_init_setup;
}
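   /* Walk the attachments to populate the colour-init list and the EOT (end
    * of tile) surface list from the per-attachment MRT resources.
    */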
for (uint32_t i = 0; i < dr_info->attachment_count; i++) {
struct pvr_render_pass_attachment *attachment = &dr_info->attachments[i];
bool need_eot_setup = false;
if (!attachment->need_eot)
continue;
if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR ||
attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
dr_info->hw_render.color_init[init_setup_idx].index = i;
dr_info->hw_render.color_init[init_setup_idx].op = attachment->load_op;
dr_info->hw_render.init_setup.mrt_resources[init_setup_idx] =
mrt_setup.mrt_resources[color_idx];
init_setup_idx++;
}
if (attachment->store_op == VK_ATTACHMENT_STORE_OP_STORE ||
(attachment->resolve_mode && !attachment->is_pbe_downscalable)) {
struct pvr_renderpass_hwsetup_eot_surface *surface =
&dr_info->hw_render.eot_surfaces[eot_surface_idx];
surface->src_attachment_idx = VK_ATTACHMENT_UNUSED;
surface->resolve_type = PVR_RESOLVE_TYPE_INVALID;
surface->mrt_idx = eot_mrt_idx;
surface->attachment_idx = i;
need_eot_setup = true;
eot_surface_idx++;
pbe_emits++;
}
if (attachment->resolve_mode) {
         /* Next surface. */
         struct pvr_renderpass_hwsetup_eot_surface *surface =
            &dr_info->hw_render.eot_surfaces[eot_surface_idx];
surface->src_attachment_idx = i;
surface->mrt_idx = eot_mrt_idx;
surface->need_resolve = true;
surface->attachment_idx = attachment->resolve_target;
if (attachment->store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE &&
attachment->is_pbe_downscalable) {
surface->resolve_type = PVR_RESOLVE_TYPE_PBE;
pbe_emits++;
} else if (!attachment->is_pbe_downscalable) {
surface->resolve_type = PVR_RESOLVE_TYPE_TRANSFER;
} else {
surface->resolve_type = PVR_RESOLVE_TYPE_INVALID;
}
need_eot_setup = true;
eot_surface_idx++;
}
if (need_eot_setup) {
dr_info->hw_render.eot_setup.mrt_resources[eot_mrt_idx++] =
mrt_setup.mrt_resources[color_idx];
}
color_idx++;
}
assert(dr_info->hw_render.eot_surface_count == eot_surface_idx);
assert(pbe_emits <= PVR_MAX_COLOR_ATTACHMENTS);
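   /* Any resolve whose type is still undecided gets a PBE emit if there is
    * room left, otherwise it falls back to a transfer resolve.
    */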
for (uint32_t i = 0; i < eot_surface_idx; i++) {
struct pvr_renderpass_hwsetup_eot_surface *surface =
&dr_info->hw_render.eot_surfaces[i];
struct pvr_render_pass_attachment *attachment;
if (!surface->need_resolve ||
surface->resolve_type != PVR_RESOLVE_TYPE_INVALID) {
continue;
}
assert(surface->src_attachment_idx != VK_ATTACHMENT_UNUSED);
attachment = &dr_info->attachments[surface->src_attachment_idx];
if (attachment->resolve_mode) {
if (pbe_emits < PVR_MAX_COLOR_ATTACHMENTS) {
surface->resolve_type = PVR_RESOLVE_TYPE_PBE;
pbe_emits++;
} else {
surface->resolve_type = PVR_RESOLVE_TYPE_TRANSFER;
}
}
}
dr_info->hw_render.pbe_emits = pbe_emits;
pvr_destroy_mrt_setup(device, &mrt_setup);
return VK_SUCCESS;
err_finish_mrt_init_setup:
pvr_destroy_mrt_setup(device, &dr_info->hw_render.init_setup);
err_finish_mrt_setup:
pvr_destroy_mrt_setup(device, dr_info->mrt_setup);
err_free_hw_render_eot_surface:
vk_free(&device->vk.alloc, dr_info->hw_render.eot_surfaces);
err_free_hw_render_color_init:
vk_free(&device->vk.alloc, dr_info->hw_render.color_init);
err_free_color_attachments:
vk_free(&device->vk.alloc, dr_info->color_attachments);
return result;
}
static void
pvr_dynamic_render_info_destroy(const struct pvr_device *device,
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_dynamic_render_info *dr_info)
{
pvr_dynamic_rendering_output_attachments_cleanup(device,
&device->vk.alloc,
dr_info);
pvr_cmd_buffer_clear_values_free(cmd_buffer);
pvr_cmd_buffer_attachments_free(cmd_buffer);
if (dr_info)
vk_free(&device->vk.alloc, dr_info->attachments);
vk_free(&device->vk.alloc, dr_info);
}
static VkResult
pvr_dynamic_render_info_create(struct pvr_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo,
struct pvr_dynamic_render_info **dr_info_out)
{
const bool has_stencil_resolve_iview =
pRenderingInfo->pStencilAttachment &&
pRenderingInfo->pStencilAttachment->resolveMode &&
pRenderingInfo->pStencilAttachment->resolveImageView;
const bool has_depth_resolve_iview =
pRenderingInfo->pDepthAttachment &&
pRenderingInfo->pDepthAttachment->resolveMode &&
pRenderingInfo->pDepthAttachment->resolveImageView;
const bool has_stencil_iview = pRenderingInfo->pStencilAttachment &&
pRenderingInfo->pStencilAttachment->imageView;
const bool has_depth_iview = pRenderingInfo->pDepthAttachment &&
pRenderingInfo->pDepthAttachment->imageView;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_device *device = cmd_buffer->device;
struct pvr_dynamic_render_info *dr_info;
struct pvr_render_state *rstate;
struct usc_mrt_setup *mrt_setup;
uint32_t attach_idx = 0;
VkResult result;
VK_MULTIALLOC(ma);
vk_multialloc_add(&ma, &dr_info, __typeof__(*dr_info), 1);
vk_multialloc_add(&ma, &mrt_setup, __typeof__(*mrt_setup), 1);
if (!vk_multialloc_zalloc(&ma,
&device->vk.alloc,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
const VkRenderingAttachmentInfo *attachment_info =
&pRenderingInfo->pColorAttachments[i];
if (attachment_info->imageView)
dr_info->attachment_count++;
if (attachment_info->resolveMode && attachment_info->resolveImageView)
dr_info->attachment_count++;
}
dr_info->attachment_count += has_depth_iview || has_stencil_iview;
dr_info->attachment_count += has_depth_resolve_iview ||
has_stencil_resolve_iview;
dr_info->mrt_setup = mrt_setup;
rstate = vk_zalloc(&device->vk.alloc,
sizeof(*rstate),
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!rstate) {
result = vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_dr_info;
}
*rstate = (struct pvr_render_state){
.width = UINT32_MAX,
.height = UINT32_MAX,
.width_alignment = 1,
.height_alignment = 1,
};
state->render_pass_info.rstate = rstate;
if (dr_info->attachment_count) {
dr_info->attachments =
vk_zalloc(&device->vk.alloc,
sizeof(*dr_info->attachments) * dr_info->attachment_count,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!dr_info->attachments) {
result = vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_rstate;
}
result = pvr_cmd_buffer_attachments_alloc(cmd_buffer,
dr_info->attachment_count);
if (result != VK_SUCCESS) {
result = vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_free_dr_info_attachments;
}
result = pvr_cmd_buffer_clear_values_alloc(cmd_buffer,
dr_info->attachment_count);
if (result != VK_SUCCESS) {
result = vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_free_cmd_buffer_attachments;
}
} else {
pvr_cmd_buffer_clear_values_free(cmd_buffer);
pvr_cmd_buffer_attachments_free(cmd_buffer);
}
dr_info->hw_render = (struct pvr_renderpass_hwsetup_render){
.ds_attach_resolve_idx = VK_ATTACHMENT_UNUSED,
.ds_attach_idx = VK_ATTACHMENT_UNUSED,
.depth_init = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
.stencil_init = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
.multiview_enabled = !!pRenderingInfo->viewMask,
.view_mask = MAX2(1, pRenderingInfo->viewMask),
.sample_count = 1,
};
if (has_depth_iview || has_stencil_iview) {
struct pvr_render_pass_attachment *ds_attach =
&dr_info->attachments[attach_idx];
VkClearDepthStencilValue *clear_value;
const struct pvr_image *image;
struct pvr_image_view *iview;
ds_attach->index = attach_idx;
ds_attach->load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
ds_attach->store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
if (has_depth_iview) {
iview = pvr_image_view_from_handle(
pRenderingInfo->pDepthAttachment->imageView);
ds_attach->is_depth = true;
ds_attach->load_op = pRenderingInfo->pDepthAttachment->loadOp;
ds_attach->store_op = pRenderingInfo->pDepthAttachment->storeOp;
ds_attach->resolve_mode =
pRenderingInfo->pDepthAttachment->resolveMode;
}
ds_attach->stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
ds_attach->stencil_store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
if (has_stencil_iview) {
iview = pvr_image_view_from_handle(
pRenderingInfo->pStencilAttachment->imageView);
ds_attach->is_stencil = true;
ds_attach->stencil_load_op =
pRenderingInfo->pStencilAttachment->loadOp;
ds_attach->stencil_store_op =
pRenderingInfo->pStencilAttachment->storeOp;
ds_attach->stencil_resolve_mode =
pRenderingInfo->pStencilAttachment->resolveMode;
}
image = pvr_image_view_get_image(iview);
ds_attach->vk_format = iview->vk.view_format;
ds_attach->sample_count = image->vk.samples;
ds_attach->resolve_target = VK_ATTACHMENT_UNUSED;
state->render_pass_info.attachments[attach_idx] = iview;
state->render_pass_info.rstate->height =
MIN2(state->render_pass_info.rstate->height, image->vk.extent.height);
state->render_pass_info.rstate->width =
MIN2(state->render_pass_info.rstate->width, image->vk.extent.width);
dr_info->hw_render.ds_attach_idx = attach_idx;
dr_info->hw_render.depth_init = ds_attach->load_op;
dr_info->hw_render.depth_store = ds_attach->store_op ==
VK_ATTACHMENT_STORE_OP_STORE;
dr_info->hw_render.depth_resolve_mode = ds_attach->resolve_mode;
dr_info->hw_render.stencil_init = ds_attach->stencil_load_op;
dr_info->hw_render.stencil_store = ds_attach->stencil_store_op ==
VK_ATTACHMENT_STORE_OP_STORE;
dr_info->hw_render.sample_count =
MAX2(dr_info->hw_render.sample_count, ds_attach->sample_count);
dr_info->hw_render.stencil_resolve_mode = ds_attach->stencil_resolve_mode;
clear_value =
&state->render_pass_info.clear_values[attach_idx].depthStencil;
if (ds_attach->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
clear_value->depth =
pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
if (ds_attach->store_op == VK_ATTACHMENT_STORE_OP_STORE)
dr_info->hw_render.has_side_effects = true;
}
if (ds_attach->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
clear_value->stencil =
pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
if (ds_attach->stencil_store_op == VK_ATTACHMENT_STORE_OP_STORE)
dr_info->hw_render.has_side_effects = true;
}
attach_idx++;
if (ds_attach->resolve_mode || ds_attach->stencil_resolve_mode) {
struct pvr_render_pass_attachment *resolve_ds_attach =
&dr_info->attachments[attach_idx];
struct pvr_image_view *resolve_iview;
ds_attach->resolve_target = attach_idx;
if (has_depth_resolve_iview) {
resolve_iview = pvr_image_view_from_handle(
pRenderingInfo->pDepthAttachment->resolveImageView);
} else if (has_stencil_resolve_iview) {
resolve_iview = pvr_image_view_from_handle(
pRenderingInfo->pStencilAttachment->resolveImageView);
} else {
UNREACHABLE("invalid image view for DS resolve attachment");
}
state->render_pass_info.attachments[attach_idx] = resolve_iview;
dr_info->hw_render.ds_attach_resolve_idx = attach_idx;
dr_info->hw_render.has_side_effects = true;
assert(iview->vk.view_format == resolve_iview->vk.view_format);
resolve_ds_attach->vk_format = ds_attach->vk_format;
resolve_ds_attach->sample_count = 1;
resolve_ds_attach->index = attach_idx;
resolve_ds_attach->resolve_target = VK_ATTACHMENT_UNUSED;
resolve_ds_attach->is_stencil = ds_attach->is_stencil;
resolve_ds_attach->store_op = ds_attach->store_op;
resolve_ds_attach->stencil_store_op = ds_attach->stencil_store_op;
resolve_ds_attach->load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
         resolve_ds_attach->stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
attach_idx++;
}
}
result = pvr_dynamic_rendering_output_attachments_setup(cmd_buffer,
pRenderingInfo,
dr_info,
&attach_idx);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_free_cmd_buffer_clear_values;
}
if (state->render_pass_info.rstate->height == UINT32_MAX) {
state->render_pass_info.rstate->height =
pRenderingInfo->renderArea.offset.y +
pRenderingInfo->renderArea.extent.height;
}
if (state->render_pass_info.rstate->width == UINT32_MAX) {
state->render_pass_info.rstate->width =
pRenderingInfo->renderArea.offset.x +
pRenderingInfo->renderArea.extent.width;
}
*dr_info_out = dr_info;
/* FIXME: When adding support for caching rt datasets and render states,
* remove this and handle it with a proper hash table.
*/
pvr_rstate_entry_add(device, rstate);
return VK_SUCCESS;
err_free_cmd_buffer_clear_values:
pvr_cmd_buffer_clear_values_free(cmd_buffer);
err_free_cmd_buffer_attachments:
pvr_cmd_buffer_attachments_free(cmd_buffer);
err_free_dr_info_attachments:
vk_free(&device->vk.alloc, dr_info->attachments);
err_free_rstate:
vk_free(&device->vk.alloc, rstate);
err_free_dr_info:
vk_free(&device->vk.alloc, dr_info);
return result;
}
static inline uint64_t pvr_render_pass_info_get_scratch_buffer_size(
struct pvr_device *device,
const struct pvr_render_pass_info *info)
{
return pvr_spm_scratch_buffer_calc_required_size(
&info->dr_info->hw_render,
1,
info->dr_info->hw_render.sample_count,
info->rstate->width,
info->rstate->height);
}
void PVR_PER_ARCH(CmdBeginRendering)(VkCommandBuffer commandBuffer,
const VkRenderingInfo *pRenderingInfo)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_device *const device = cmd_buffer->device;
struct pvr_dynamic_render_info *dr_info = NULL;
struct pvr_sub_cmd_gfx *sub_cmd;
bool resume, suspend;
VkResult result;
   /* TODO: Check that we are not already in a render pass? */
suspend = pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT_KHR;
resume = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT_KHR;
if (state->current_sub_cmd && resume) {
assert(state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
assert(state->current_sub_cmd->is_dynamic_render);
assert(state->current_sub_cmd->is_suspend);
if (!suspend)
state->current_sub_cmd->is_suspend = false;
return;
}
result =
pvr_dynamic_render_info_create(cmd_buffer, pRenderingInfo, &dr_info);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
return;
}
state->render_pass_info.dr_info = dr_info;
state->render_pass_info.hw_render = &dr_info->hw_render;
state->render_pass_info.render_area = (VkRect2D){
.extent = pRenderingInfo->renderArea.extent,
.offset =
(VkOffset2D){
.x = MAX2(0, pRenderingInfo->renderArea.offset.x),
.y = MAX2(0, pRenderingInfo->renderArea.offset.y),
},
};
state->render_pass_info.pipeline_bind_point =
VK_PIPELINE_BIND_POINT_GRAPHICS;
state->render_pass_info.dynamic_render = true;
state->render_pass_info.suspend = suspend;
state->render_pass_info.resume = resume;
state->render_pass_info.current_hw_subpass = -1;
if (dr_info->hw_render.tile_buffers_count) {
result = pvr_device_tile_buffer_ensure_cap(
device,
dr_info->hw_render.tile_buffers_count);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_destroy_dynamic_render_info;
}
}
if (pRenderingInfo->pDepthAttachment &&
pRenderingInfo->pDepthAttachment->imageView) {
struct pvr_image_view *iview = pvr_image_view_from_handle(
pRenderingInfo->pDepthAttachment->imageView);
state->depth_format = iview->vk.format;
} else if (pRenderingInfo->pStencilAttachment &&
pRenderingInfo->pStencilAttachment->imageView) {
struct pvr_image_view *iview = pvr_image_view_from_handle(
pRenderingInfo->pStencilAttachment->imageView);
state->depth_format = iview->vk.format;
} else {
state->depth_format = VK_FORMAT_UNDEFINED;
}
if (dr_info->color_attachment_count ||
dr_info->hw_render.ds_attach_idx != VK_ATTACHMENT_UNUSED) {
state->render_pass_info.sample_count = dr_info->hw_render.sample_count;
}
state->render_pass_info.rstate->layers =
!pRenderingInfo->viewMask ? pRenderingInfo->layerCount : 1;
state->render_pass_info.rstate->scratch_buffer_size =
pvr_render_pass_info_get_scratch_buffer_size(device,
&state->render_pass_info);
result = pvr_render_state_setup(device,
NULL,
state->render_pass_info.rstate,
1,
&dr_info->hw_render);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_cleanup_tile_buffers;
}
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_cleanup_render_state;
}
sub_cmd = &state->current_sub_cmd->gfx;
sub_cmd->rstate = state->render_pass_info.rstate;
sub_cmd->dr_info = dr_info;
assert(sub_cmd->dr_info);
result = pvr_mrt_load_ops_setup(cmd_buffer,
&cmd_buffer->vk.pool->alloc,
&dr_info->hw_render.load_op_state);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_cleanup_render_state;
}
pvr_perform_start_of_render_clears(cmd_buffer, resume);
return;
err_cleanup_render_state:
pvr_render_state_cleanup(device, state->render_pass_info.rstate);
err_cleanup_tile_buffers:
pvr_device_free_tile_buffer_state(device);
err_destroy_dynamic_render_info:
pvr_dynamic_render_info_destroy(device, cmd_buffer, dr_info);
}
static VkResult
pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_render_pass_info *info);
void PVR_PER_ARCH(CmdEndRendering)(VkCommandBuffer commandBuffer)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
VkResult result;
if (state->current_sub_cmd && state->current_sub_cmd->is_suspend) {
return;
}
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto exit_teardown_render;
}
result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer,
&state->render_pass_info);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto exit_teardown_render;
}
exit_teardown_render:
state->render_pass_info.dynamic_render = false;
pvr_cmd_buffer_clear_values_free(cmd_buffer);
pvr_cmd_buffer_attachments_free(cmd_buffer);
}
static void pvr_cmd_buffer_state_from_dynamic_inheritance(
struct pvr_cmd_buffer *cmd_buffer,
const VkCommandBufferInheritanceRenderingInfo *inheritance_info)
{
const bool has_stencil = inheritance_info->stencilAttachmentFormat !=
VK_FORMAT_UNDEFINED;
const bool has_depth = inheritance_info->depthAttachmentFormat !=
VK_FORMAT_UNDEFINED;
VkFormat attachment_formats[inheritance_info->colorAttachmentCount];
uint32_t mrt_attachment_map[inheritance_info->colorAttachmentCount];
struct pvr_device *const device = cmd_buffer->device;
struct pvr_dynamic_render_info *dr_info;
uint32_t attach_idx = 0, mrt_count = 0;
struct usc_mrt_setup mrt_setup, *setup;
VkResult result;
VK_MULTIALLOC(ma);
vk_multialloc_add(&ma, &dr_info, __typeof__(*dr_info), 1);
vk_multialloc_add(&ma, &setup, __typeof__(*setup), 1);
   if (!vk_multialloc_zalloc(&ma,
                             &device->vk.alloc,
                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }
dr_info->hw_render.sample_count = inheritance_info->rasterizationSamples;
dr_info->hw_render.multiview_enabled = !!inheritance_info->viewMask;
dr_info->hw_render.view_mask = MAX2(1, inheritance_info->viewMask);
dr_info->attachment_count = has_depth || has_stencil;
dr_info->mrt_setup = setup;
for (uint32_t i = 0; i < inheritance_info->colorAttachmentCount; i++)
if (inheritance_info->pColorAttachmentFormats[i] != VK_FORMAT_UNDEFINED)
dr_info->attachment_count++;
if (dr_info->attachment_count) {
dr_info->attachments =
vk_zalloc(&cmd_buffer->vk.pool->alloc,
sizeof(*dr_info->attachments) * dr_info->attachment_count,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!dr_info->attachments) {
vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_dr_info;
}
}
dr_info->color_attachment_count = inheritance_info->colorAttachmentCount;
if (dr_info->color_attachment_count) {
dr_info->color_attachments = vk_zalloc(
&device->vk.alloc,
sizeof(*dr_info->color_attachments) * dr_info->color_attachment_count,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!dr_info->color_attachments) {
vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_free_attachments;
}
}
assert(inheritance_info->stencilAttachmentFormat == VK_FORMAT_UNDEFINED ||
inheritance_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ||
inheritance_info->depthAttachmentFormat ==
inheritance_info->stencilAttachmentFormat);
dr_info->hw_render.ds_attach_idx = VK_ATTACHMENT_UNUSED;
if (has_depth || has_stencil) {
struct pvr_render_pass_attachment *ds_attachment =
&dr_info->attachments[attach_idx];
const VkFormat vk_format = has_depth
? inheritance_info->depthAttachmentFormat
: inheritance_info->stencilAttachmentFormat;
dr_info->hw_render.ds_attach_idx = attach_idx;
dr_info->hw_render.stencil_store = has_stencil;
dr_info->hw_render.depth_store = has_depth;
ds_attachment->index = attach_idx;
ds_attachment->vk_format = vk_format;
ds_attachment->is_depth = dr_info->hw_render.depth_store;
ds_attachment->is_stencil = dr_info->hw_render.stencil_store;
ds_attachment->sample_count = inheritance_info->rasterizationSamples;
ds_attachment->resolve_target = VK_ATTACHMENT_UNUSED;
ds_attachment->load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
ds_attachment->store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
ds_attachment->stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
ds_attachment->stencil_store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
attach_idx++;
}
for (uint32_t i = 0; i < inheritance_info->colorAttachmentCount; i++) {
struct pvr_render_pass_attachment *attachment;
dr_info->color_attachments[i].index_color = VK_ATTACHMENT_UNUSED;
dr_info->color_attachments[i].index_resolve = VK_ATTACHMENT_UNUSED;
attachment_formats[mrt_count] = VK_FORMAT_UNDEFINED;
mrt_attachment_map[i] = VK_ATTACHMENT_UNUSED;
if (inheritance_info->pColorAttachmentFormats[i] == VK_FORMAT_UNDEFINED)
continue;
dr_info->color_attachments[i].index_color = attach_idx;
mrt_attachment_map[i] = mrt_count;
attachment_formats[mrt_count++] =
inheritance_info->pColorAttachmentFormats[i];
attachment = &dr_info->attachments[attach_idx];
attachment->index = attach_idx;
attachment->vk_format = inheritance_info->pColorAttachmentFormats[i];
attachment->sample_count = inheritance_info->rasterizationSamples;
attachment->resolve_target = VK_ATTACHMENT_UNUSED;
attachment->load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
attachment->store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
attachment->stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
attachment->stencil_store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
attachment->stencil_resolve_mode = VK_RESOLVE_MODE_NONE;
attach_idx++;
}
result =
pvr_init_usc_mrt_setup(device, mrt_count, attachment_formats, &mrt_setup);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_free_attachments;
}
result = pvr_mrt_setup_partial_init(cmd_buffer->device,
dr_info->mrt_setup,
inheritance_info->colorAttachmentCount,
mrt_setup.num_output_regs,
mrt_setup.num_tile_buffers);
if (result != VK_SUCCESS) {
pvr_destroy_mrt_setup(device, &mrt_setup);
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_free_attachments;
}
   /* We fully initialised only mrt_count render targets; now scatter them
    * across the full set of attachments. This is necessary because render
    * targets are attachment-indexed throughout the driver.
    */
for (uint32_t i = 0; i < inheritance_info->colorAttachmentCount; i++) {
if (mrt_attachment_map[i] != VK_ATTACHMENT_UNUSED) {
memcpy(&dr_info->mrt_setup->mrt_resources[i],
&mrt_setup.mrt_resources[mrt_attachment_map[i]],
sizeof(mrt_setup.mrt_resources[0]));
}
}
pvr_destroy_mrt_setup(device, &mrt_setup);
if (dr_info->mrt_setup->num_tile_buffers) {
result = pvr_device_tile_buffer_ensure_cap(
device,
dr_info->mrt_setup->num_tile_buffers);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd_buffer->vk, result);
goto err_destroy_mrt_setup;
}
}
cmd_buffer->state.render_pass_info.hw_render = &dr_info->hw_render;
cmd_buffer->state.render_pass_info.dr_info = dr_info;
   /* For depth_format, check stencil first and let depth override it. */
if (has_stencil)
cmd_buffer->state.depth_format =
inheritance_info->stencilAttachmentFormat;
if (has_depth)
cmd_buffer->state.depth_format = inheritance_info->depthAttachmentFormat;
return;
err_destroy_mrt_setup:
pvr_destroy_mrt_setup(device, dr_info->mrt_setup);
err_free_attachments:
vk_free(&device->vk.alloc, dr_info->attachments);
err_free_dr_info:
vk_free(&device->vk.alloc, dr_info);
}
static inline void pvr_cmd_buffer_state_from_render_pass_inheritance(
struct pvr_cmd_buffer_state *state,
const VkCommandBufferInheritanceInfo *inheritance_info)
{
struct pvr_render_pass *pass =
pvr_render_pass_from_handle(inheritance_info->renderPass);
state->render_pass_info.pass = pass;
state->render_pass_info.framebuffer =
pvr_framebuffer_from_handle(inheritance_info->framebuffer);
state->render_pass_info.subpass_idx = inheritance_info->subpass;
state->render_pass_info.isp_userpass =
pass->subpasses[inheritance_info->subpass].isp_userpass;
}
VkResult
PVR_PER_ARCH(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
const VkCommandBufferBeginInfo *pBeginInfo)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state;
VkResult result;
vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
cmd_buffer->usage_flags = pBeginInfo->flags;
state = &cmd_buffer->state;
/* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
* primary level command buffers.
*
* From the Vulkan 1.0 spec:
*
* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
* secondary command buffer is considered to be entirely inside a render
* pass. If this is a primary command buffer, then this bit is ignored.
*/
if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
cmd_buffer->usage_flags &=
~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
}
if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
if (cmd_buffer->usage_flags &
VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
const VkCommandBufferInheritanceRenderingInfo *dyn_inheritance_info =
vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
if (!pBeginInfo->pInheritanceInfo->renderPass) {
assert(dyn_inheritance_info);
pvr_cmd_buffer_state_from_dynamic_inheritance(cmd_buffer,
dyn_inheritance_info);
} else {
pvr_cmd_buffer_state_from_render_pass_inheritance(
state,
pBeginInfo->pInheritanceInfo);
}
result =
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
if (result != VK_SUCCESS)
return result;
state->vis_test_enabled =
pBeginInfo->pInheritanceInfo->occlusionQueryEnable;
if (!pBeginInfo->pInheritanceInfo->renderPass)
state->current_sub_cmd->is_dynamic_render = true;
}
state->dirty.isp_userpass = true;
}
state->query_indices = UTIL_DYNARRAY_INIT;
memset(state->barriers_needed,
0xFF,
sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
return VK_SUCCESS;
}
VkResult
PVR_PER_ARCH(cmd_buffer_add_transfer_cmd)(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_transfer_cmd *transfer_cmd)
{
struct pvr_sub_cmd_transfer *sub_cmd;
VkResult result;
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
if (result != VK_SUCCESS)
return result;
sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
list_addtail(&transfer_cmd->link, sub_cmd->transfer_cmds);
return VK_SUCCESS;
}
static VkResult
pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_graphics_pipeline *const gfx_pipeline)
{
const struct pvr_vertex_shader_state *const vertex_state =
&gfx_pipeline->shader_state.vertex;
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const struct pvr_pds_info *const pds_info = state->pds_shader.info;
struct pvr_suballoc_bo *pvr_bo;
const uint8_t *entries;
uint32_t *dword_buffer;
uint64_t *qword_buffer;
VkResult result;
result =
pvr_cmd_buffer_alloc_mem(cmd_buffer,
cmd_buffer->device->heaps.pds_heap,
PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
&pvr_bo);
if (result != VK_SUCCESS)
return result;
dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
entries = (uint8_t *)pds_info->entries;
for (uint32_t i = 0; i < pds_info->entry_count; i++) {
const struct pvr_const_map_entry *const entry_header =
(struct pvr_const_map_entry *)entries;
switch (entry_header->type) {
case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
const struct pvr_const_map_entry_literal32 *const literal =
(struct pvr_const_map_entry_literal32 *)entries;
PVR_WRITE(dword_buffer,
literal->literal_value,
literal->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*literal);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
const struct pvr_const_map_entry_doutu_address *const doutu_addr =
(struct pvr_const_map_entry_doutu_address *)entries;
const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
const pvr_dev_addr_t exec_addr =
PVR_DEV_ADDR_OFFSET(vertex_state->shader_bo->dev_addr,
vs_data->common.entry_offset);
uint64_t addr = 0ULL;
pvr_set_usc_execution_address64(&addr, exec_addr.addr);
PVR_WRITE(qword_buffer,
addr | doutu_addr->doutu_control,
doutu_addr->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*doutu_addr);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
const struct pvr_const_map_entry_base_instance *const base_instance =
(struct pvr_const_map_entry_base_instance *)entries;
PVR_WRITE(dword_buffer,
state->draw_state.base_instance,
base_instance->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*base_instance);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_VERTEX: {
const struct pvr_const_map_entry_base_instance *const base_instance =
(struct pvr_const_map_entry_base_instance *)entries;
PVR_WRITE(dword_buffer,
state->draw_state.base_vertex,
base_instance->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*base_instance);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
const struct pvr_const_map_entry_vertex_attribute_address
*const attribute =
(struct pvr_const_map_entry_vertex_attribute_address *)entries;
const struct pvr_vertex_binding *const binding =
&state->vertex_bindings[attribute->binding_index];
/* In relation to the Vulkan spec. 22.4. Vertex Input Address
* Calculation:
* Adding binding->offset corresponds to calculating the
* `bufferBindingAddress`. Adding attribute->offset corresponds to
* adding the `attribDesc.offset`. The `effectiveVertexOffset` is
          * taken care of by the PDS program itself with a DDMAD which will
* multiply the vertex/instance idx with the binding's stride and
* add that to the address provided here.
*/
const pvr_dev_addr_t addr =
PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
binding->offset + attribute->offset);
PVR_WRITE(qword_buffer,
addr.addr,
attribute->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*attribute);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_ROBUST_VERTEX_ATTRIBUTE_ADDRESS: {
const struct pvr_const_map_entry_robust_vertex_attribute_address
*const attribute =
(struct pvr_const_map_entry_robust_vertex_attribute_address *)
entries;
const struct pvr_vertex_binding *const binding =
&state->vertex_bindings[attribute->binding_index];
pvr_dev_addr_t addr;
if (binding->size <
(attribute->offset + attribute->component_size_in_bytes)) {
            /* Replace with a load from the robustness buffer when the
             * attribute is out of range.
             */
addr = PVR_DEV_ADDR_OFFSET(
cmd_buffer->device->robustness_buffer->vma->dev_addr,
attribute->robustness_buffer_offset);
} else {
addr = PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
binding->offset + attribute->offset);
}
PVR_WRITE(qword_buffer,
addr.addr,
attribute->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*attribute);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTR_DDMADT_OOB_BUFFER_SIZE: {
const struct pvr_pds_const_map_entry_vertex_attr_ddmadt_oob_buffer_size
*ddmadt_src3 =
(struct pvr_pds_const_map_entry_vertex_attr_ddmadt_oob_buffer_size
*)entries;
const struct pvr_vertex_binding *const binding =
&state->vertex_bindings[ddmadt_src3->binding_index];
const struct vk_dynamic_graphics_state *dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
uint32_t stride =
dynamic_state->vi_binding_strides[ddmadt_src3->binding_index];
uint32_t bound_size = binding->buffer->vk.size - binding->offset;
uint64_t control_qword;
uint32_t control_dword;
assert(PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
pds_ddmadt));
if (stride) {
bound_size -= bound_size % stride;
if (bound_size == 0) {
/* If size is zero, DMA OOB won't execute. Read will come from
* robustness buffer.
*/
bound_size = stride;
}
}
pvr_csb_pack (&control_qword, PDSINST_DDMAD_FIELDS_SRC3, src3) {
src3.test = true;
src3.msize = bound_size;
}
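         /* Only the upper dword of the packed SRC3 qword is written to the
          * PDS data section.
          */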
control_dword = (uint32_t)(control_qword >> 32);
PVR_WRITE(dword_buffer,
control_dword,
ddmadt_src3->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*ddmadt_src3);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX: {
const struct pvr_const_map_entry_vertex_attribute_max_index *attribute =
(struct pvr_const_map_entry_vertex_attribute_max_index *)entries;
const struct pvr_vertex_binding *const binding =
&state->vertex_bindings[attribute->binding_index];
const uint64_t bound_size = binding->buffer->vk.size - binding->offset;
const uint32_t attribute_end =
attribute->offset + attribute->component_size_in_bytes;
const struct vk_dynamic_graphics_state *dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const uint32_t stride =
dynamic_state->vi_binding_strides[attribute->binding_index];
uint32_t max_index;
/* If the stride is 0 then all attributes use the same single
* element from the binding so the index can only be up to 0.
*/
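         /* e.g. bound_size = 100, stride = 16, attribute_end = 12:
          * 100 / 16 - 1 = 5, and the trailing 4 bytes cannot hold another
          * attribute, so max_index = 5.
          */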
if (bound_size < attribute_end || stride == 0) {
max_index = 0;
} else {
max_index = (uint32_t)(bound_size / stride) - 1;
            /* One more attribute may still fit in the trailing partial
             * stride.
             */
if (bound_size % stride >= attribute_end)
max_index++;
}
PVR_WRITE(dword_buffer,
max_index,
attribute->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*attribute);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_STRIDE: {
const struct pvr_const_map_entry_vertex_attribute_stride *attribute =
(const struct pvr_const_map_entry_vertex_attribute_stride *)entries;
const struct vk_dynamic_graphics_state *dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const uint32_t stride =
dynamic_state->vi_binding_strides[attribute->binding_index];
PVR_WRITE(dword_buffer,
stride,
attribute->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*attribute);
break;
}
default:
UNREACHABLE("Unsupported data section map");
break;
}
}
state->pds_vertex_attrib_offset =
pvr_bo->dev_addr.addr -
cmd_buffer->device->heaps.pds_heap->base_addr.addr;
return VK_SUCCESS;
}
static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer,
enum pvr_stage_allocation stage);
static VkResult pvr_setup_descriptor_mappings(
struct pvr_cmd_buffer *const cmd_buffer,
enum pvr_stage_allocation stage,
const struct pvr_stage_allocation_descriptor_state *descriptor_state,
const pvr_dev_addr_t *const num_worgroups_buff_addr,
uint32_t *const descriptor_data_offset_out)
{
const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
const struct pvr_descriptor_state *desc_state;
const pco_data *data;
struct pvr_suballoc_bo *pvr_bo;
const uint8_t *entries;
uint32_t *dword_buffer;
uint64_t *qword_buffer;
VkResult result;
if (!pds_info->data_size_in_dwords)
return VK_SUCCESS;
result =
pvr_cmd_buffer_alloc_mem(cmd_buffer,
cmd_buffer->device->heaps.pds_heap,
PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
&pvr_bo);
if (result != VK_SUCCESS)
return result;
dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
entries = (uint8_t *)pds_info->entries;
switch (stage) {
case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
desc_state = &cmd_buffer->state.gfx_desc_state;
data = &cmd_buffer->state.gfx_pipeline->vs_data;
break;
case PVR_STAGE_ALLOCATION_FRAGMENT:
desc_state = &cmd_buffer->state.gfx_desc_state;
data = &cmd_buffer->state.gfx_pipeline->fs_data;
break;
case PVR_STAGE_ALLOCATION_COMPUTE:
desc_state = &cmd_buffer->state.compute_desc_state;
data = &cmd_buffer->state.compute_pipeline->cs_data;
break;
default:
UNREACHABLE("Unsupported stage.");
break;
}
for (uint32_t i = 0; i < pds_info->entry_count; i++) {
const struct pvr_const_map_entry *const entry_header =
(struct pvr_const_map_entry *)entries;
switch (entry_header->type) {
case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
const struct pvr_const_map_entry_literal32 *const literal =
(struct pvr_const_map_entry_literal32 *)entries;
PVR_WRITE(dword_buffer,
literal->literal_value,
literal->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*literal);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
(struct pvr_const_map_entry_descriptor_set *)entries;
const uint32_t desc_set = desc_set_entry->descriptor_set;
const struct pvr_descriptor_set *descriptor_set;
pvr_dev_addr_t desc_set_addr;
assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
descriptor_set = desc_state->sets[desc_set];
assert(descriptor_set);
desc_set_addr = descriptor_set->dev_addr;
PVR_WRITE(qword_buffer,
desc_set_addr.addr,
desc_set_entry->const_offset,
pds_info->data_size_in_dwords);
entries += sizeof(*desc_set_entry);
break;
}
case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
const struct pvr_const_map_entry_special_buffer *special_buff_entry =
(struct pvr_const_map_entry_special_buffer *)entries;
switch (special_buff_entry->buffer_type) {
case PVR_BUFFER_TYPE_DYNAMIC: {
unsigned desc_set = special_buff_entry->data;
const struct pvr_descriptor_set *descriptor_set;
struct pvr_suballoc_bo *dynamic_desc_bo;
assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
descriptor_set = desc_state->sets[desc_set];
assert(descriptor_set);
result = pvr_cmd_buffer_upload_general(
cmd_buffer,
descriptor_set->dynamic_buffers,
special_buff_entry->size_in_dwords * sizeof(uint32_t),
&dynamic_desc_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
dynamic_desc_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_PUSH_CONSTS: {
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
/* Handle running with undefined push constants. */
if (!state->push_consts[stage].dev_addr.addr) {
state->push_consts[stage].dirty = true;
assert(!state->push_consts[stage].bytes_updated);
state->push_consts[stage].bytes_updated =
sizeof(state->push_consts[stage].data);
result = pvr_cmd_upload_push_consts(cmd_buffer, stage);
/* Reset. */
state->push_consts[stage].bytes_updated = 0;
if (result != VK_SUCCESS)
return result;
}
PVR_WRITE(qword_buffer,
state->push_consts[stage].dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_BLEND_CONSTS: {
const struct vk_color_blend_state *cb =
&cmd_buffer->vk.dynamic_graphics_state.cb;
struct pvr_suballoc_bo *blend_consts_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
cb->blend_constants,
sizeof(cb->blend_constants),
&blend_consts_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
blend_consts_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_POINT_SAMPLER: {
uint64_t point_sampler_words[ROGUE_NUM_TEXSTATE_SAMPLER_WORDS];
pvr_csb_pack (&point_sampler_words[0],
TEXSTATE_SAMPLER_WORD0,
sampler) {
sampler.addrmode_u = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_BORDER;
sampler.addrmode_v = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_BORDER;
sampler.addrmode_w = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_BORDER;
sampler.dadjust = ROGUE_TEXSTATE_DADJUST_ZERO_UINT;
sampler.minfilter = ROGUE_TEXSTATE_FILTER_POINT;
sampler.magfilter = ROGUE_TEXSTATE_FILTER_POINT;
sampler.maxlod = ROGUE_TEXSTATE_CLAMP_MAX;
sampler.anisoctl = ROGUE_TEXSTATE_ANISOCTL_DISABLED;
}
pvr_csb_pack (&point_sampler_words[1],
TEXSTATE_SAMPLER_WORD1,
sampler) {}
struct pvr_suballoc_bo *point_sampler_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
point_sampler_words,
sizeof(point_sampler_words),
&point_sampler_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
point_sampler_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_IA_SAMPLER: {
uint64_t ia_sampler_words[ROGUE_NUM_TEXSTATE_SAMPLER_WORDS];
pvr_csb_pack (&ia_sampler_words[0],
TEXSTATE_SAMPLER_WORD0,
sampler) {
sampler.addrmode_u = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
sampler.addrmode_v = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
sampler.addrmode_w = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
sampler.dadjust = ROGUE_TEXSTATE_DADJUST_ZERO_UINT;
sampler.magfilter = ROGUE_TEXSTATE_FILTER_POINT;
sampler.minfilter = ROGUE_TEXSTATE_FILTER_POINT;
sampler.anisoctl = ROGUE_TEXSTATE_ANISOCTL_DISABLED;
sampler.non_normalized_coords = true;
}
pvr_csb_pack (&ia_sampler_words[1],
TEXSTATE_SAMPLER_WORD1,
sampler) {}
struct pvr_suballoc_bo *ia_sampler_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
ia_sampler_words,
sizeof(ia_sampler_words),
&ia_sampler_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
ia_sampler_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_FRONT_FACE_OP: {
VkFrontFace front_face =
cmd_buffer->vk.dynamic_graphics_state.rs.front_face;
VkPrimitiveTopology primitive_topology =
cmd_buffer->vk.dynamic_graphics_state.ia.primitive_topology;
uint32_t ff_op;
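            /* Points and lines have no winding order, so force the
             * front-face result to true.
             */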
switch (primitive_topology) {
case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
ff_op = PCO_FRONT_FACE_OP_TRUE;
break;
default:
ff_op = front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
? PCO_FRONT_FACE_OP_SWAP
: PCO_FRONT_FACE_OP_NOP;
break;
}
struct pvr_suballoc_bo *ff_op_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
&ff_op,
sizeof(ff_op),
&ff_op_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
ff_op_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_FS_META: {
uint32_t fs_meta = 0;
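            /* Packed FS metadata word, as assembled below: bit 0 is
             * alpha-to-one, colour write enables start at bit 1, the sample
             * mask starts at bit 9, and bit 25 is alpha-to-coverage.
             */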
if (cmd_buffer->vk.dynamic_graphics_state.ms.alpha_to_one_enable)
fs_meta |= (1 << 0);
fs_meta |= cmd_buffer->vk.dynamic_graphics_state.ms.sample_mask
<< 9;
fs_meta |=
cmd_buffer->vk.dynamic_graphics_state.cb.color_write_enables
<< 1;
if (cmd_buffer->vk.dynamic_graphics_state.ms
.alpha_to_coverage_enable)
fs_meta |= (1 << 25);
struct pvr_suballoc_bo *fs_meta_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
&fs_meta,
sizeof(fs_meta),
&fs_meta_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
fs_meta_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_TILE_BUFFERS: {
const struct pvr_device_tile_buffer_state *tile_buffer_state =
&cmd_buffer->device->tile_buffer_state;
const struct pvr_graphics_pipeline *const gfx_pipeline =
cmd_buffer->state.gfx_pipeline;
const pco_data *const fs_data = &gfx_pipeline->fs_data;
unsigned num_tile_buffers =
fs_data->fs.tile_buffers.count / fs_data->fs.tile_buffers.stride;
uint32_t tile_buffer_addrs[PVR_MAX_TILE_BUFFER_COUNT * 2];
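            /* Split each 64-bit tile buffer address into low/high dwords. */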
for (unsigned u = 0; u < num_tile_buffers; ++u) {
assert(u < tile_buffer_state->buffer_count);
uint64_t tile_buffer_addr =
tile_buffer_state->buffers[u]->vma->dev_addr.addr;
tile_buffer_addrs[2 * u] = tile_buffer_addr & 0xffffffff;
tile_buffer_addrs[2 * u + 1] = tile_buffer_addr >> 32;
}
struct pvr_suballoc_bo *tile_buffer_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
&tile_buffer_addrs,
num_tile_buffers *
sizeof(uint64_t),
&tile_buffer_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
tile_buffer_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_SPILL_INFO: {
unsigned spill_block_size =
data->common.spilled_temps * sizeof(uint32_t);
spill_block_size = spill_block_size ? spill_block_size
: sizeof(uint32_t);
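            /* Size the spill buffer with the same 2048-instance factor used
             * for the scratch buffer below (see the TODO there).
             */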
struct pvr_suballoc_bo *spill_buffer_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
NULL,
spill_block_size * 2048,
&spill_buffer_bo);
if (result != VK_SUCCESS)
return result;
uint32_t spill_info[3] = {
[0] = spill_buffer_bo->dev_addr.addr & 0xffffffff,
[1] = spill_buffer_bo->dev_addr.addr >> 32,
[2] = spill_block_size,
};
struct pvr_suballoc_bo *spill_info_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
spill_info,
sizeof(spill_info),
&spill_info_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
spill_info_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_SCRATCH_INFO: {
assert(data->common.scratch);
unsigned scratch_block_size = data->common.scratch;
/* TODO: 2048 is to account for each instance... do this
* programmatically!
*/
struct pvr_suballoc_bo *scratch_buffer_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
NULL,
scratch_block_size * 2048,
&scratch_buffer_bo);
if (result != VK_SUCCESS)
return result;
uint32_t scratch_info[3] = {
[0] = scratch_buffer_bo->dev_addr.addr & 0xffffffff,
[1] = scratch_buffer_bo->dev_addr.addr >> 32,
[2] = scratch_block_size,
};
struct pvr_suballoc_bo *scratch_info_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
scratch_info,
sizeof(scratch_info),
&scratch_info_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
scratch_info_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
case PVR_BUFFER_TYPE_SAMPLE_LOCATIONS: {
/* Standard sample locations. */
uint32_t packed_sample_locations[] = {
0x000044cc,
0xeaa26e26,
0x359db759,
0x1ffb71d3,
};
struct pvr_suballoc_bo *sample_locations_bo;
result =
pvr_cmd_buffer_upload_general(cmd_buffer,
&packed_sample_locations,
sizeof(packed_sample_locations),
&sample_locations_bo);
if (result != VK_SUCCESS)
return result;
PVR_WRITE(qword_buffer,
sample_locations_bo->dev_addr.addr,
special_buff_entry->const_offset,
pds_info->data_size_in_dwords);
break;
}
default:
UNREACHABLE("Unsupported special buffer type.");
}
entries += sizeof(*special_buff_entry);
break;
}
default:
UNREACHABLE("Unsupported map entry type.");
}
}
*descriptor_data_offset_out =
pvr_bo->dev_addr.addr -
cmd_buffer->device->heaps.pds_heap->base_addr.addr;
return VK_SUCCESS;
}
static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd)
{
const struct pvr_device *device = cmd_buffer->device;
const struct pvr_physical_device *pdevice = device->pdevice;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_csb *csb = &sub_cmd->control_stream;
const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
const uint32_t const_shared_regs = pipeline->cs_data.common.shareds;
struct pvr_compute_kernel_info info;
/* No shared regs, no need to use an allocation kernel. */
if (!const_shared_regs)
return;
/* Accumulate the MAX number of shared registers across the kernels in this
* dispatch. This is used by the FW for context switching, so must be large
* enough to contain all the shared registers that might be in use for this
* compute job. Coefficients don't need to be included as the context switch
* will not happen within the execution of a single workgroup, thus nothing
* needs to be preserved.
*/
state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
info = (struct pvr_compute_kernel_info){
.indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
.sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
.usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
.usc_common_shared = true,
.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(const_shared_regs),
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
.global_size = { 1, 1, 1 },
.local_size = { 1, 1, 1 },
};
/* Sometimes we don't have a secondary program if there were no constants to
* write, but we still need to run a PDS program to accomplish the
* allocation of the local/common store shared registers. Use the
* pre-uploaded empty PDS program in this instance.
*/
if (pipeline->descriptor_state.pds_info.code_size_in_dwords) {
uint32_t pds_data_size_in_dwords =
pipeline->descriptor_state.pds_info.data_size_in_dwords;
info.pds_data_offset = state->pds_compute_descriptor_data_offset;
info.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_data_size_in_dwords),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE);
      /* Check that we have uploaded the code section. */
assert(pipeline->descriptor_state.pds_code.code_size);
info.pds_code_offset = pipeline->descriptor_state.pds_code.code_offset;
} else {
const struct pvr_pds_upload *program = &device->pds_compute_empty_program;
info.pds_data_offset = program->data_offset;
info.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE);
info.pds_code_offset = program->code_offset;
}
/* We don't need to pad the workgroup size. */
info.max_instances =
pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
void PVR_PER_ARCH(compute_update_shared_private)(
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd,
struct pvr_private_compute_pipeline *pipeline)
{
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
const uint32_t const_shared_regs = pipeline->const_shared_regs_count;
struct pvr_csb *csb = &sub_cmd->control_stream;
struct pvr_compute_kernel_info info;
/* No shared regs, no need to use an allocation kernel. */
if (!const_shared_regs)
return;
/* See comment in pvr_compute_update_shared() for details on this. */
state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
info = (struct pvr_compute_kernel_info){
.indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(const_shared_regs),
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_shared_update_data_size_dw),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
.usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
.pds_data_offset = pipeline->pds_shared_update_data_offset,
.pds_code_offset = pipeline->pds_shared_update_code_offset,
.sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
.usc_common_shared = true,
.global_size = { 1, 1, 1 },
.local_size = { 1, 1, 1 },
};
/* We don't need to pad the workgroup size. */
info.max_instances =
pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
static uint32_t
pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
uint32_t workgroup_size,
uint32_t coeff_regs_count)
{
const struct pvr_device_runtime_info *dev_runtime_info =
&pdevice->dev_runtime_info;
const struct pvr_device_info *dev_info = &pdevice->dev_info;
uint32_t max_avail_coeff_regs =
dev_runtime_info->cdm_max_local_mem_size_regs;
uint32_t coeff_regs_count_aligned =
ALIGN_POT(coeff_regs_count,
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE >> 2U);
   /* If the work group size is > ROGUE_MAX_INSTANCES_PER_TASK, we *always* pad
    * the work group size up to the next multiple of
    * ROGUE_MAX_INSTANCES_PER_TASK.
    *
    * Likewise, if we use more than 1/8th of the maximum available coefficient
    * registers, we round the work group size up to the next multiple of
    * ROGUE_MAX_INSTANCES_PER_TASK.
    */
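   /* E.g. assuming ROGUE_MAX_INSTANCES_PER_TASK is 32, a 48-invocation work
    * group that takes this path gets padded up to 64.
    */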
/* TODO: See if this can be optimized. */
if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
}
return workgroup_size;
}
void PVR_PER_ARCH(compute_update_kernel_private)(
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd,
struct pvr_private_compute_pipeline *pipeline,
const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
struct pvr_csb *csb = &sub_cmd->control_stream;
struct pvr_compute_kernel_info info = {
.indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
.usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
.pds_temp_size =
DIV_ROUND_UP(pipeline->pds_temps_used << 2U,
ROGUE_CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE),
.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_data_size_dw),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
.pds_data_offset = pipeline->pds_data_offset,
.pds_code_offset = pipeline->pds_code_offset,
.sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
.usc_unified_size =
DIV_ROUND_UP(pipeline->unified_store_regs_count << 2U,
ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE),
/* clang-format off */
.global_size = {
global_workgroup_size[0],
global_workgroup_size[1],
global_workgroup_size[2]
},
/* clang-format on */
};
uint32_t work_size = pipeline->workgroup_size.width *
pipeline->workgroup_size.height *
pipeline->workgroup_size.depth;
uint32_t coeff_regs =
pipeline->coeff_regs_count + pipeline->const_shared_regs_count;
info.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
/* Use a whole slot per workgroup. */
work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
if (pipeline->const_shared_regs_count > 0)
info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
work_size =
pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
info.local_size[0] = work_size;
info.local_size[1] = 1U;
info.local_size[2] = 1U;
info.max_instances =
pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
static void pvr_compute_update_kernel(
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd,
pvr_dev_addr_t indirect_addr,
const uint32_t global_base_group[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_csb *csb = &sub_cmd->control_stream;
const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
const pco_data *const cs_data = &pipeline->cs_data;
const struct pvr_pds_info *program_info = &pipeline->pds_cs_program_info;
bool uses_wg_id = pipeline->base_workgroup_data_patching_offset != ~0u;
bool uses_num_wgs = pipeline->num_workgroups_data_patching_offset != ~0u;
bool base_group_set = !!global_base_group[0] || !!global_base_group[1] ||
!!global_base_group[2];
uint32_t pds_data_offset = pipeline->pds_cs_program.data_offset;
/* Does the PDS data segment need patching, or can the default be used? */
if ((uses_wg_id && base_group_set) || uses_num_wgs) {
struct pvr_pds_upload pds_data_upload;
uint32_t *pds_data;
/* Upload and patch PDS data segment. */
pvr_cmd_buffer_upload_pds_data(cmd_buffer,
pipeline->pds_cs_data_section,
program_info->data_size_in_dwords,
16,
&pds_data_upload);
pds_data_offset = pds_data_upload.data_offset;
pds_data = pvr_bo_suballoc_get_map_addr(pds_data_upload.pvr_bo);
if (uses_wg_id && base_group_set) {
unsigned offset = pipeline->base_workgroup_data_patching_offset;
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
pds_data[offset + u] = global_base_group[u];
}
}
if (uses_num_wgs) {
if (indirect_addr.addr) {
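            /* Patch in the indirect buffer address as the DMA source and
             * update the DOUTD burst size field, presumably so the PDS
             * program fetches the workgroup counts from the indirect buffer.
             */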
unsigned offset =
pipeline->num_workgroups_indirect_src_patching_offset;
uint64_t *pds_data64 =
pvr_bo_suballoc_get_map_addr(pds_data_upload.pvr_bo);
pds_data64[offset / 2] = indirect_addr.addr;
offset = pipeline->num_workgroups_indirect_src_dma_patching_offset;
pds_data[offset] |=
3 << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_SHIFT;
}
unsigned offset = pipeline->num_workgroups_data_patching_offset;
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
pds_data[offset + u] = global_workgroup_size[u];
}
}
}
struct pvr_compute_kernel_info info = {
.indirect_buffer_addr = indirect_addr,
.usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
.pds_temp_size =
DIV_ROUND_UP(program_info->temps_required << 2U,
ROGUE_CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE),
.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
.pds_data_offset = pds_data_offset,
.pds_code_offset = pipeline->pds_cs_program.code_offset,
.sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
.usc_unified_size =
DIV_ROUND_UP(cs_data->common.vtxins << 2U,
ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE),
/* clang-format off */
.global_size = {
global_workgroup_size[0],
global_workgroup_size[1],
global_workgroup_size[2]
},
/* clang-format on */
};
uint32_t work_size = cs_data->cs.workgroup_size[0] *
cs_data->cs.workgroup_size[1] *
cs_data->cs.workgroup_size[2];
uint32_t coeff_regs = cs_data->common.coeffs + cs_data->common.shareds;
info.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
/* Use a whole slot per workgroup. */
work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
if (cs_data->common.shareds > 0)
info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
work_size =
pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
info.local_size[0] = work_size;
info.local_size[1] = 1U;
info.local_size[2] = 1U;
info.max_instances =
pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer,
enum pvr_stage_allocation stage)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_push_constants *push_consts = &state->push_consts[stage];
struct pvr_suballoc_bo *suballoc_bo;
VkResult result;
if (!push_consts->dirty)
return VK_SUCCESS;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
push_consts->data,
push_consts->bytes_updated,
&suballoc_bo);
if (result != VK_SUCCESS)
return result;
push_consts->dev_addr = suballoc_bo->dev_addr;
push_consts->dirty = false;
return VK_SUCCESS;
}
static void pvr_cmd_dispatch(
struct pvr_cmd_buffer *const cmd_buffer,
const pvr_dev_addr_t indirect_addr,
const uint32_t base_group[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
const struct pvr_compute_pipeline *compute_pipeline =
state->compute_pipeline;
const pco_data *const cs_data = &compute_pipeline->cs_data;
struct pvr_sub_cmd_compute *sub_cmd;
VkResult result;
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
sub_cmd = &state->current_sub_cmd->compute;
sub_cmd->uses_atomic_ops |= cs_data->common.uses.atomics;
sub_cmd->uses_barrier |= cs_data->common.uses.barriers;
if (state->push_consts[PVR_STAGE_ALLOCATION_COMPUTE].dirty) {
result =
pvr_cmd_upload_push_consts(cmd_buffer, PVR_STAGE_ALLOCATION_COMPUTE);
if (result != VK_SUCCESS)
return;
/* Regenerate the PDS program to use the new push consts buffer. */
state->dirty.compute_desc_dirty = true;
}
if (state->dirty.compute_desc_dirty ||
state->dirty.compute_pipeline_binding) {
result = pvr_setup_descriptor_mappings(
cmd_buffer,
PVR_STAGE_ALLOCATION_COMPUTE,
&compute_pipeline->descriptor_state,
NULL,
&state->pds_compute_descriptor_data_offset);
if (result != VK_SUCCESS)
return;
}
pvr_compute_update_shared(cmd_buffer, sub_cmd);
pvr_compute_update_kernel(cmd_buffer,
sub_cmd,
indirect_addr,
base_group,
workgroup_size);
}
void PVR_PER_ARCH(CmdDispatchBase)(VkCommandBuffer commandBuffer,
uint32_t baseGroupX,
uint32_t baseGroupY,
uint32_t baseGroupZ,
uint32_t groupCountX,
uint32_t groupCountY,
uint32_t groupCountZ)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
if (!groupCountX || !groupCountY || !groupCountZ)
return;
pvr_cmd_dispatch(cmd_buffer,
PVR_DEV_ADDR_INVALID,
(uint32_t[]){ baseGroupX, baseGroupY, baseGroupZ },
(uint32_t[]){ groupCountX, groupCountY, groupCountZ });
}
void PVR_PER_ARCH(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(pvr_buffer, buffer, _buffer);
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
pvr_cmd_dispatch(cmd_buffer,
PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
(uint32_t[]){ 0, 0, 0 },
(uint32_t[]){ 1, 1, 1 });
}
static void
pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
const struct pvr_cmd_buffer_draw_state *const draw_state)
{
/* We don't have a state to tell us that base_instance is being used so it
* gets used as a boolean - 0 means we'll use a pds program that skips the
* base instance addition. If the base_instance gets used (and the last
* draw's base_instance was 0) then we switch to the BASE_INSTANCE attrib
* program.
*
* If base_instance changes then we only need to update the data section.
*
* The only draw call state that doesn't really matter is the start vertex
* as that is handled properly in the VDM state in all cases.
*/
if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
(state->draw_state.draw_indirect != draw_state->draw_indirect) ||
(state->draw_state.base_instance == 0 &&
draw_state->base_instance != 0)) {
state->dirty.draw_variant = true;
} else if (state->draw_state.base_instance != draw_state->base_instance) {
state->dirty.draw_base_instance = true;
}
state->draw_state = *draw_state;
}
static uint32_t pvr_calc_shared_regs_count(
const struct pvr_graphics_pipeline *const gfx_pipeline)
{
uint32_t shared_regs = gfx_pipeline->vs_data.common.shareds;
if (gfx_pipeline->shader_state.fragment.shader_bo) {
uint32_t fragment_regs = gfx_pipeline->fs_data.common.shareds;
shared_regs = MAX2(shared_regs, fragment_regs);
}
return shared_regs;
}
static void
pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd,
const uint32_t pds_vertex_descriptor_data_offset)
{
const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const struct pvr_stage_allocation_descriptor_state
*const vertex_descriptor_state =
&state->gfx_pipeline->shader_state.vertex.descriptor_state;
const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
struct pvr_csb *const csb = &sub_cmd->control_stream;
if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
return;
pvr_csb_set_relocation_mark(csb);
pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ALL;
state0.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(vs_data->common.shareds),
ROGUE_VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE);
state0.pds_data_size = DIV_ROUND_UP(
PVR_DW_TO_BYTES(vertex_descriptor_state->pds_info.data_size_in_dwords),
ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
}
pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
state1.sd_type = ROGUE_VDMCTRL_SD_TYPE_NONE;
}
pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
state2.pds_code_addr =
PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
}
pvr_csb_clear_relocation_mark(csb);
}
static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
{
const struct pvr_graphics_pipeline *const gfx_pipeline =
cmd_buffer->state.gfx_pipeline;
struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
const pco_data *const vs_data = &gfx_pipeline->vs_data;
const pco_data *const fs_data = &gfx_pipeline->fs_data;
uint32_t output_selects;
uint32_t varying[2];
const pco_range *varyings = vs_data->vs.varyings;
const bool has_point_size = varyings[VARYING_SLOT_PSIZ].count > 0;
const bool has_viewport = varyings[VARYING_SLOT_VIEWPORT].count > 0;
const bool has_layer = varyings[VARYING_SLOT_LAYER].count > 0;
const unsigned clip_count = vs_data->vs.clip_count;
const unsigned cull_count = vs_data->vs.cull_count;
const unsigned clip_cull = clip_count + cull_count;
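   /* The first clip_count planes are clip planes and the remainder up to
    * clip_cull are cull-only, so a plane is marked as a cull plane when its
    * index is >= clip_count.
    */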
pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
state.rhw_pres = fs_data->fs.uses.w;
state.tsp_unclamped_z_pres = fs_data->fs.uses.z;
state.vtxsize = vs_data->vs.vtxouts;
state.psprite_size_pres = has_point_size;
state.vpt_tgt_pres = has_viewport;
state.render_tgt_pres = has_layer;
state.plane0 = clip_cull > 0;
state.plane1 = clip_cull > 1;
state.plane2 = clip_cull > 2;
state.plane3 = clip_cull > 3;
state.plane4 = clip_cull > 4;
state.plane5 = clip_cull > 5;
state.plane6 = clip_cull > 6;
state.plane7 = clip_cull > 7;
state.cullplane0 = (clip_cull > 0) && (clip_count < 1);
state.cullplane1 = (clip_cull > 1) && (clip_count < 2);
state.cullplane2 = (clip_cull > 2) && (clip_count < 3);
state.cullplane3 = (clip_cull > 3) && (clip_count < 4);
state.cullplane4 = (clip_cull > 4) && (clip_count < 5);
state.cullplane5 = (clip_cull > 5) && (clip_count < 6);
state.cullplane6 = (clip_cull > 6) && (clip_count < 7);
state.cullplane7 = (clip_cull > 7) && (clip_count < 8);
}
if (ppp_state->output_selects != output_selects) {
ppp_state->output_selects = output_selects;
header->pres_outselects = true;
}
pvr_csb_pack (&varying[0], TA_STATE_VARYING0, varying0) {
varying0.f32_linear = vs_data->vs.f32_smooth;
varying0.f32_flat = vs_data->vs.f32_flat;
varying0.f32_npc = vs_data->vs.f32_npc;
}
if (ppp_state->varying_word[0] != varying[0]) {
ppp_state->varying_word[0] = varying[0];
header->pres_varying_word0 = true;
}
pvr_csb_pack (&varying[1], TA_STATE_VARYING1, varying1) {
varying1.f16_linear = vs_data->vs.f16_smooth;
varying1.f16_flat = vs_data->vs.f16_flat;
varying1.f16_npc = vs_data->vs.f16_npc;
}
if (ppp_state->varying_word[1] != varying[1]) {
ppp_state->varying_word[1] = varying[1];
header->pres_varying_word1 = true;
}
}
static void
pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
struct ROGUE_TA_STATE_ISPA *const ispa_out)
{
struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
const struct pvr_fragment_shader_state *const fragment_shader_state =
&cmd_buffer->state.gfx_pipeline->shader_state.fragment;
const struct pvr_render_pass_info *const pass_info =
&cmd_buffer->state.render_pass_info;
struct vk_dynamic_graphics_state *dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
const bool rasterizer_discard = dynamic_state->rs.rasterizer_discard_enable;
const uint32_t subpass_idx = pass_info->subpass_idx;
const uint32_t depth_stencil_attachment_idx =
pass_info->pass
? pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment
: pass_info->dr_info->hw_render.ds_attach_idx;
struct pvr_render_pass_attachment *attachment;
if (pass_info->pass) {
attachment =
depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED
? &pass_info->pass->attachments[depth_stencil_attachment_idx]
: NULL;
} else {
attachment =
depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED
? &pass_info->dr_info->attachments[depth_stencil_attachment_idx]
: NULL;
}
const enum ROGUE_TA_OBJTYPE obj_type =
pvr_ta_objtype(dynamic_state->ia.primitive_topology);
const VkImageAspectFlags ds_aspects =
(!rasterizer_discard && attachment)
? vk_format_aspects(attachment->vk_format) &
(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)
: VK_IMAGE_ASPECT_NONE;
/* This is deliberately a full copy rather than a pointer because
* vk_optimize_depth_stencil_state() can only be run once against any given
* instance of vk_depth_stencil_state.
*/
struct vk_depth_stencil_state ds_state = dynamic_state->ds;
uint32_t ispb_stencil_off;
bool is_two_sided = false;
uint32_t isp_control;
uint32_t line_width;
uint32_t common_a;
uint32_t front_a;
uint32_t front_b;
uint32_t back_a;
uint32_t back_b;
vk_optimize_depth_stencil_state(&ds_state, ds_aspects, true);
/* Convert to 4.4 fixed point format. */
line_width = util_unsigned_fixed(dynamic_state->rs.line.width, 4);
/* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
* If 0 it stays at 0, otherwise we subtract 1.
*/
line_width = (!!line_width) * (line_width - 1);
line_width = MIN2(line_width, ROGUE_TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX);
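   /* E.g. a line width of 1.0 becomes 16 in 4.4 fixed point and 15 after the
    * subtraction, which encodes 16/16 = 1.0.
    */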
/* TODO: Part of the logic in this function is duplicated in another part
* of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
*/
pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
ispa.pointlinewidth = line_width;
ispa.dcmpmode = pvr_ta_cmpmode(ds_state.depth.compare_op);
ispa.dwritedisable = !ds_state.depth.write_enable;
ispa.passtype = fragment_shader_state->pass_type;
if (dynamic_state->cb.logic_op_enable &&
fragment_shader_state->pass_type == ROGUE_TA_PASSTYPE_OPAQUE)
ispa.passtype = ROGUE_TA_PASSTYPE_TRANSLUCENT;
ispa.objtype = obj_type;
/* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
* objtype are needed by pvr_setup_triangle_merging_flag.
*/
if (ispa_out)
*ispa_out = ispa;
}
/* TODO: Does this actually represent the ispb control word on stencil off?
* If not, rename the variable.
*/
pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
ispb.sop3 = ROGUE_TA_ISPB_STENCILOP_KEEP;
ispb.sop2 = ROGUE_TA_ISPB_STENCILOP_KEEP;
ispb.sop1 = ROGUE_TA_ISPB_STENCILOP_KEEP;
ispb.scmpmode = ROGUE_TA_CMPMODE_ALWAYS;
}
/* FIXME: This logic should be redone and improved. Can we also get rid of
* the front and back variants?
*/
front_a = common_a;
back_a = common_a;
if (ds_state.stencil.test_enable) {
uint32_t front_a_sref;
uint32_t back_a_sref;
pvr_csb_pack (&front_a_sref, TA_STATE_ISPA, ispa) {
ispa.sref = ds_state.stencil.front.reference;
}
front_a |= front_a_sref;
pvr_csb_pack (&back_a_sref, TA_STATE_ISPA, ispa) {
ispa.sref = ds_state.stencil.back.reference;
}
back_a |= back_a_sref;
pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
const struct vk_stencil_test_face_state *const front =
&ds_state.stencil.front;
if (ds_state.stencil.write_enable)
ispb.swmask = front->write_mask;
ispb.scmpmask = front->compare_mask;
ispb.sop3 = pvr_ta_stencilop(front->op.pass);
ispb.sop2 = pvr_ta_stencilop(front->op.depth_fail);
ispb.sop1 = pvr_ta_stencilop(front->op.fail);
ispb.scmpmode = pvr_ta_cmpmode(front->op.compare);
}
pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
const struct vk_stencil_test_face_state *const back =
&ds_state.stencil.back;
if (ds_state.stencil.write_enable)
ispb.swmask = back->write_mask;
ispb.scmpmask = back->compare_mask;
ispb.sop3 = pvr_ta_stencilop(back->op.pass);
ispb.sop2 = pvr_ta_stencilop(back->op.depth_fail);
ispb.sop1 = pvr_ta_stencilop(back->op.fail);
ispb.scmpmode = pvr_ta_cmpmode(back->op.compare);
}
} else {
front_b = ispb_stencil_off;
back_b = ispb_stencil_off;
}
if (front_a != back_a || front_b != back_b) {
if (dynamic_state->rs.cull_mode & VK_CULL_MODE_BACK_BIT) {
/* Single face, using front state. */
} else if (dynamic_state->rs.cull_mode & VK_CULL_MODE_FRONT_BIT) {
/* Single face, using back state. */
front_a = back_a;
front_b = back_b;
} else {
/* Both faces. */
header->pres_ispctl_ba = is_two_sided = true;
if (dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) {
uint32_t tmp = front_a;
front_a = back_a;
back_a = tmp;
tmp = front_b;
front_b = back_b;
back_b = tmp;
}
/* HW defaults to stencil off. */
if (back_b != ispb_stencil_off) {
header->pres_ispctl_fb = true;
header->pres_ispctl_bb = true;
}
}
}
if (ds_state.stencil.test_enable && front_b != ispb_stencil_off)
header->pres_ispctl_fb = true;
pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
ispctl.upass = pass_info->isp_userpass;
/* TODO: is shader_bo ever NULL? Figure out what to do. */
ispctl.tagwritedisable = rasterizer_discard ||
!fragment_shader_state->shader_bo;
ispctl.two_sided = is_two_sided;
ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb;
ispctl.dbenable = !rasterizer_discard &&
dynamic_state->rs.depth_bias.enable &&
obj_type == ROGUE_TA_OBJTYPE_TRIANGLE;
if (!rasterizer_discard && cmd_buffer->state.vis_test_enabled) {
ispctl.vistest = true;
ispctl.visreg = cmd_buffer->state.vis_reg;
}
ispctl.scenable = !rasterizer_discard;
ppp_state->isp.control_struct = ispctl;
}
header->pres_ispctl = true;
ppp_state->isp.control = isp_control;
ppp_state->isp.front_a = front_a;
ppp_state->isp.front_b = front_b;
ppp_state->isp.back_a = back_a;
ppp_state->isp.back_b = back_b;
}
static float
pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info *dev_info,
VkFormat format,
float depth_bias)
{
/* Information for future modifiers of these depth bias calculations.
* ==================================================================
* Specified depth bias equations scale the specified constant factor by a
* value 'r' that is guaranteed to cause a resolvable difference in depth
* across the entire range of depth values.
* For floating point depth formats 'r' is calculated by taking the maximum
* exponent across the triangle.
* For UNORM formats 'r' is constant.
* Here 'n' is the number of mantissa bits stored in the floating point
* representation (23 for F32).
*
* UNORM Format -> z += dbcf * r + slope
* FLOAT Format -> z += dbcf * 2^(e-n) + slope
*
* HW Variations.
* ==============
* The HW either always performs the F32 depth bias equation (exponent based
* r), or in the case of HW that correctly supports the integer depth bias
* equation for UNORM depth formats, we can select between both equations
* using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to
* correctly perform Vulkan UNORM depth bias (constant r).
*
* if ern42307:
* if DBIAS_IS_INT_EN:
* z += dbcf + slope
* else:
* z += dbcf * 2^(e-n) + slope
* else:
* z += dbcf * 2^(e-n) + slope
*
*/
float nudge_factor;
if (PVR_HAS_ERN(dev_info, 42307)) {
switch (format) {
case VK_FORMAT_D16_UNORM:
return depth_bias / (1 << 15);
case VK_FORMAT_D24_UNORM_S8_UINT:
case VK_FORMAT_X8_D24_UNORM_PACK32:
return depth_bias / (1 << 23);
default:
return depth_bias;
}
}
/* The reasoning behind clamping/nudging the value here is because UNORM
* depth formats can have higher precision over our underlying D32F
* representation for some depth ranges.
*
* When the HW scales the depth bias value by 2^(e-n) [The 'r' term'] a depth
* bias of 1 can result in a value smaller than one F32 ULP, which will get
* quantized to 0 - resulting in no bias.
*
* Biasing small values away from zero will ensure that small depth biases of
* 1 still yield a result and overcome Z-fighting.
*/
switch (format) {
case VK_FORMAT_D16_UNORM:
depth_bias *= 512.0f;
nudge_factor = 1.0f;
break;
case VK_FORMAT_D24_UNORM_S8_UINT:
case VK_FORMAT_X8_D24_UNORM_PACK32:
depth_bias *= 2.0f;
nudge_factor = 2.0f;
break;
default:
nudge_factor = 0.0f;
break;
}
if (nudge_factor != 0.0f) {
if (depth_bias < 0.0f && depth_bias > -nudge_factor)
depth_bias -= nudge_factor;
else if (depth_bias > 0.0f && depth_bias < nudge_factor)
depth_bias += nudge_factor;
}
return depth_bias;
}
static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
const VkRect2D *const scissor,
VkRect2D *const rect_out)
{
/* TODO: See if we can remove this struct. */
struct pvr_rect {
int32_t x0, y0;
int32_t x1, y1;
};
/* TODO: Worry about overflow? */
const struct pvr_rect scissor_rect = {
.x0 = scissor->offset.x,
.y0 = scissor->offset.y,
.x1 = scissor->offset.x + scissor->extent.width,
.y1 = scissor->offset.y + scissor->extent.height
};
struct pvr_rect viewport_rect = { 0 };
assert(viewport->width >= 0.0f);
assert(scissor_rect.x0 >= 0);
assert(scissor_rect.y0 >= 0);
if (scissor->extent.width == 0 || scissor->extent.height == 0) {
*rect_out = (VkRect2D){ 0 };
return;
}
viewport_rect.x0 = (int32_t)viewport->x;
viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
/* TODO: Is there a mathematical way of doing all this and then clamp at
* the end?
*/
/* We flip the y0 and y1 when height is negative. */
viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
if (scissor_rect.x1 <= viewport_rect.x0 ||
scissor_rect.y1 <= viewport_rect.y0 ||
scissor_rect.x0 >= viewport_rect.x1 ||
scissor_rect.y0 >= viewport_rect.y1) {
*rect_out = (VkRect2D){ 0 };
return;
}
/* Determine the overlapping rectangle. */
viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
/* TODO: Is this conversion safe? Is this logic right? */
rect_out->offset.x = (uint32_t)viewport_rect.x0;
rect_out->offset.y = (uint32_t)viewport_rect.y0;
rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
}
static inline uint32_t
pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
{
/* TODO: This should come from rogue_ppp.xml. */
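   /* I.e. 16 pixels on cores with 16x16 tiles, 32 pixels otherwise. */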
return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
}
static void
pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
{
struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const struct ROGUE_TA_STATE_ISPCTL *const ispctl =
&ppp_state->isp.control_struct;
struct pvr_device_info *const dev_info =
&cmd_buffer->device->pdevice->dev_info;
if (ispctl->dbenable &&
(BITSET_TEST(dynamic_state->dirty,
MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
cmd_buffer->depth_bias_array.size == 0)) {
struct pvr_depth_bias_state depth_bias = {
.constant_factor = pvr_calculate_final_depth_bias_contant_factor(
dev_info,
cmd_buffer->state.depth_format,
dynamic_state->rs.depth_bias.constant_factor),
.slope_factor = dynamic_state->rs.depth_bias.slope_factor,
.clamp = dynamic_state->rs.depth_bias.clamp,
};
ppp_state->depthbias_scissor_indices.depthbias_index =
util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
__typeof__(depth_bias));
util_dynarray_append(&cmd_buffer->depth_bias_array, depth_bias);
header->pres_ispctl_dbsc = true;
}
if (ispctl->scenable) {
const uint32_t region_clip_align_size =
pvr_get_geom_region_clip_align_size(dev_info);
const VkViewport *const viewport = &dynamic_state->vp.viewports[0];
const VkRect2D *const scissor = &dynamic_state->vp.scissors[0];
struct pvr_scissor_words scissor_words;
VkRect2D overlap_rect;
uint32_t height;
uint32_t width;
uint32_t x;
uint32_t y;
/* For region clip. */
uint32_t bottom;
uint32_t right;
uint32_t left;
uint32_t top;
/* We don't support multiple viewport calculations. */
assert(dynamic_state->vp.viewport_count == 1);
/* We don't support multiple scissor calculations. */
assert(dynamic_state->vp.scissor_count == 1);
pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
x = overlap_rect.offset.x;
y = overlap_rect.offset.y;
width = overlap_rect.extent.width;
height = overlap_rect.extent.height;
pvr_csb_pack (&scissor_words.w0, IPF_SCISSOR_WORD_0, word0) {
word0.scw0_xmax = x + width;
word0.scw0_xmin = x;
}
pvr_csb_pack (&scissor_words.w1, IPF_SCISSOR_WORD_1, word1) {
word1.scw1_ymax = y + height;
word1.scw1_ymin = y;
}
if (cmd_buffer->scissor_array.size &&
cmd_buffer->scissor_words.w0 == scissor_words.w0 &&
cmd_buffer->scissor_words.w1 == scissor_words.w1) {
return;
}
cmd_buffer->scissor_words = scissor_words;
/* Calculate region clip. */
left = x / region_clip_align_size;
top = y / region_clip_align_size;
      /* Prevent right (and bottom below) from underflowing to -1 when
       * x + width (or y + height) is 0.
       */
/* TODO: Is there a better way of doing this? */
if ((x + width) != 0U)
right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
else
right = 0;
if ((y + height) != 0U)
bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
else
bottom = 0U;
/* Setup region clip to clip everything outside what was calculated. */
/* FIXME: Should we mask to prevent writing over other words? */
pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
word0.right = right;
word0.left = left;
word0.mode = ROGUE_TA_REGION_CLIP_MODE_OUTSIDE;
}
pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
word1.bottom = bottom;
word1.top = top;
}
ppp_state->depthbias_scissor_indices.scissor_index =
util_dynarray_num_elements(&cmd_buffer->scissor_array,
struct pvr_scissor_words);
util_dynarray_append(&cmd_buffer->scissor_array,
cmd_buffer->scissor_words);
header->pres_ispctl_dbsc = true;
header->pres_region_clip = true;
}
}
static void
pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
struct ROGUE_TA_STATE_ISPA *ispa)
{
struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
uint32_t merge_word;
uint32_t mask;
pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
      /* Disable triangle merging for lines, for punch-through passes, or when
       * depth writes are disabled and the depth compare mode is ALWAYS.
       */
if (ispa->objtype == ROGUE_TA_OBJTYPE_LINE ||
ispa->passtype == ROGUE_TA_PASSTYPE_PUNCH_THROUGH ||
(ispa->dwritedisable && ispa->dcmpmode == ROGUE_TA_CMPMODE_ALWAYS)) {
size_info.pds_tri_merge_disable = true;
}
}
pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
size_info.pds_tri_merge_disable = true;
}
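   /* Preserve all other SIZEINFO2 fields; only the triangle merge disable bit
    * is updated here.
    */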
merge_word |= ppp_state->pds.size_info2 & ~mask;
if (merge_word != ppp_state->pds.size_info2) {
ppp_state->pds.size_info2 = merge_word;
header->pres_pds_state_ptr0 = true;
}
}
static VkResult
setup_pds_fragment_program(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_pds_upload *pds_fragment_program)
{
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const struct pvr_fragment_shader_state *const fragment_shader_state =
&state->gfx_pipeline->shader_state.fragment;
const struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const struct pvr_pds_kickusc_program *program =
&fragment_shader_state->pds_fragment_program;
uint32_t *pds_fragment_program_buffer =
fragment_shader_state->pds_fragment_program_buffer;
memset(pds_fragment_program, 0, sizeof(*pds_fragment_program));
if (!pds_fragment_program_buffer)
return VK_SUCCESS;
struct ROGUE_PDSINST_DOUTU_SRC0 doutu_src;
ROGUE_PDSINST_DOUTU_SRC0_unpack(
&pds_fragment_program_buffer[program->doutu_offset],
&doutu_src);
/* TODO: VkPipelineMultisampleStateCreateInfo.sampleShadingEnable? */
doutu_src.sample_rate = dynamic_state->ms.rasterization_samples >
VK_SAMPLE_COUNT_1_BIT
? ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL
: ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE;
ROGUE_PDSINST_DOUTU_SRC0_pack(
&pds_fragment_program_buffer[program->doutu_offset],
&doutu_src);
/* FIXME: Figure out the define for alignment of 16. */
return pvr_cmd_buffer_upload_pds(
cmd_buffer,
&pds_fragment_program_buffer[0],
program->data_size,
16,
&pds_fragment_program_buffer[program->data_size],
program->code_size,
16,
16,
pds_fragment_program);
}
static VkResult
setup_pds_coeff_program(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_pds_upload *pds_coeff_program)
{
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const struct pvr_fragment_shader_state *const fragment_shader_state =
&state->gfx_pipeline->shader_state.fragment;
const struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const VkProvokingVertexModeEXT provoking_vertex =
dynamic_state->rs.provoking_vertex;
const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
const struct pvr_pds_coeff_loading_program *program =
&fragment_shader_state->pds_coeff_program;
uint32_t *pds_coeff_program_buffer =
fragment_shader_state->pds_coeff_program_buffer;
unsigned i;
memset(pds_coeff_program, 0, sizeof(*pds_coeff_program));
if (!pds_coeff_program_buffer)
return VK_SUCCESS;
BITSET_FOREACH_SET (i, program->flat_iter_mask, PVR_MAXIMUM_ITERATIONS) {
uint32_t off = program->dout_src_offsets[i];
assert(off != ~0u);
struct ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC douti_src;
ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC_unpack(&pds_coeff_program_buffer[off],
&douti_src);
if (provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_FLAT_VERTEX2;
else if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_FLAT_VERTEX1;
else
douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_FLAT_VERTEX0;
ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC_pack(&pds_coeff_program_buffer[off],
&douti_src);
}
if (program->dout_z_iterator_offset != ~0u) {
struct ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC douti_src;
ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC_unpack(
&pds_coeff_program_buffer[program->dout_z_iterator_offset],
&douti_src);
douti_src.depthbias = dynamic_state->rs.depth_bias.enable;
ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC_pack(
&pds_coeff_program_buffer[program->dout_z_iterator_offset],
&douti_src);
}
/* FIXME: Figure out the define for alignment of 16. */
return pvr_cmd_buffer_upload_pds(
cmd_buffer,
&pds_coeff_program_buffer[0],
program->data_size,
16,
&pds_coeff_program_buffer[program->data_size],
program->code_size,
16,
16,
pds_coeff_program);
}
static VkResult
pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd)
{
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const pco_data *const fs_data = &state->gfx_pipeline->fs_data;
const struct pvr_fragment_shader_state *const fragment_shader_state =
&state->gfx_pipeline->shader_state.fragment;
const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
&fragment_shader_state->descriptor_state;
const struct pvr_pipeline_stage_state *fragment_state =
&fragment_shader_state->stage_state;
struct pvr_pds_upload pds_fragment_program;
struct pvr_pds_upload pds_coeff_program;
VkResult result;
result = setup_pds_fragment_program(cmd_buffer, &pds_fragment_program);
if (result != VK_SUCCESS)
return result;
result = setup_pds_coeff_program(cmd_buffer, &pds_coeff_program);
if (result != VK_SUCCESS)
return result;
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
struct pvr_ppp_state *const ppp_state = &state->ppp_state;
const uint32_t pds_uniform_size =
DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE);
const uint32_t pds_varying_state_size =
DIV_ROUND_UP(pds_coeff_program.data_size,
ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE);
const uint32_t usc_varying_size =
DIV_ROUND_UP(fs_data->common.coeffs,
ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE);
const uint32_t pds_temp_size =
DIV_ROUND_UP(fragment_state->pds_temps_count,
ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE);
const uint32_t usc_shared_size =
DIV_ROUND_UP(fs_data->common.shareds,
ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE);
const uint32_t max_tiles_in_flight =
pvr_calc_fscommon_size_and_tiles_in_flight(
&pdevice->dev_info,
&pdevice->dev_runtime_info,
usc_shared_size *
ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE,
1);
uint32_t size_info_mask;
uint32_t size_info2;
if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
TA_STATE_PDS_SHADERBASE,
shader_base) {
shader_base.addr = PVR_DEV_ADDR(pds_fragment_program.data_offset);
}
if (descriptor_shader_state->pds_code.pvr_bo) {
pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
TA_STATE_PDS_TEXUNICODEBASE,
tex_base) {
tex_base.addr =
PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
}
} else {
ppp_state->pds.texture_uniform_code_base = 0U;
}
pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
info1.pds_uniformsize = pds_uniform_size;
info1.pds_texturestatesize = 0U;
info1.pds_varyingsize = pds_varying_state_size;
info1.usc_varyingsize = usc_varying_size;
info1.pds_tempsize = pds_temp_size;
}
pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
mask.pds_tri_merge_disable = true;
}
ppp_state->pds.size_info2 &= size_info_mask;
pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
info2.usc_sharedsize = usc_shared_size;
}
ppp_state->pds.size_info2 |= size_info2;
if (pds_coeff_program.pvr_bo) {
header->pres_pds_state_ptr1 = true;
pvr_csb_pack (&ppp_state->pds.varying_base,
TA_STATE_PDS_VARYINGBASE,
base) {
base.addr = PVR_DEV_ADDR(pds_coeff_program.data_offset);
}
} else {
ppp_state->pds.varying_base = 0U;
}
pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
TA_STATE_PDS_UNIFORMDATABASE,
base) {
base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
}
header->pres_pds_state_ptr0 = true;
header->pres_pds_state_ptr3 = true;
return result;
}
static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
{
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
struct pvr_ppp_state *const ppp_state = &state->ppp_state;
if (ppp_state->viewport_count != dynamic_state->vp.viewport_count) {
ppp_state->viewport_count = dynamic_state->vp.viewport_count;
header->pres_viewport = true;
}
if (dynamic_state->rs.rasterizer_discard_enable) {
/* We don't want to emit any viewport data as it'll just get thrown
* away. It's after the previous condition because we still want to
* stash the viewport_count as it's our trigger for when
* rasterizer discard gets disabled.
*/
header->pres_viewport = false;
return;
}
for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
VkViewport *viewport = &dynamic_state->vp.viewports[i];
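      /* Convert the Vulkan viewport into the hardware's scale (m) and offset
       * (a) form: half the extent and the center for x/y, and the depth range
       * and minDepth for z.
       */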
uint32_t x_scale = fui(viewport->width * 0.5f);
uint32_t y_scale = fui(viewport->height * 0.5f);
uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
uint32_t z_center = fui(viewport->minDepth);
if (ppp_state->viewports[i].a0 != x_center ||
ppp_state->viewports[i].m0 != x_scale ||
ppp_state->viewports[i].a1 != y_center ||
ppp_state->viewports[i].m1 != y_scale ||
ppp_state->viewports[i].a2 != z_center ||
ppp_state->viewports[i].m2 != z_scale) {
ppp_state->viewports[i].a0 = x_center;
ppp_state->viewports[i].m0 = x_scale;
ppp_state->viewports[i].a1 = y_center;
ppp_state->viewports[i].m1 = y_scale;
ppp_state->viewports[i].a2 = z_center;
ppp_state->viewports[i].m2 = z_scale;
header->pres_viewport = true;
}
}
}
static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
{
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const VkProvokingVertexModeEXT provoking_vertex =
dynamic_state->rs.provoking_vertex;
const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
struct pvr_ppp_state *const ppp_state = &state->ppp_state;
uint32_t ppp_control;
pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
control.drawclippededges = true;
control.wclampen = true;
if (provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_2;
else if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_1;
else
control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_0;
switch (dynamic_state->rs.depth_clip_enable) {
case VK_MESA_DEPTH_CLIP_ENABLE_FALSE:
control.clip_mode = ROGUE_TA_CLIP_MODE_NO_FRONT_OR_REAR;
break;
case VK_MESA_DEPTH_CLIP_ENABLE_TRUE:
control.clip_mode = ROGUE_TA_CLIP_MODE_FRONT_REAR;
break;
case VK_MESA_DEPTH_CLIP_ENABLE_NOT_CLAMP:
if (dynamic_state->rs.depth_clamp_enable)
control.clip_mode = ROGUE_TA_CLIP_MODE_NO_FRONT_OR_REAR;
else
control.clip_mode = ROGUE_TA_CLIP_MODE_FRONT_REAR;
break;
}
/* +--- FrontIsCCW?
* | +--- Cull Front?
* v v
* 0|0 CULLMODE_CULL_CCW,
* 0|1 CULLMODE_CULL_CW,
* 1|0 CULLMODE_CULL_CW,
* 1|1 CULLMODE_CULL_CCW,
*/
switch (dynamic_state->rs.cull_mode) {
case VK_CULL_MODE_BACK_BIT:
case VK_CULL_MODE_FRONT_BIT:
if ((dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
(dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_BIT)) {
control.cullmode = ROGUE_TA_CULLMODE_CULL_CW;
} else {
control.cullmode = ROGUE_TA_CULLMODE_CULL_CCW;
}
break;
case VK_CULL_MODE_FRONT_AND_BACK:
case VK_CULL_MODE_NONE:
control.cullmode = ROGUE_TA_CULLMODE_NO_CULLING;
break;
default:
UNREACHABLE("Unsupported cull mode!");
}
if (dynamic_state->rs.line.mode ==
VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR) {
switch (topology) {
case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
control.prim_msaa = true;
break;
default:
break;
}
}
}
if (ppp_control != ppp_state->ppp_control) {
ppp_state->ppp_control = ppp_control;
header->pres_ppp_ctrl = true;
}
}
/* Largest valid PPP State update in words = 31
* 1 - Header
* 3 - Stream Out Config words 0, 1 and 2
* 1 - PPP Control word
* 3 - Varying Config words 0, 1 and 2
* 1 - Output Select
* 1 - WClamp
* 6 - Viewport Transform words
* 2 - Region Clip words
* 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
* 4 - PDS State for fragment phase (PDSSTATEPTR0)
* 6 - ISP Control Words
*/
#define PVR_MAX_PPP_STATE_DWORDS 31
static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd)
{
const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer);
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
struct pvr_csb *const control_stream = &sub_cmd->control_stream;
struct pvr_ppp_state *const ppp_state = &state->ppp_state;
uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
const bool emit_dbsc = header->pres_ispctl_dbsc;
uint32_t *buffer_ptr = ppp_state_words;
uint32_t dbsc_patching_offset = 0;
uint32_t ppp_state_words_count;
struct pvr_suballoc_bo *pvr_bo;
VkResult result;
#if !defined(NDEBUG)
struct ROGUE_TA_STATE_HEADER emit_mask = *header;
uint32_t packed_emit_mask;
static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
"EMIT_MASK_IS_CLEAR assumes 1 dword sized header.");
# define EMIT_MASK_GET(field) (emit_mask.field)
# define EMIT_MASK_SET(field, value) (emit_mask.field = (value))
# define EMIT_MASK_IS_CLEAR \
(pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \
packed_emit_mask == 0)
#else
# define EMIT_MASK_GET(field)
# define EMIT_MASK_SET(field, value)
#endif
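   /* In debug builds emit_mask starts as a copy of the header and each field
    * is cleared as it is emitted, so EMIT_MASK_IS_CLEAR at the end verifies
    * that every requested update was actually written.
    */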
header->view_port_count =
(ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1);
header->pres_ispctl_fa = header->pres_ispctl;
/* If deferred_secondary is true then we do a separate state update
* which gets patched in vkCmdExecuteCommands().
*/
header->pres_ispctl_dbsc &= !deferred_secondary;
pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header);
static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
"Following header check assumes 1 dword sized header.");
/* If the header is empty we exit early and prevent a bo alloc of 0 size. */
if (ppp_state_words[0] == 0)
return VK_SUCCESS;
if (header->pres_ispctl) {
pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control);
assert(header->pres_ispctl_fa);
/* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the
* ISPB format.
*/
pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a);
EMIT_MASK_SET(pres_ispctl_fa, false);
if (header->pres_ispctl_fb) {
pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b);
EMIT_MASK_SET(pres_ispctl_fb, false);
}
if (header->pres_ispctl_ba) {
pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a);
EMIT_MASK_SET(pres_ispctl_ba, false);
}
if (header->pres_ispctl_bb) {
pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b);
EMIT_MASK_SET(pres_ispctl_bb, false);
}
EMIT_MASK_SET(pres_ispctl, false);
}
if (header->pres_ispctl_dbsc) {
assert(!deferred_secondary);
dbsc_patching_offset = buffer_ptr - ppp_state_words;
pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) {
ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index;
ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index;
}
buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC);
EMIT_MASK_SET(pres_ispctl_dbsc, false);
}
if (header->pres_pds_state_ptr0) {
pvr_csb_write_value(buffer_ptr,
TA_STATE_PDS_SHADERBASE,
ppp_state->pds.pixel_shader_base);
pvr_csb_write_value(buffer_ptr,
TA_STATE_PDS_TEXUNICODEBASE,
ppp_state->pds.texture_uniform_code_base);
pvr_csb_write_value(buffer_ptr,
TA_STATE_PDS_SIZEINFO1,
ppp_state->pds.size_info1);
pvr_csb_write_value(buffer_ptr,
TA_STATE_PDS_SIZEINFO2,
ppp_state->pds.size_info2);
EMIT_MASK_SET(pres_pds_state_ptr0, false);
}
if (header->pres_pds_state_ptr1) {
pvr_csb_write_value(buffer_ptr,
TA_STATE_PDS_VARYINGBASE,
ppp_state->pds.varying_base);
EMIT_MASK_SET(pres_pds_state_ptr1, false);
}
   /* We don't use the pds_state_ptr2 (texture state programs) control word,
    * but we also don't need to zero it, because the hardware only runs the
    * texture state program when
    * ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is non-zero.
    */
assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1)
.pds_texturestatesize == 0);
if (header->pres_pds_state_ptr3) {
pvr_csb_write_value(buffer_ptr,
TA_STATE_PDS_UNIFORMDATABASE,
ppp_state->pds.uniform_state_data_base);
EMIT_MASK_SET(pres_pds_state_ptr3, false);
}
if (header->pres_region_clip) {
pvr_csb_write_value(buffer_ptr,
TA_REGION_CLIP0,
ppp_state->region_clipping.word0);
pvr_csb_write_value(buffer_ptr,
TA_REGION_CLIP1,
ppp_state->region_clipping.word1);
EMIT_MASK_SET(pres_region_clip, false);
}
if (header->pres_viewport) {
const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
EMIT_MASK_SET(view_port_count, viewports);
for (uint32_t i = 0; i < viewports; i++) {
/* These don't have any definitions in the csbgen xml files and none
* will be added.
*/
*buffer_ptr++ = ppp_state->viewports[i].a0;
*buffer_ptr++ = ppp_state->viewports[i].m0;
*buffer_ptr++ = ppp_state->viewports[i].a1;
*buffer_ptr++ = ppp_state->viewports[i].m1;
*buffer_ptr++ = ppp_state->viewports[i].a2;
*buffer_ptr++ = ppp_state->viewports[i].m2;
EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1);
}
EMIT_MASK_SET(pres_viewport, false);
}
if (header->pres_wclamp) {
pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) {
wclamp.val = fui(1.0e-13f);
}
buffer_ptr += pvr_cmd_length(TA_WCLAMP);
EMIT_MASK_SET(pres_wclamp, false);
}
if (header->pres_outselects) {
pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects);
EMIT_MASK_SET(pres_outselects, false);
}
if (header->pres_varying_word0) {
pvr_csb_write_value(buffer_ptr,
TA_STATE_VARYING0,
ppp_state->varying_word[0]);
EMIT_MASK_SET(pres_varying_word0, false);
}
if (header->pres_varying_word1) {
pvr_csb_write_value(buffer_ptr,
TA_STATE_VARYING1,
ppp_state->varying_word[1]);
EMIT_MASK_SET(pres_varying_word1, false);
}
/* We only emit this on the first draw of a render job to prevent us from
* inheriting a non-zero value set elsewhere.
*/
if (header->pres_varying_word2) {
pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0);
EMIT_MASK_SET(pres_varying_word2, false);
}
if (header->pres_ppp_ctrl) {
pvr_csb_write_value(buffer_ptr,
TA_STATE_PPP_CTRL,
ppp_state->ppp_control);
EMIT_MASK_SET(pres_ppp_ctrl, false);
}
/* We only emit this on the first draw of a render job to prevent us from
* inheriting a non-zero value set elsewhere.
*/
if (header->pres_stream_out_size) {
pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0);
EMIT_MASK_SET(pres_stream_out_size, false);
}
assert(EMIT_MASK_IS_CLEAR);
#undef EMIT_MASK_GET
#undef EMIT_MASK_SET
#if !defined(NDEBUG)
# undef EMIT_MASK_IS_CLEAR
#endif
ppp_state_words_count = buffer_ptr - ppp_state_words;
assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS);
result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
cmd_buffer->device->heaps.general_heap,
PVR_DW_TO_BYTES(ppp_state_words_count),
&pvr_bo);
if (result != VK_SUCCESS)
return result;
memcpy(pvr_bo_suballoc_get_map_addr(pvr_bo),
ppp_state_words,
PVR_DW_TO_BYTES(ppp_state_words_count));
pvr_csb_set_relocation_mark(control_stream);
/* Write the VDM state update into the VDM control stream. */
pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
state0.word_count = ppp_state_words_count;
state0.addrmsb = pvr_bo->dev_addr;
}
pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
state1.addrlsb = pvr_bo->dev_addr;
}
pvr_csb_clear_relocation_mark(control_stream);
if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
struct pvr_deferred_cs_command cmd;
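      /* The depth bias/scissor indices get patched in vkCmdExecuteCommands():
       * either reserve the VDM state words in the control stream (deferred
       * path) or record the PPP state buffer and the offset of the ISPDBSC
       * word to patch.
       */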
if (deferred_secondary) {
const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) +
pvr_cmd_length(VDMCTRL_PPP_STATE1);
uint32_t *vdm_state;
pvr_csb_set_relocation_mark(control_stream);
vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords);
if (!vdm_state) {
result = pvr_csb_get_status(control_stream);
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
pvr_csb_clear_relocation_mark(control_stream);
cmd = (struct pvr_deferred_cs_command){
.type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC,
.dbsc = {
.state = ppp_state->depthbias_scissor_indices,
.vdm_state = vdm_state,
},
};
} else {
cmd = (struct pvr_deferred_cs_command){
.type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2,
.dbsc2 = {
.state = ppp_state->depthbias_scissor_indices,
.ppp_cs_bo = pvr_bo,
.patch_offset = dbsc_patching_offset,
},
};
}
util_dynarray_append(&cmd_buffer->deferred_csb_commands, cmd);
}
state->emit_header = (struct ROGUE_TA_STATE_HEADER){ 0 };
return VK_SUCCESS;
}
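/* Descriptive note: returns true if any of the dynamic state that feeds the
 * ISP faces-and-control setup (depth/stencil ops and masks, depth bias
 * enable, line width, rasterizer discard, sample count/mask) has been marked
 * dirty.
 */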
static inline bool pvr_ppp_dynamic_state_isp_faces_and_control_dirty(
const BITSET_WORD *const dynamic_dirty)
{
return BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
BITSET_TEST(dynamic_dirty,
MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
BITSET_TEST(dynamic_dirty,
MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK);
}
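/* Descriptive note: checks whether any PPP header presence bits, descriptor
 * or push-constant dirty flags, or related dynamic state have changed since
 * the last emit, i.e. whether a fresh PPP state update must be written
 * before the next draw.
 */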
static inline bool
pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer)
{
const BITSET_WORD *const dynamic_dirty =
cmd_buffer->vk.dynamic_graphics_state.dirty;
const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
/* For push constants we only need to worry if they are updated for the
* fragment stage since we're only updating the pds programs used in the
* fragment stage.
*/
return header->pres_ppp_ctrl || header->pres_ispctl ||
header->pres_ispctl_fb || header->pres_ispctl_ba ||
header->pres_ispctl_bb || header->pres_ispctl_dbsc ||
header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 ||
header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 ||
header->pres_region_clip || header->pres_viewport ||
header->pres_wclamp || header->pres_outselects ||
header->pres_varying_word0 || header->pres_varying_word1 ||
header->pres_varying_word2 || header->pres_stream_out_program ||
state->dirty.fragment_descriptors || state->dirty.vis_test ||
state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass ||
state->push_consts[PVR_STAGE_ALLOCATION_FRAGMENT].dirty ||
pvr_ppp_dynamic_state_isp_faces_and_control_dirty(dynamic_dirty) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
}
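/* Descriptive note: re-derives the dirty parts of the PPP state (output
 * selects, ISP faces and control, fragment PDS pointers, depth bias/scissor,
 * viewports and PPP control) and then emits them to the sub-command's
 * control stream via pvr_emit_ppp_state().
 */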
static VkResult
pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd)
{
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const struct pvr_fragment_shader_state *fragment_state =
&state->gfx_pipeline->shader_state.fragment;
VkResult result;
/* TODO: The emit_header will be dirty only if
* pvr_reset_graphics_dirty_state() was called before this (so when command
* buffer begins recording or when it's reset). Otherwise it will have been
* zeroed out by the previous pvr_emit_ppp_state(). We can probably set a
* flag in there and check it here instead of checking the header.
* Check if this is true and implement the flag.
*/
if (!pvr_ppp_state_update_required(cmd_buffer))
return VK_SUCCESS;
if (state->dirty.gfx_pipeline_binding) {
struct ROGUE_TA_STATE_ISPA ispa;
pvr_setup_output_select(cmd_buffer);
pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
} else if (pvr_ppp_dynamic_state_isp_faces_and_control_dirty(
dynamic_state->dirty) ||
state->dirty.isp_userpass || state->dirty.vis_test) {
pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
}
if (!dynamic_state->rs.rasterizer_discard_enable &&
state->dirty.fragment_descriptors &&
state->gfx_pipeline->shader_state.fragment.shader_bo &&
!state->gfx_pipeline->fs_data.common.uses.empty) {
result = pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
if (result != VK_SUCCESS)
return result;
}
pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
pvr_setup_viewport(cmd_buffer);
pvr_setup_ppp_control(cmd_buffer);
/* The hardware doesn't have an explicit mode for this so we use a
* negative viewport to make sure all objects are culled out early.
*/
if (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
/* Shift the viewport out of the guard-band, culling everything. */
const uint32_t negative_vp_val = fui(-2.0f);
state->ppp_state.viewports[0].a0 = negative_vp_val;
state->ppp_state.viewports[0].m0 = 0;
state->ppp_state.viewports[0].a1 = negative_vp_val;
state->ppp_state.viewports[0].m1 = 0;
state->ppp_state.viewports[0].a2 = negative_vp_val;
state->ppp_state.viewports[0].m2 = 0;
state->ppp_state.viewport_count = 1;
state->emit_header.pres_viewport = true;
}
result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
if (result != VK_SUCCESS)
return result;
if (state->gfx_pipeline &&
fragment_state->pass_type == ROGUE_TA_PASSTYPE_DEPTH_FEEDBACK) {
assert(state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
state->current_sub_cmd->gfx.has_depth_feedback = true;
}
return VK_SUCCESS;
}
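/* Descriptive note: works out the VDM CAM size and the maximum number of
 * in-flight vertex shader instances for a given vertex output size. As a
 * worked example, vs_output_size = 4 with rasterization enabled gives a UVS
 * vertex vector of (4 + 1 + 4) * 4 = 36 dwords, which lands in the first
 * bucket of whichever path below applies to the core.
 */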
void PVR_PER_ARCH(calculate_vertex_cam_size)(
const struct pvr_device_info *dev_info,
const uint32_t vs_output_size,
const bool raster_enable,
uint32_t *const cam_size_out,
uint32_t *const vs_max_instances_out)
{
/* First work out the size of a vertex in the UVS and multiply by 4 for
* column ordering.
*/
const uint32_t uvs_vertex_vector_size_in_dwords =
(vs_output_size + 1U + raster_enable * 4U) * 4U;
const uint32_t vdm_cam_size =
PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
/* This is a proxy for 8XE. */
if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
vdm_cam_size < 96U) {
/* Comparisons are based on size including scratch per vertex vector. */
if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
*cam_size_out = MIN2(31U, vdm_cam_size - 1U);
*vs_max_instances_out = 16U;
} else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
*cam_size_out = 15U;
*vs_max_instances_out = 16U;
} else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
*cam_size_out = 11U;
*vs_max_instances_out = 12U;
} else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
*cam_size_out = 7U;
*vs_max_instances_out = 8U;
} else if (PVR_HAS_FEATURE(dev_info,
simple_internal_parameter_format_v2) ||
uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
*cam_size_out = 7U;
*vs_max_instances_out = 4U;
} else {
*cam_size_out = 3U;
*vs_max_instances_out = 2U;
}
} else {
/* Comparisons are based on size including scratch per vertex vector. */
if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
/* output size <= 27 + 5 scratch. */
*cam_size_out = MIN2(95U, vdm_cam_size - 1U);
*vs_max_instances_out = 0U;
} else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
/* output size <= 43 + 5 scratch */
*cam_size_out = 63U;
if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
*vs_max_instances_out = 16U;
else
*vs_max_instances_out = 0U;
} else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
/* output size <= 59 + 5 scratch. */
*cam_size_out = 31U;
if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
*vs_max_instances_out = 16U;
else
*vs_max_instances_out = 0U;
} else {
*cam_size_out = 15U;
*vs_max_instances_out = 16U;
}
}
}
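/* Descriptive note: emits VDMCTRL_VDM_STATE0 (and STATE1-5 as required) for
 * the current draw: CAM sizing, cut index for primitive restart, flat-shade
 * provoking vertex and, when the pipeline or vertex bindings changed, the
 * PDS vertex attribute data/code addresses and sizes.
 */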
static void pvr_emit_dirty_vdm_state(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd)
{
/* FIXME: Assume all state is dirty for the moment. */
struct pvr_device_info *const dev_info =
&cmd_buffer->device->pdevice->dev_info;
ASSERTED const uint32_t max_user_vertex_output_components =
pvr_get_max_user_vertex_output_components(dev_info);
struct ROGUE_VDMCTRL_VDM_STATE0 header = { pvr_cmd_header(
VDMCTRL_VDM_STATE0) };
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const VkProvokingVertexModeEXT provoking_vertex =
dynamic_state->rs.provoking_vertex;
const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
struct pvr_csb *const csb = &sub_cmd->control_stream;
uint32_t max_instances;
uint32_t cam_size;
/* CAM Calculations and HW state take vertex size aligned to DWORDS. */
assert(vs_data->vs.vtxouts <= max_user_vertex_output_components);
pvr_calculate_vertex_cam_size(dev_info,
vs_data->vs.vtxouts,
true,
&cam_size,
&max_instances);
pvr_csb_set_relocation_mark(csb);
pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
state0.cam_size = cam_size;
if (dynamic_state->ia.primitive_restart_enable) {
state0.cut_index_enable = true;
state0.cut_index_present = true;
}
if (provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_2;
else if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_1;
else
state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_0;
/* If we've bound a different vertex buffer, or this draw-call requires
* a different PDS attrib data-section from the last draw call (changed
* base_instance) then we need to specify a new data section. This is
* also the case if we've switched pipeline or attrib program as the
* data-section layout will be different.
*/
state0.vs_data_addr_present =
state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
state->dirty.draw_base_instance || state->dirty.draw_variant;
/* Need to specify new PDS Attrib program if we've bound a different
* pipeline or we needed a different PDS Attrib variant for this
* draw-call.
*/
state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
state->dirty.draw_variant;
/* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
* stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
* Vulkan doesn't support stream output and the vertex position is
* always emitted to the UVB.
*/
state0.uvs_scratch_size_select =
ROGUE_VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE;
header = state0;
}
if (header.cut_index_present) {
pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
state1.cut_index =
vk_index_to_restart(state->index_buffer_binding.type);
}
}
if (header.vs_data_addr_present) {
pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
state2.vs_pds_data_base_addr =
PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
}
}
if (header.vs_other_present) {
const uint32_t usc_unified_store_size_in_bytes = vs_data->common.vtxins
<< 2;
pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
state3.vs_pds_code_base_addr =
PVR_DEV_ADDR(state->pds_shader.code_offset);
}
pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
state4.vs_output_size = vs_data->vs.vtxouts;
}
pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
state5.vs_max_instances = max_instances;
state5.vs_usc_common_size = 0U;
state5.vs_usc_unified_size = DIV_ROUND_UP(
usc_unified_store_size_in_bytes,
ROGUE_VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE);
state5.vs_pds_temp_size =
DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE);
state5.vs_pds_data_size = DIV_ROUND_UP(
PVR_DW_TO_BYTES(state->pds_shader.info->data_size_in_dwords),
ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE);
}
}
pvr_csb_clear_relocation_mark(csb);
}
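/* Descriptive note: called before each draw to flush all dirty state. It
 * starts or continues the graphics sub-command, tracks depth/stencil usage,
 * regenerates the PDS vertex attribute and descriptor programs as needed,
 * uploads push constants and finally emits the dirty PDS, PPP and VDM state.
 */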
static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
{
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
const pco_data *const fs_data = &gfx_pipeline->fs_data;
struct pvr_sub_cmd_gfx *sub_cmd;
bool fstencil_writemask_zero;
bool bstencil_writemask_zero;
bool fstencil_keep;
bool bstencil_keep;
VkResult result;
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
if (result != VK_SUCCESS)
return result;
sub_cmd = &state->current_sub_cmd->gfx;
sub_cmd->empty_cmd = false;
/* Determine pipeline depth/stencil usage. If a pipeline uses depth or
* stencil testing, those attachments are using their loaded values, and
* the loadOps cannot be optimized out.
*/
/* Pipeline uses depth testing. */
if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
dynamic_state->ds.depth.compare_op != VK_COMPARE_OP_ALWAYS) {
sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
}
/* Pipeline uses stencil testing. */
if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
(dynamic_state->ds.stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
dynamic_state->ds.stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) {
sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
}
if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
compute_overlap)) {
uint32_t coefficient_size =
DIV_ROUND_UP(fs_data->common.coeffs,
ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE);
if (coefficient_size >
ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE)
sub_cmd->disable_compute_overlap = true;
}
sub_cmd->frag_uses_atomic_ops |= fs_data->common.uses.atomics;
sub_cmd->frag_has_side_effects |= fs_data->common.uses.side_effects;
sub_cmd->frag_uses_texture_rw |= false;
sub_cmd->vertex_uses_texture_rw |= false;
sub_cmd->job.get_vis_results = state->vis_test_enabled;
fstencil_keep =
(dynamic_state->ds.stencil.front.op.fail == VK_STENCIL_OP_KEEP) &&
(dynamic_state->ds.stencil.front.op.pass == VK_STENCIL_OP_KEEP);
bstencil_keep =
(dynamic_state->ds.stencil.back.op.fail == VK_STENCIL_OP_KEEP) &&
(dynamic_state->ds.stencil.back.op.pass == VK_STENCIL_OP_KEEP);
fstencil_writemask_zero = (dynamic_state->ds.stencil.front.write_mask == 0);
bstencil_writemask_zero = (dynamic_state->ds.stencil.back.write_mask == 0);
/* Set stencil modified flag if:
* - Neither front nor back-facing stencil has a fail_op/pass_op of KEEP.
* - Neither front nor back-facing stencil has a write_mask of zero.
*/
if (!(fstencil_keep && bstencil_keep) &&
!(fstencil_writemask_zero && bstencil_writemask_zero)) {
sub_cmd->modifies_stencil = true;
}
/* Set depth modified flag if depth write is enabled. */
if (dynamic_state->ds.depth.write_enable)
sub_cmd->modifies_depth = true;
/* If either the data or code changes for pds vertex attribs, regenerate the
* data segment.
*/
if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
state->dirty.draw_variant || state->dirty.draw_base_instance) {
enum pvr_pds_vertex_attrib_program_type prog_type;
const struct pvr_pds_attrib_program *program;
if (state->draw_state.draw_indirect)
prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
else if (state->draw_state.base_instance)
prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
else
prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;
program =
&gfx_pipeline->shader_state.vertex.pds_attrib_programs[prog_type];
state->pds_shader.info = &program->info;
state->pds_shader.code_offset = program->program.code_offset;
state->max_shared_regs =
MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));
pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
}
state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding;
state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
if (state->push_consts[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY].dirty) {
result = pvr_cmd_upload_push_consts(cmd_buffer,
PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY);
if (result != VK_SUCCESS)
return result;
state->dirty.vertex_descriptors = true;
}
if (state->push_consts[PVR_STAGE_ALLOCATION_FRAGMENT].dirty) {
result =
pvr_cmd_upload_push_consts(cmd_buffer, PVR_STAGE_ALLOCATION_FRAGMENT);
if (result != VK_SUCCESS)
return result;
state->dirty.fragment_descriptors = true;
}
/* Account for dirty descriptor set. */
/* TODO: It could be the case that there are no descriptors for a specific
* stage, or that the update descriptors aren't active for a particular
* stage. In such cases we could avoid regenerating the descriptor PDS
* program.
*/
state->dirty.vertex_descriptors |= state->dirty.gfx_desc_dirty;
state->dirty.fragment_descriptors |= state->dirty.gfx_desc_dirty;
if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS) ||
BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
BITSET_TEST(dynamic_state->dirty,
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) {
state->dirty.fragment_descriptors = true;
}
if (state->dirty.fragment_descriptors) {
result = pvr_setup_descriptor_mappings(
cmd_buffer,
PVR_STAGE_ALLOCATION_FRAGMENT,
&state->gfx_pipeline->shader_state.fragment.descriptor_state,
NULL,
&state->pds_fragment_descriptor_data_offset);
if (result != VK_SUCCESS) {
mesa_loge("Could not setup fragment descriptor mappings.");
return result;
}
}
if (state->dirty.vertex_descriptors) {
uint32_t pds_vertex_descriptor_data_offset;
result = pvr_setup_descriptor_mappings(
cmd_buffer,
PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
&state->gfx_pipeline->shader_state.vertex.descriptor_state,
NULL,
&pds_vertex_descriptor_data_offset);
if (result != VK_SUCCESS) {
mesa_loge("Could not setup vertex descriptor mappings.");
return result;
}
pvr_emit_dirty_pds_state(cmd_buffer,
sub_cmd,
pds_vertex_descriptor_data_offset);
}
result = pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
if (result != VK_SUCCESS)
return result;
pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);
vk_dynamic_graphics_state_clear_dirty(dynamic_state);
state->dirty.gfx_desc_dirty = false;
state->dirty.draw_base_instance = false;
state->dirty.draw_variant = false;
state->dirty.fragment_descriptors = false;
state->dirty.gfx_pipeline_binding = false;
state->dirty.isp_userpass = false;
state->dirty.vertex_bindings = false;
state->dirty.vis_test = false;
return VK_SUCCESS;
}
static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
{
switch (topology) {
case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST;
case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST;
case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP;
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST;
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP;
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN;
case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ;
case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ;
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ;
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ;
case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST;
default:
UNREACHABLE("Undefined primitive topology");
}
}
/* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */
/* Aligned to 128 bit for PDS loads / stores */
#define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8
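/* Descriptive note: writes the VDM control stream for an indirect draw. For
 * each draw in the indirect buffer a PDS program is generated and uploaded
 * which reads the indirect arguments at kick time and fills in a small dummy
 * control stream block, and the main control stream is then linked (with
 * return) to that block.
 */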
static VkResult
pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_csb *const csb,
pvr_dev_addr_t idx_buffer_addr,
uint32_t idx_stride,
struct ROGUE_VDMCTRL_INDEX_LIST0 *list_hdr,
struct pvr_buffer *buffer,
VkDeviceSize offset,
uint32_t count,
uint32_t stride)
{
struct pvr_pds_drawindirect_program pds_prog = { 0 };
uint32_t word0;
/* Draw indirect always has index offset and instance count. */
list_hdr->index_offset_present = true;
list_hdr->index_instance_count_present = true;
pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr);
pds_prog.support_base_instance = true;
pds_prog.arg_buffer = buffer->dev_addr.addr + offset;
pds_prog.index_buffer = idx_buffer_addr.addr;
pds_prog.index_block_header = word0;
pds_prog.index_stride = idx_stride;
pds_prog.num_views = 1U;
/* TODO: See if we can pre-upload the code section of all the pds programs
* and reuse them here.
*/
/* Generate and upload the PDS programs (code + data). */
for (uint32_t i = 0U; i < count; i++) {
const struct pvr_device_info *dev_info =
&cmd_buffer->device->pdevice->dev_info;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_suballoc_bo *dummy_bo;
struct pvr_suballoc_bo *pds_bo;
uint32_t *dummy_stream;
uint32_t *pds_base;
uint32_t pds_size;
VkResult result;
/* TODO: Move this outside the loop and allocate all of them in one go? */
result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
cmd_buffer->device->heaps.general_heap,
DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE,
&dummy_bo);
if (result != VK_SUCCESS)
return result;
pds_prog.increment_draw_id = (i != 0);
pds_prog.index_list_addr_buffer = dummy_bo->dev_addr.addr;
if (state->draw_state.draw_indexed) {
pvr_pds_generate_draw_elements_indirect(&pds_prog,
0,
PDS_GENERATE_SIZES,
dev_info);
} else {
pvr_pds_generate_draw_arrays_indirect(&pds_prog,
0,
PDS_GENERATE_SIZES,
dev_info);
}
pds_size = PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned +
pds_prog.program.code_size_aligned);
result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
cmd_buffer->device->heaps.pds_heap,
pds_size,
&pds_bo);
if (result != VK_SUCCESS)
return result;
pds_base = pvr_bo_suballoc_get_map_addr(pds_bo);
memcpy(pds_base,
pds_prog.program.code,
PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned));
if (state->draw_state.draw_indexed) {
pvr_pds_generate_draw_elements_indirect(
&pds_prog,
pds_base + pds_prog.program.code_size_aligned,
PDS_GENERATE_DATA_SEGMENT,
dev_info);
} else {
pvr_pds_generate_draw_arrays_indirect(
&pds_prog,
pds_base + pds_prog.program.code_size_aligned,
PDS_GENERATE_DATA_SEGMENT,
dev_info);
}
pvr_csb_set_relocation_mark(csb);
pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ANY;
state0.pds_temp_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.temp_size_aligned),
ROGUE_VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE);
state0.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned),
ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
}
pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
const uint32_t data_offset =
pds_bo->dev_addr.addr +
PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned) -
cmd_buffer->device->heaps.pds_heap->base_addr.addr;
state1.pds_data_addr = PVR_DEV_ADDR(data_offset);
state1.sd_type = ROGUE_VDMCTRL_SD_TYPE_PDS;
state1.sd_next_type = ROGUE_VDMCTRL_SD_TYPE_NONE;
}
pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
const uint32_t code_offset =
pds_bo->dev_addr.addr -
cmd_buffer->device->heaps.pds_heap->base_addr.addr;
state2.pds_code_addr = PVR_DEV_ADDR(code_offset);
}
pvr_csb_clear_relocation_mark(csb);
/* We don't really need to set the relocation mark since the following
* state update is just one emit but let's be nice and use it.
*/
pvr_csb_set_relocation_mark(csb);
/* Sync task to ensure the VDM doesn't start reading the dummy blocks
* before they are ready.
*/
pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
list0.primitive_topology = ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST;
}
pvr_csb_clear_relocation_mark(csb);
dummy_stream = pvr_bo_suballoc_get_map_addr(dummy_bo);
/* For non-indexed draw cmds fill in the dummy's header here (it won't
* change based on the indirect args); for indexed draws the PDS program
* writes it (see pds_prog.index_block_header). In both cases advance by
* the in-use size of each dummy block.
*/
if (!state->draw_state.draw_indexed) {
dummy_stream[0] = word0;
dummy_stream += 4;
} else {
dummy_stream += 5;
}
/* clang-format off */
pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word);
/* clang-format on */
pvr_csb_set_relocation_mark(csb);
/* Stream link to the first dummy which forces the VDM to discard any
* prefetched (dummy) control stream.
*/
pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) {
link.with_return = true;
link.link_addrmsb = dummy_bo->dev_addr;
}
pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) {
link.link_addrlsb = dummy_bo->dev_addr;
}
pvr_csb_clear_relocation_mark(csb);
/* Point the pds program to the next argument buffer and the next VDM
* dummy buffer.
*/
pds_prog.arg_buffer += stride;
}
return VK_SUCCESS;
}
#undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE
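/* Descriptive note: emits the VDMCTRL_INDEX_LIST0..4 words describing the
 * primitive stream for a draw: topology, index buffer address and size,
 * index/instance counts and index offset. Indirect draws are routed through
 * pvr_write_draw_indirect_vdm_stream() instead.
 */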
static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd,
VkPrimitiveTopology topology,
uint32_t index_offset,
uint32_t first_index,
uint32_t index_count,
uint32_t instance_count,
struct pvr_buffer *buffer,
VkDeviceSize offset,
uint32_t count,
uint32_t stride)
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
struct ROGUE_VDMCTRL_INDEX_LIST0 list_hdr = { pvr_cmd_header(
VDMCTRL_INDEX_LIST0) };
pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
struct pvr_csb *const csb = &sub_cmd->control_stream;
unsigned int index_stride = 0;
list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology);
/* firstInstance is not handled here in the VDM state, it's implemented as
* an addition in the PDS vertex fetch using
* PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type.
*/
list_hdr.index_count_present = true;
if (instance_count > 1)
list_hdr.index_instance_count_present = true;
if (index_offset)
list_hdr.index_offset_present = true;
if (state->draw_state.draw_indexed) {
list_hdr.index_size =
pvr_vdmctrl_index_size_from_type(state->index_buffer_binding.type);
index_stride = vk_index_type_to_bytes(state->index_buffer_binding.type);
index_buffer_addr = PVR_DEV_ADDR_OFFSET(
state->index_buffer_binding.buffer->dev_addr,
state->index_buffer_binding.offset + first_index * index_stride);
list_hdr.index_addr_present = true;
list_hdr.index_base_addrmsb = index_buffer_addr;
}
list_hdr.degen_cull_enable =
PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
vdm_degenerate_culling) &&
!vs_data->common.uses.side_effects;
if (state->draw_state.draw_indirect) {
assert(buffer);
pvr_write_draw_indirect_vdm_stream(cmd_buffer,
csb,
index_buffer_addr,
index_stride,
&list_hdr,
buffer,
offset,
count,
stride);
return;
}
pvr_csb_set_relocation_mark(csb);
pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
list0 = list_hdr;
}
if (list_hdr.index_addr_present) {
pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
list1.index_base_addrlsb = index_buffer_addr;
}
}
if (list_hdr.index_count_present) {
pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
list2.index_count = index_count;
}
}
if (list_hdr.index_instance_count_present) {
pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
list3.instance_count = instance_count - 1;
}
}
if (list_hdr.index_offset_present) {
pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
list4.index_offset = index_offset;
}
}
pvr_csb_clear_relocation_mark(csb);
}
void PVR_PER_ARCH(CmdDraw)(VkCommandBuffer commandBuffer,
uint32_t vertexCount,
uint32_t instanceCount,
uint32_t firstVertex,
uint32_t firstInstance)
{
const struct pvr_cmd_buffer_draw_state draw_state = {
.base_vertex = firstVertex,
.base_instance = firstInstance,
};
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
pvr_update_draw_state(state, &draw_state);
result = pvr_validate_draw_state(cmd_buffer);
if (result != VK_SUCCESS)
return;
/* Write the VDM control stream for the primitive. */
pvr_emit_vdm_index_list(cmd_buffer,
&state->current_sub_cmd->gfx,
dynamic_state->ia.primitive_topology,
firstVertex,
0U,
vertexCount,
instanceCount,
NULL,
0U,
0U,
0U);
}
void PVR_PER_ARCH(CmdDrawIndexed)(VkCommandBuffer commandBuffer,
uint32_t indexCount,
uint32_t instanceCount,
uint32_t firstIndex,
int32_t vertexOffset,
uint32_t firstInstance)
{
const struct pvr_cmd_buffer_draw_state draw_state = {
.base_vertex = vertexOffset,
.base_instance = firstInstance,
.draw_indexed = true,
};
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
pvr_update_draw_state(state, &draw_state);
result = pvr_validate_draw_state(cmd_buffer);
if (result != VK_SUCCESS)
return;
/* Write the VDM control stream for the primitive. */
pvr_emit_vdm_index_list(cmd_buffer,
&state->current_sub_cmd->gfx,
dynamic_state->ia.primitive_topology,
vertexOffset,
firstIndex,
indexCount,
instanceCount,
NULL,
0U,
0U,
0U);
}
void PVR_PER_ARCH(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
const struct pvr_cmd_buffer_draw_state draw_state = {
.draw_indirect = true,
.draw_indexed = true,
};
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
VK_FROM_HANDLE(pvr_buffer, buffer, _buffer);
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
pvr_update_draw_state(state, &draw_state);
result = pvr_validate_draw_state(cmd_buffer);
if (result != VK_SUCCESS)
return;
/* Write the VDM control stream for the primitive. */
pvr_emit_vdm_index_list(cmd_buffer,
&state->current_sub_cmd->gfx,
dynamic_state->ia.primitive_topology,
0U,
0U,
0U,
0U,
buffer,
offset,
drawCount,
stride);
}
void PVR_PER_ARCH(CmdDrawIndirect)(VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
const struct pvr_cmd_buffer_draw_state draw_state = {
.draw_indirect = true,
};
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
VK_FROM_HANDLE(pvr_buffer, buffer, _buffer);
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
pvr_update_draw_state(state, &draw_state);
result = pvr_validate_draw_state(cmd_buffer);
if (result != VK_SUCCESS)
return;
/* Write the VDM control stream for the primitive. */
pvr_emit_vdm_index_list(cmd_buffer,
&state->current_sub_cmd->gfx,
dynamic_state->ia.primitive_topology,
0U,
0U,
0U,
0U,
buffer,
offset,
drawCount,
stride);
}
void PVR_PER_ARCH(CmdEndRenderPass2)(VkCommandBuffer commandBuffer,
const VkSubpassEndInfo *pSubpassEndInfo)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_image_view **attachments;
VkClearValue *clear_values;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
assert(state->render_pass_info.pass);
assert(state->render_pass_info.framebuffer);
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
return;
result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer,
&state->render_pass_info);
if (result != VK_SUCCESS)
return;
/* Save the required fields before clearing render_pass_info struct. */
attachments = state->render_pass_info.attachments;
clear_values = state->render_pass_info.clear_values;
memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));
state->render_pass_info.attachments = attachments;
state->render_pass_info.clear_values = clear_values;
}
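/* Descriptive note: replays the secondary command buffer's deferred depth
 * bias/scissor (DBSC) commands against the primary: the scissor and depth
 * bias indices are rebased onto the primary's arrays and the corresponding
 * VDM/PPP words are patched, after which the secondary's depth bias and
 * scissor arrays are appended to the primary's.
 */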
static VkResult
pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_cmd_buffer *sec_cmd_buffer)
{
struct vk_dynamic_graphics_state *const dynamic_state =
&cmd_buffer->vk.dynamic_graphics_state;
const uint32_t prim_db_elems =
util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
struct pvr_depth_bias_state);
const uint32_t prim_scissor_elems =
util_dynarray_num_elements(&cmd_buffer->scissor_array,
struct pvr_scissor_words);
util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands,
struct pvr_deferred_cs_command,
cmd) {
switch (cmd->type) {
case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: {
const uint32_t scissor_idx =
prim_scissor_elems + cmd->dbsc.state.scissor_index;
const uint32_t db_idx =
prim_db_elems + cmd->dbsc.state.depthbias_index;
const uint32_t num_dwords =
pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC);
struct pvr_suballoc_bo *suballoc_bo;
uint32_t ppp_state[num_dwords];
VkResult result;
pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) {
header.pres_ispctl_dbsc = true;
}
pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) {
ispdbsc.dbindex = db_idx;
ispdbsc.scindex = scissor_idx;
}
result = pvr_cmd_buffer_upload_general(cmd_buffer,
&ppp_state[0],
sizeof(ppp_state),
&suballoc_bo);
if (result != VK_SUCCESS)
return result;
pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) {
state.word_count = num_dwords;
state.addrmsb = suballoc_bo->dev_addr;
}
pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) {
state.addrlsb = suballoc_bo->dev_addr;
}
break;
}
case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: {
const uint32_t scissor_idx =
prim_scissor_elems + cmd->dbsc2.state.scissor_index;
const uint32_t db_idx =
prim_db_elems + cmd->dbsc2.state.depthbias_index;
uint32_t *const addr =
(uint32_t *)pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo) +
cmd->dbsc2.patch_offset;
assert(pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo));
pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) {
ispdbsc.dbindex = db_idx;
ispdbsc.scindex = scissor_idx;
}
break;
}
default:
UNREACHABLE("Invalid deferred control stream command type.");
break;
}
}
util_dynarray_append_dynarray(&cmd_buffer->depth_bias_array,
&sec_cmd_buffer->depth_bias_array);
util_dynarray_append_dynarray(&cmd_buffer->scissor_array,
&sec_cmd_buffer->scissor_array);
BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
cmd_buffer->scissor_words = (struct pvr_scissor_words){ 0 };
return VK_SUCCESS;
}
/* Caller needs to make sure that it ends the current sub_cmd. This function
* only creates a copy of sec_sub_cmd and links it to the cmd_buffer's
* sub_cmd list.
*/
static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd *sec_sub_cmd)
{
struct pvr_sub_cmd *primary_sub_cmd =
vk_zalloc(&cmd_buffer->vk.pool->alloc,
sizeof(*primary_sub_cmd),
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!primary_sub_cmd) {
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
primary_sub_cmd->type = sec_sub_cmd->type;
primary_sub_cmd->owned = false;
list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds);
switch (sec_sub_cmd->type) {
case PVR_SUB_CMD_TYPE_GRAPHICS:
primary_sub_cmd->gfx = sec_sub_cmd->gfx;
break;
case PVR_SUB_CMD_TYPE_QUERY:
case PVR_SUB_CMD_TYPE_COMPUTE:
primary_sub_cmd->compute = sec_sub_cmd->compute;
break;
case PVR_SUB_CMD_TYPE_TRANSFER:
primary_sub_cmd->transfer = sec_sub_cmd->transfer;
break;
case PVR_SUB_CMD_TYPE_EVENT:
primary_sub_cmd->event = sec_sub_cmd->event;
break;
default:
UNREACHABLE("Unsupported sub-command type");
}
return VK_SUCCESS;
}
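/* Descriptive note: executes a secondary command buffer inside an active
 * render pass by merging its graphics sub-commands into the primary's
 * current one. Control streams are either copied (when deferred DBSC
 * patching is needed) or linked to, and query, depth/stencil and barrier
 * state is propagated to the primary.
 */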
static VkResult
pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_cmd_buffer *sec_cmd_buffer)
{
const struct pvr_device_info *dev_info =
&cmd_buffer->device->pdevice->dev_info;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd;
struct pvr_sub_cmd *first_sec_cmd;
VkResult result;
/* Inherited queries are not supported. */
assert(!state->vis_test_enabled);
if (list_is_empty(&sec_cmd_buffer->sub_cmds))
return VK_SUCCESS;
first_sec_cmd =
list_first_entry(&sec_cmd_buffer->sub_cmds, struct pvr_sub_cmd, link);
/* Kick a render if the query pool (and hence its base address) changes. */
if (primary_sub_cmd->gfx.query_pool && first_sec_cmd->gfx.query_pool &&
primary_sub_cmd->gfx.query_pool != first_sec_cmd->gfx.query_pool) {
state->current_sub_cmd->gfx.barrier_store = true;
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
return result;
result =
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
if (result != VK_SUCCESS)
return result;
primary_sub_cmd = state->current_sub_cmd;
/* Use existing render setup, but load color attachments from HW
* Background object.
*/
primary_sub_cmd->gfx.barrier_load = true;
primary_sub_cmd->gfx.barrier_store = false;
}
list_for_each_entry (struct pvr_sub_cmd,
sec_sub_cmd,
&sec_cmd_buffer->sub_cmds,
link) {
/* Only graphics secondary execution supported within a renderpass. */
assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
if (!sec_sub_cmd->gfx.empty_cmd)
primary_sub_cmd->gfx.empty_cmd = false;
if (sec_sub_cmd->gfx.query_pool) {
primary_sub_cmd->gfx.query_pool = sec_sub_cmd->gfx.query_pool;
util_dynarray_append_dynarray(&state->query_indices,
&sec_sub_cmd->gfx.sec_query_indices);
}
if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) {
/* TODO: If the secondary buffer was created with
* VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, then we patch the
* stream and copy it to the primary stream using pvr_csb_copy below.
* This will need locking if the same secondary command buffer is
* executed in multiple primary buffers at the same time.
*/
result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
if (result != VK_SUCCESS)
return result;
result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream,
&sec_sub_cmd->gfx.control_stream);
if (result != VK_SUCCESS)
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
} else {
result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
if (result != VK_SUCCESS)
return result;
pvr_csb_emit_link(
&primary_sub_cmd->gfx.control_stream,
pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream),
true);
}
if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
compute_overlap)) {
primary_sub_cmd->gfx.job.disable_compute_overlap |=
sec_sub_cmd->gfx.job.disable_compute_overlap;
}
primary_sub_cmd->gfx.max_tiles_in_flight =
MIN2(primary_sub_cmd->gfx.max_tiles_in_flight,
sec_sub_cmd->gfx.max_tiles_in_flight);
/* Pass loaded depth/stencil usage from secondary command buffer. */
if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
/* Pass depth/stencil modification state from secondary command buffer. */
if (sec_sub_cmd->gfx.modifies_depth)
primary_sub_cmd->gfx.modifies_depth = true;
if (sec_sub_cmd->gfx.modifies_stencil)
primary_sub_cmd->gfx.modifies_stencil = true;
if (sec_sub_cmd->gfx.barrier_store) {
struct pvr_sub_cmd *sec_next =
list_entry(sec_sub_cmd->link.next, struct pvr_sub_cmd, link);
/* This shouldn't be the last sub cmd. There should be a barrier load
* subsequent to the barrier store.
*/
assert(list_last_entry(&sec_cmd_buffer->sub_cmds,
struct pvr_sub_cmd,
link) != sec_sub_cmd);
/* Kick render to store stencil. */
state->current_sub_cmd->gfx.barrier_store = true;
state->current_sub_cmd->gfx.empty_cmd = false;
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
return result;
result =
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
if (result != VK_SUCCESS)
return result;
primary_sub_cmd = state->current_sub_cmd;
/* Use existing render setup, but load color attachments from HW
* Background object.
*/
primary_sub_cmd->gfx.barrier_load = sec_next->gfx.barrier_load;
primary_sub_cmd->gfx.barrier_store = sec_next->gfx.barrier_store;
primary_sub_cmd->gfx.empty_cmd = false;
}
if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) {
list_splicetail(&sec_cmd_buffer->deferred_clears,
&cmd_buffer->deferred_clears);
}
}
return VK_SUCCESS;
}
void PVR_PER_ARCH(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
uint32_t commandBufferCount,
const VkCommandBuffer *pCommandBuffers)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_cmd_buffer *last_cmd_buffer;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
/* Reset the CPU copy of the most recent PPP state of the primary command
* buffer.
*
* The next draw call in the primary after CmdExecuteCommands may send
* redundant state, if it all goes in the same geom job.
*
* Can't just copy state from the secondary because the recording state of
* the secondary command buffers would have been deleted at this point.
*/
pvr_reset_graphics_dirty_state(cmd_buffer, false);
if (state->current_sub_cmd &&
state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
for (uint32_t i = 0; i < commandBufferCount; i++) {
VK_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
result = pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer);
if (result != VK_SUCCESS)
return;
}
last_cmd_buffer =
pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
/* Set barriers from the final secondary command buffer. */
for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) {
state->barriers_needed[i] |=
last_cmd_buffer->state.barriers_needed[i] &
PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS;
}
} else {
for (uint32_t i = 0; i < commandBufferCount; i++) {
VK_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
return;
list_for_each_entry_safe (struct pvr_sub_cmd,
sec_sub_cmd,
&sec_cmd_buffer->sub_cmds,
link) {
result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd);
if (result != VK_SUCCESS)
return;
}
}
last_cmd_buffer =
pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
memcpy(state->barriers_needed,
last_cmd_buffer->state.barriers_needed,
sizeof(state->barriers_needed));
}
}
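/* Descriptive note: emits a full-screen transparent object using the
 * device's static clear PPP template and the NOP PDS program, then resets
 * the PPP dirty tracking. This is used to flush outstanding tags, e.g. for a
 * frag-to-frag barrier or when the ISP user-pass counter wraps.
 */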
static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_sub_cmd_gfx *const sub_cmd)
{
struct pvr_device *const device = cmd_buffer->device;
/* Yes we want a copy. The user could be recording multiple command buffers
* in parallel so writing the template in place could cause problems.
*/
struct pvr_static_clear_ppp_template clear =
device->static_clear_state->ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
uint32_t pds_state[PVR_PDS_STATE_LENGTH] = { 0 };
struct pvr_csb *csb = &sub_cmd->control_stream;
struct pvr_suballoc_bo *ppp_bo;
assert(clear.requires_pds_state);
/* Patch the template. */
pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
TA_STATE_PDS_SHADERBASE,
shaderbase) {
shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset);
}
clear.config.pds_state = &pds_state;
clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass;
/* Emit PPP state from template. */
pvr_emit_ppp_from_template(csb, &clear, &ppp_bo);
list_add(&ppp_bo->link, &cmd_buffer->bo_list);
/* Emit VDM state. */
pvr_emit_clear_words(cmd_buffer, sub_cmd);
/* Reset graphics state. */
pvr_reset_graphics_dirty_state(cmd_buffer, false);
}
static inline struct pvr_render_subpass *
pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state)
{
const uint32_t subpass_idx = state->render_pass_info.subpass_idx;
return &state->render_pass_info.pass->subpasses[subpass_idx];
}
void PVR_PER_ARCH(CmdNextSubpass2)(VkCommandBuffer commandBuffer,
const VkSubpassBeginInfo *pSubpassBeginInfo,
const VkSubpassEndInfo *pSubpassEndInfo)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_render_pass_info *rp_info = &state->render_pass_info;
const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
struct pvr_renderpass_hwsetup_render *next_hw_render;
const struct pvr_render_pass *pass = rp_info->pass;
const struct pvr_renderpass_hw_map *current_map;
const struct pvr_renderpass_hw_map *next_map;
struct pvr_load_op *hw_subpass_load_op;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx];
next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1];
next_hw_render = &pass->hw_setup->renders[next_map->render];
if (current_map->render != next_map->render) {
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
return;
result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info);
if (result != VK_SUCCESS)
return;
rp_info->current_hw_subpass = next_map->render;
result =
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
if (result != VK_SUCCESS)
return;
rp_info->enable_bg_tag = false;
rp_info->process_empty_tiles = false;
/* If this subpass contains any load ops the HW Background Object must be
* run to do the clears/loads.
*/
if (next_hw_render->color_init_count > 0) {
rp_info->enable_bg_tag = true;
if (pass->multiview_enabled) {
/* TODO: A more optimized fix would be to selectively enable empty
* tile processing for renders depending on whether the render is
* the first use of a particular view in a renderpass. This would be
* a much larger and more invasive change but could be considered if
* empty tile processing in multiview workloads becomes a
* performance issue.
*/
rp_info->process_empty_tiles = true;
} else {
for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) {
/* Empty tiles need to be cleared too. */
if (next_hw_render->color_init[i].op ==
VK_ATTACHMENT_LOAD_OP_CLEAR) {
rp_info->process_empty_tiles = true;
break;
}
}
}
}
/* Set isp_userpass to zero for new hw_render. This will be used to set
* ROGUE_CR_ISP_CTL::upass_start.
*/
rp_info->isp_userpass = 0;
}
hw_subpass = &next_hw_render->subpasses[next_map->subpass];
hw_subpass_load_op = hw_subpass->load_op;
if (hw_subpass_load_op) {
result = pvr_cs_write_load_op(cmd_buffer,
&state->current_sub_cmd->gfx,
hw_subpass_load_op,
rp_info->isp_userpass);
if (result != VK_SUCCESS)
return;
}
/* Pipelines are created for a particular subpass so unbind but leave the
* vertex and descriptor bindings intact as they are orthogonal to the
* subpass.
*/
state->gfx_pipeline = NULL;
/* User-pass spawn is 4 bits so if the driver has to wrap it, it will emit a
* full screen transparent object to flush all tags up until now, then the
* user-pass spawn value will implicitly be reset to 0 because
* pvr_render_subpass::isp_userpass values are stored ANDed with
* ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX.
*/
/* If hw_subpass_load_op is valid then pvr_write_load_op_control_stream
* has already done a full-screen transparent object.
*/
if (rp_info->isp_userpass == ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX &&
!hw_subpass_load_op) {
pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx);
}
rp_info->subpass_idx++;
rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass;
state->dirty.isp_userpass = true;
rp_info->pipeline_bind_point =
pass->subpasses[rp_info->subpass_idx].pipeline_bind_point;
pvr_stash_depth_format(state, &state->current_sub_cmd->gfx);
}
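/* Descriptive note: returns true if the current subpass also reads the
 * stencil aspect of its depth/stencil attachment as an input attachment,
 * i.e. a stencil self-dependency.
 */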
static bool
pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state)
{
const struct pvr_render_subpass *const current_subpass =
pvr_get_current_subpass(state);
const struct pvr_render_input_attachment *const input_attachments =
current_subpass->input_attachments;
if (current_subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
return false;
/* We only need to check the current software subpass as we don't support
* merging to/from a subpass with self-dep stencil.
*/
for (uint32_t i = 0; i < current_subpass->input_count; i++) {
if (input_attachments[i].attachment_idx ==
current_subpass->depth_stencil_attachment) {
return input_attachments[i].aspect_mask & VK_IMAGE_ASPECT_STENCIL_BIT;
}
}
return false;
}
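/* Descriptive note: decides whether a pipeline barrier issued inside a
 * render pass expresses a stencil self-dependency (checked via the memory
 * and image barriers), in which case the caller splits the fragment work and
 * stores then reloads the stencil around the barrier.
 */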
static bool pvr_is_stencil_store_load_needed(
const struct pvr_cmd_buffer *const cmd_buffer,
VkPipelineStageFlags2 vk_src_stage_mask,
VkPipelineStageFlags2 vk_dst_stage_mask,
uint32_t memory_barrier_count,
const VkMemoryBarrier2 *const memory_barriers,
uint32_t image_barrier_count,
const VkImageMemoryBarrier2 *const image_barriers)
{
const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const uint32_t fragment_test_stages =
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
const struct pvr_render_pass *const pass = state->render_pass_info.pass;
const struct pvr_renderpass_hwsetup_render *hw_render;
struct pvr_image_view **const attachments =
state->render_pass_info.attachments;
const struct pvr_image_view *attachment;
uint32_t hw_render_idx;
if (!pass)
return false;
hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx;
hw_render =
pvr_pass_info_get_hw_render(&state->render_pass_info, hw_render_idx);
if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED)
return false;
if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
attachment = attachments[hw_render->ds_attach_idx];
} else {
assert(!attachments);
attachment = NULL;
}
if (!(vk_src_stage_mask & fragment_test_stages) &&
vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
return false;
for (uint32_t i = 0; i < memory_barrier_count; i++) {
const uint32_t stencil_write_bit =
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
const uint32_t input_attachment_read_bit =
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
if (!(memory_barriers[i].srcAccessMask & stencil_write_bit))
continue;
if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit))
continue;
return pvr_stencil_has_self_dependency(state);
}
for (uint32_t i = 0; i < image_barrier_count; i++) {
VK_FROM_HANDLE(pvr_image, image, image_barriers[i].image);
const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT;
if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit))
continue;
if (attachment && image != vk_to_pvr_image(attachment->vk.image))
continue;
if (!vk_format_has_stencil(image->vk.format))
continue;
return pvr_stencil_has_self_dependency(state);
}
return false;
}
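/* Descriptive note: splits the current graphics sub-command around a barrier
 * event. The first half stores its output (barrier_store), a barrier event
 * sub-command is recorded, and a new graphics sub-command is started which
 * reloads from the HW background object (barrier_load) while keeping the
 * existing dynamic render info.
 */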
static VkResult
pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
uint32_t src_stage_mask,
uint32_t dst_stage_mask)
{
struct pvr_sub_cmd *prev_sub_cmd = cmd_buffer->state.current_sub_cmd;
VkResult result;
assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false;
/* Submit graphics job to store stencil. */
cmd_buffer->state.current_sub_cmd->gfx.barrier_store = true;
pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
if (result != VK_SUCCESS)
return result;
cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
.type = PVR_EVENT_TYPE_BARRIER,
.barrier = {
.wait_for_stage_mask = src_stage_mask,
.wait_at_stage_mask = dst_stage_mask,
},
};
pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
cmd_buffer->state.current_sub_cmd->gfx.dr_info = prev_sub_cmd->gfx.dr_info;
prev_sub_cmd->gfx.dr_info = NULL;
assert(cmd_buffer->state.current_sub_cmd->gfx.dr_info ==
cmd_buffer->state.render_pass_info.dr_info);
/* Use existing render setup, but load color attachments from HW BGOBJ */
cmd_buffer->state.current_sub_cmd->gfx.barrier_load = true;
cmd_buffer->state.current_sub_cmd->gfx.barrier_store = false;
cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false;
cmd_buffer->state.current_sub_cmd->is_dynamic_render =
prev_sub_cmd->is_dynamic_render;
cmd_buffer->state.current_sub_cmd->is_suspend = prev_sub_cmd->is_suspend;
return VK_SUCCESS;
}
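/* Descriptive note: records a standalone barrier event sub-command between
 * the given source and destination stage masks.
 */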
static VkResult
pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
uint32_t src_stage_mask,
uint32_t dst_stage_mask)
{
VkResult result;
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
if (result != VK_SUCCESS)
return result;
cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
.type = PVR_EVENT_TYPE_BARRIER,
.barrier = {
.wait_for_stage_mask = src_stage_mask,
.wait_at_stage_mask = dst_stage_mask,
},
};
return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}
/* This is just enough to handle vkCmdPipelineBarrier().
* TODO: Complete?
*/
void PVR_PER_ARCH(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
const VkDependencyInfo *pDependencyInfo)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
const struct pvr_render_pass *const render_pass =
state->render_pass_info.pass;
VkPipelineStageFlags vk_src_stage_mask = 0U;
VkPipelineStageFlags vk_dst_stage_mask = 0U;
bool is_stencil_store_load_needed;
uint32_t required_stage_mask = 0U;
uint32_t src_stage_mask;
uint32_t dst_stage_mask;
bool is_barrier_needed;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
}
for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
vk_src_stage_mask |=
pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
vk_dst_stage_mask |=
pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
}
for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
vk_src_stage_mask |=
pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
vk_dst_stage_mask |=
pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
}
src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
if (!(dst_stage_mask & BITFIELD_BIT(stage)))
continue;
required_stage_mask |= state->barriers_needed[stage];
}
src_stage_mask &= required_stage_mask;
for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
if (!(dst_stage_mask & BITFIELD_BIT(stage)))
continue;
state->barriers_needed[stage] &= ~src_stage_mask;
}
if (src_stage_mask == 0 || dst_stage_mask == 0) {
is_barrier_needed = false;
} else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
/* This is implicit so no need to barrier. */
is_barrier_needed = false;
} else if (src_stage_mask == dst_stage_mask &&
util_bitcount(src_stage_mask) == 1) {
struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;
switch (src_stage_mask) {
case PVR_PIPELINE_STAGE_FRAG_BIT:
is_barrier_needed = false;
if (!render_pass)
break;
assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
/* Flush all fragment work up to this point. */
pvr_insert_transparent_obj(cmd_buffer, &current_sub_cmd->gfx);
break;
case PVR_PIPELINE_STAGE_COMPUTE_BIT:
is_barrier_needed = false;
if (!current_sub_cmd ||
current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
break;
}
/* Multiple dispatches can be merged into a single job. When back to
* back dispatches have a sequential dependency (Compute -> compute
* pipeline barrier) we need to do the following.
* - Dispatch a kernel which fences all previous memory writes and
* flushes the MADD cache.
* - Issue a compute fence which ensures all previous tasks emitted
* by the compute data master are completed before starting
* anything new.
*/
/* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
* for data.
*/
pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);
pvr_compute_generate_fence(cmd_buffer,
&current_sub_cmd->compute,
false);
break;
default:
is_barrier_needed = false;
break;
}
} else {
is_barrier_needed = true;
}
is_stencil_store_load_needed =
pvr_is_stencil_store_load_needed(cmd_buffer,
vk_src_stage_mask,
vk_dst_stage_mask,
pDependencyInfo->memoryBarrierCount,
pDependencyInfo->pMemoryBarriers,
pDependencyInfo->imageMemoryBarrierCount,
pDependencyInfo->pImageMemoryBarriers);
if (is_stencil_store_load_needed) {
assert(render_pass);
result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer,
src_stage_mask,
dst_stage_mask);
if (result != VK_SUCCESS)
mesa_loge("Failed to insert mid frag barrier event.");
} else if (is_barrier_needed) {
result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer,
src_stage_mask,
dst_stage_mask);
if (result != VK_SUCCESS)
mesa_loge("Failed to insert pipeline barrier event.");
}
}
void PVR_PER_ARCH(CmdResetEvent2)(VkCommandBuffer commandBuffer,
VkEvent _event,
VkPipelineStageFlags2 stageMask)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(pvr_event, event, _event);
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
if (result != VK_SUCCESS)
return;
cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
.type = PVR_EVENT_TYPE_RESET,
.set_reset = {
.event = event,
.wait_for_stage_mask = pvr_stage_mask_src(stageMask),
},
};
pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}
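/* Record the event set as a stand-alone event sub-command. The source stages
 * collected from the dependency info are translated to PVR sync stages and
 * stored so the set can be ordered against prior work at submission time.
 */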
void PVR_PER_ARCH(CmdSetEvent2)(VkCommandBuffer commandBuffer,
VkEvent _event,
const VkDependencyInfo *pDependencyInfo)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(pvr_event, event, _event);
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
if (result != VK_SUCCESS)
return;
VkPipelineStageFlags2 stage_mask =
vk_collect_dependency_info_src_stages(pDependencyInfo);
cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
.type = PVR_EVENT_TYPE_SET,
.set_reset = {
.event = event,
.wait_for_stage_mask = pvr_stage_mask_dst(stage_mask),
},
};
pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}
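/* Record a wait on a set of events as a single event sub-command. */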
void PVR_PER_ARCH(CmdWaitEvents2)(VkCommandBuffer commandBuffer,
uint32_t eventCount,
const VkEvent *pEvents,
const VkDependencyInfo *pDependencyInfos)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_event **events_array;
uint32_t *stage_masks;
VkResult result;
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
VK_MULTIALLOC(ma);
vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount);
vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount);
if (!vk_multialloc_alloc(&ma,
&cmd_buffer->vk.pool->alloc,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
return;
}
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
if (result != VK_SUCCESS) {
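/* events_array is the first allocation in the multialloc, so freeing it
 * also releases stage_masks.
 */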
vk_free(&cmd_buffer->vk.pool->alloc, events_array);
return;
}
memcpy(events_array, pEvents, sizeof(*events_array) * eventCount);
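/* Gather each event's destination stages from its dependency info and
 * translate them into the PVR stage mask that will wait on that event.
 */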
for (uint32_t i = 0; i < eventCount; i++) {
const VkDependencyInfo *info = &pDependencyInfos[i];
VkPipelineStageFlags2 mask = 0;
for (uint32_t j = 0; j < info->memoryBarrierCount; j++)
mask |= info->pMemoryBarriers[j].dstStageMask;
for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++)
mask |= info->pBufferMemoryBarriers[j].dstStageMask;
for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++)
mask |= info->pImageMemoryBarriers[j].dstStageMask;
stage_masks[i] = pvr_stage_mask_dst(mask);
}
cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
.type = PVR_EVENT_TYPE_WAIT,
.wait = {
.count = eventCount,
.events = events_array,
.wait_at_stage_masks = stage_masks,
},
};
pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}
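/* Timestamp queries are not supported by this driver, so this entry point
 * should never be reached.
 */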
void PVR_PER_ARCH(CmdWriteTimestamp2)(VkCommandBuffer commandBuffer,
VkPipelineStageFlags2 stage,
VkQueryPool queryPool,
uint32_t query)
{
UNREACHABLE("Timestamp queries are not supported.");
}
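/* Finish recording: if an error was already recorded just end the command
 * buffer, otherwise close any open sub-command before ending.
 */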
VkResult PVR_PER_ARCH(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
VK_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
VkResult result;
if (vk_command_buffer_has_error(&cmd_buffer->vk))
return vk_command_buffer_end(&cmd_buffer->vk);
/* TODO: We should be freeing all the resources allocated for recording
 * here.
 */
util_dynarray_fini(&state->query_indices);
result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
if (result != VK_SUCCESS)
pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
return vk_command_buffer_end(&cmd_buffer->vk);
}