pvr, pco: switch to clc query shaders

Signed-off-by: Simon Perretta <simon.perretta@imgtec.com>
Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37439>
Author: Simon Perretta, 2025-08-07 20:14:55 +01:00
parent 3fd3d7ee69
commit 6dd0a5ee2d
15 changed files with 465 additions and 573 deletions


@@ -2761,3 +2761,12 @@ intrinsic("uvsw_write_pco", src_comp=[1, 0], bit_sizes=[32])
# load_vtxin_pco(offset)
intrinsic("load_vtxin_pco", src_comp=[1], dest_comp=0, bit_sizes=[32])
# load_coeff_pco(offset)
intrinsic("load_coeff_pco", src_comp=[1], dest_comp=0, bit_sizes=[32])
# dma_ld_pco(address)
intrinsic("dma_ld_pco", src_comp=[2], dest_comp=0, flags=[CAN_ELIMINATE], bit_sizes=[32])
# dma_st_pco(address_data)
intrinsic("dma_st_pco", src_comp=[0], bit_sizes=[32])


@@ -273,6 +273,7 @@ lower_alu_instr_width(nir_builder *b, nir_instr *instr, void *_data)
case nir_op_unpack_snorm_2x16:
case nir_op_mqsad_4x8:
case nir_op_uadd64_32:
case nir_op_umad64_32:
/* There is no scalar version of these ops, unless we were to break it
* down to bitshifts and math (which is definitely not intended).
*/


@@ -645,6 +645,12 @@ dst.x = sum & 0xffffffff;
dst.y = sum >> 32;
""")
opcode("umad64_32", 2, tuint32, [1, 1, 1, 1], [tuint32, tuint32, tuint32, tuint32], False, "", """
uint64_t sum = ((uint64_t)src0.x * (uint64_t)src1.x) + ((uint64_t)src3.x << 32 | (uint64_t)src2.x);
dst.x = sum & 0xffffffff;
dst.y = sum >> 32;
""")
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)


@@ -0,0 +1,61 @@
/*
* Copyright © 2025 Imagination Technologies Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PVR_IFACE_H
#define PVR_IFACE_H
/**
* \file pvr_iface.h
*
* \brief USC program interface.
*/
/** Query availability shader data; shared registers. */
enum pvr_query_availability_data {
PVR_QUERY_AVAILABILITY_DATA_INDEX_COUNT,
PVR_QUERY_AVAILABILITY_DATA_INDEX_BO_LO,
PVR_QUERY_AVAILABILITY_DATA_INDEX_BO_HI,
PVR_QUERY_AVAILABILITY_DATA_BO_LO,
PVR_QUERY_AVAILABILITY_DATA_BO_HI,
_PVR_QUERY_AVAILABILITY_DATA_COUNT,
};
/** Query copy shader data; shared registers. */
enum pvr_query_copy_data {
PVR_QUERY_COPY_DATA_INDEX_COUNT,
PVR_QUERY_COPY_DATA_DEST_BO_LO,
PVR_QUERY_COPY_DATA_DEST_BO_HI,
PVR_QUERY_COPY_DATA_AVAILABILITY_BO_LO,
PVR_QUERY_COPY_DATA_AVAILABILITY_BO_HI,
PVR_QUERY_COPY_DATA_RESULT_BO_LO,
PVR_QUERY_COPY_DATA_RESULT_BO_HI,
PVR_QUERY_COPY_DATA_DEST_STRIDE,
PVR_QUERY_COPY_DATA_FLAGS,
_PVR_QUERY_COPY_DATA_COUNT,
};
/** Query reset shader data; shared registers. */
enum pvr_query_reset_data {
PVR_QUERY_RESET_DATA_INDEX_COUNT,
PVR_QUERY_RESET_DATA_RESULT_BO_LO,
PVR_QUERY_RESET_DATA_RESULT_BO_HI,
PVR_QUERY_RESET_DATA_AVAILABILITY_BO_LO,
PVR_QUERY_RESET_DATA_AVAILABILITY_BO_HI,
_PVR_QUERY_RESET_DATA_COUNT,
};
#endif /* PVR_IFACE_H */
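These enum values double as dword indices into the constant buffer the driver uploads into the shaders' shared registers. A sketch of the host side for the availability pass, mirroring the const_buffer writes in pvr_add_query_program() later in this commit, with the upload plumbing elided:

#include <stdint.h>
#include "pvr_iface.h"

/* Sketch: fill the shared-register constants consumed by
 * cs_query_availability_common. */
static void
fill_availability_data(uint32_t *const_buffer, uint32_t num_query_indices,
                       uint64_t index_addr, uint64_t avail_addr)
{
   const_buffer[PVR_QUERY_AVAILABILITY_DATA_INDEX_COUNT] = num_query_indices;
   const_buffer[PVR_QUERY_AVAILABILITY_DATA_INDEX_BO_LO] = index_addr & 0xffffffff;
   const_buffer[PVR_QUERY_AVAILABILITY_DATA_INDEX_BO_HI] = index_addr >> 32;
   const_buffer[PVR_QUERY_AVAILABILITY_DATA_BO_LO] = avail_addr & 0xffffffff;
   const_buffer[PVR_QUERY_AVAILABILITY_DATA_BO_HI] = avail_addr >> 32;
}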


@@ -263,10 +263,11 @@ static pco_instr *trans_uvsw_write(trans_ctx *tctx,
return pco_uvsw_write(&tctx->b, data_src, vtxout_addr, .rpt = chans);
}
static pco_instr *trans_load_vtxin(trans_ctx *tctx,
nir_intrinsic_instr *intr,
pco_ref dest,
UNUSED pco_ref offset_src)
static pco_instr *trans_load_reg(trans_ctx *tctx,
nir_intrinsic_instr *intr,
pco_ref dest,
UNUSED pco_ref offset_src,
enum pco_reg_class class)
{
unsigned chans = pco_ref_get_chans(dest);
@@ -274,7 +275,7 @@ static pco_instr *trans_load_vtxin(trans_ctx *tctx,
/* TODO: support indexed source offset. */
assert(nir_src_is_const(*noffset_src));
unsigned offset = nir_src_as_uint(*noffset_src);
pco_ref src = pco_ref_hwreg_vec(offset, PCO_REG_CLASS_VTXIN, chans);
pco_ref src = pco_ref_hwreg_vec(offset, class, chans);
return pco_mov(&tctx->b, dest, src, .rpt = chans);
}
@@ -1691,7 +1692,11 @@ static pco_instr *trans_intr(trans_ctx *tctx, nir_intrinsic_instr *intr)
break;
case nir_intrinsic_load_vtxin_pco:
instr = trans_load_vtxin(tctx, intr, dest, src[0]);
instr = trans_load_reg(tctx, intr, dest, src[0], PCO_REG_CLASS_VTXIN);
break;
case nir_intrinsic_load_coeff_pco:
instr = trans_load_reg(tctx, intr, dest, src[0], PCO_REG_CLASS_COEFF);
break;
case nir_intrinsic_load_output:
@@ -1796,6 +1801,35 @@ static pco_instr *trans_intr(trans_ctx *tctx, nir_intrinsic_instr *intr)
instr = trans_scratch(tctx, dest, src[1], src[0]);
break;
case nir_intrinsic_dma_ld_pco: {
unsigned chans = pco_ref_get_chans(dest);
instr = pco_ld(&tctx->b,
dest,
pco_ref_drc(PCO_DRC_0),
pco_ref_imm8(chans),
src[0]);
break;
}
case nir_intrinsic_dma_st_pco: {
unsigned chans = pco_ref_get_chans(src[0]) - 2;
pco_ref data_comp =
pco_ref_new_ssa(tctx->func, pco_ref_get_bits(src[0]), chans);
pco_comp(&tctx->b, data_comp, src[0], pco_ref_val16(2));
instr = pco_st32(&tctx->b,
data_comp,
pco_ref_drc(PCO_DRC_0),
pco_ref_imm8(chans),
src[0],
pco_ref_null());
break;
}
/* Vertex sysvals. */
case nir_intrinsic_load_vertex_id:
case nir_intrinsic_load_instance_id:
@@ -2675,6 +2709,28 @@ static pco_instr *trans_alu(trans_ctx *tctx, nir_alu_instr *alu)
break;
}
case nir_op_umad64_32: {
pco_ref dest_comps[2] = {
[0] = pco_ref_new_ssa32(tctx->func),
[1] = pco_ref_new_ssa32(tctx->func),
};
pco_imadd64(&tctx->b,
dest_comps[0],
dest_comps[1],
src[0],
src[1],
src[2],
src[3],
pco_ref_null());
/* TODO: mark this vec as being non-contiguous,
* add pass for expanding.
*/
instr = pco_trans_nir_vec(tctx, dest, 2, dest_comps);
break;
}
case nir_op_imul:
instr = pco_imul32(&tctx->b, dest, src[0], src[1], pco_ref_null());
break;
@@ -2713,6 +2769,16 @@ static pco_instr *trans_alu(trans_ctx *tctx, nir_alu_instr *alu)
pco_ref_null());
break;
case nir_op_imad:
instr = pco_imadd32(&tctx->b,
dest,
src[0],
src[1],
src[2],
pco_ref_null(),
.s = true);
break;
/* Set-on (float) comparisons. */
case nir_op_slt:
case nir_op_sge:


@@ -4,6 +4,7 @@
*/
#include "libcl.h"
#include "compiler/libcl/libcl_vk.h"
KERNEL(1)
vs_nop_common(void)
@@ -38,3 +39,115 @@ vs_passthrough_rta_common(void)
vs_passthrough_common();
nir_uvsw_write_pco(4, nir_load_vtxin_pco(1, 3));
}
/* TODO: uint index = cl_global_id.x;
* instead of this function once things
* are properly hooked up.
*/
static inline uint
query_calc_global_id(void)
{
uint local_invoc_index = nir_load_vtxin_pco(1, 0);
local_invoc_index &= get_local_size(0) - 1;
uint wg_id = nir_load_coeff_pco(1, 0);
return nir_imad(wg_id, get_local_size(0), local_invoc_index);
}
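This reconstructs get_global_id(0) from the registers the hardware pre-loads: the local invocation index arrives in vtxin register 0 (masked with local_size - 1, which assumes a power-of-two local size) and the workgroup id in coeff register 0. For a local size of 32, workgroup 3 and local index 5 yield 3 * 32 + 5 = 101. Plain C equivalent:

/* Plain C equivalent of query_calc_global_id(); assumes a power-of-two
 * local size. */
static unsigned
query_global_id_ref(unsigned wg_id, unsigned local_size, unsigned raw_local)
{
   unsigned local_index = raw_local & (local_size - 1);
   return wg_id * local_size + local_index; /* e.g. 3 * 32 + 5 = 101 */
}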
/* TODO: support parameter passing. */
/* TODO: switch to common implementation. */
KERNEL(32)
cs_query_availability_common(void)
{
uint index_count = nir_load_preamble(1, PVR_QUERY_AVAILABILITY_DATA_INDEX_COUNT, 0);
uint index_base_addr_lo = nir_load_preamble(1, PVR_QUERY_AVAILABILITY_DATA_INDEX_BO_LO, 0);
uint index_base_addr_hi = nir_load_preamble(1, PVR_QUERY_AVAILABILITY_DATA_INDEX_BO_HI, 0);
uint avail_base_addr_lo = nir_load_preamble(1, PVR_QUERY_AVAILABILITY_DATA_BO_LO, 0);
uint avail_base_addr_hi = nir_load_preamble(1, PVR_QUERY_AVAILABILITY_DATA_BO_HI, 0);
uint index = query_calc_global_id();
if (index < index_count) {
uint2 index_addr = nir_uadd64_32(index_base_addr_lo, index_base_addr_hi, index * sizeof(uint32_t));
uint offset = nir_dma_ld_pco(1, index_addr);
uint2 avail_addr = nir_uadd64_32(avail_base_addr_lo, avail_base_addr_hi, offset * sizeof(uint32_t));
nir_dma_st_pco(avail_addr, ~0U);
}
}
KERNEL(32)
cs_query_copy_common(void)
{
uint index_count = nir_load_preamble(1, PVR_QUERY_COPY_DATA_INDEX_COUNT, 0);
uint dest_base_addr_lo = nir_load_preamble(1, PVR_QUERY_COPY_DATA_DEST_BO_LO, 0);
uint dest_base_addr_hi = nir_load_preamble(1, PVR_QUERY_COPY_DATA_DEST_BO_HI, 0);
uint avail_base_addr_lo = nir_load_preamble(1, PVR_QUERY_COPY_DATA_AVAILABILITY_BO_LO, 0);
uint avail_base_addr_hi = nir_load_preamble(1, PVR_QUERY_COPY_DATA_AVAILABILITY_BO_HI, 0);
uint result_base_addr_lo = nir_load_preamble(1, PVR_QUERY_COPY_DATA_RESULT_BO_LO, 0);
uint result_base_addr_hi = nir_load_preamble(1, PVR_QUERY_COPY_DATA_RESULT_BO_HI, 0);
uint dest_stride = nir_load_preamble(1, PVR_QUERY_COPY_DATA_DEST_STRIDE, 0);
uint flags = nir_load_preamble(1, PVR_QUERY_COPY_DATA_FLAGS, 0);
uint index = query_calc_global_id();
if (index < index_count) {
uint2 avail_addr = nir_uadd64_32(avail_base_addr_lo, avail_base_addr_hi, index * sizeof(uint32_t));
uint available = nir_dma_ld_pco(1, avail_addr);
uint2 dest_addr = nir_umad64_32(dest_stride, index, dest_base_addr_lo, dest_base_addr_hi);
if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
uint2 result_addr = nir_uadd64_32(result_base_addr_lo, result_base_addr_hi, index * sizeof(uint32_t));
uint result = nir_dma_ld_pco(1, result_addr);
/* TODO: for 64/32-bit writes, just prep the 64-bit one and set the burst-length variably. */
if (flags & VK_QUERY_RESULT_64_BIT) {
/* TODO: check if data should be (result, 0) or (0, result) */
nir_dma_st_pco(dest_addr, result, 0);
} else {
nir_dma_st_pco(dest_addr, result);
}
}
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
if (flags & VK_QUERY_RESULT_64_BIT) {
dest_addr = nir_uadd64_32(dest_addr.x, dest_addr.y, sizeof(uint64_t));
/* TODO: check if data should be (available, 0) or (0, available) */
nir_dma_st_pco(dest_addr, available, 0);
} else {
dest_addr = nir_uadd64_32(dest_addr.x, dest_addr.y, sizeof(uint32_t));
nir_dma_st_pco(dest_addr, available);
}
}
}
}
KERNEL(32)
cs_query_reset_common(void)
{
uint index_count = nir_load_preamble(1, PVR_QUERY_RESET_DATA_INDEX_COUNT, 0);
uint result_base_addr_lo = nir_load_preamble(1, PVR_QUERY_RESET_DATA_RESULT_BO_LO, 0);
uint result_base_addr_hi = nir_load_preamble(1, PVR_QUERY_RESET_DATA_RESULT_BO_HI, 0);
uint avail_base_addr_lo = nir_load_preamble(1, PVR_QUERY_RESET_DATA_AVAILABILITY_BO_LO, 0);
uint avail_base_addr_hi = nir_load_preamble(1, PVR_QUERY_RESET_DATA_AVAILABILITY_BO_HI, 0);
uint index = query_calc_global_id();
if (index < index_count) {
uint2 result_addr = nir_uadd64_32(result_base_addr_lo, result_base_addr_hi, index * sizeof(uint32_t));
nir_dma_st_pco(result_addr, 0);
uint2 avail_addr = nir_uadd64_32(avail_base_addr_lo, avail_base_addr_hi, index * sizeof(uint32_t));
nir_dma_st_pco(avail_addr, 0);
}
}
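All three kernels share one addressing pattern: the per-query words live at base + index * sizeof(uint32_t), and each invocation handles a single query index. The reset kernel's effect, modelled as host-side C (a sketch of the semantics, not driver code):

#include <stdint.h>

/* What one cs_query_reset_common invocation does. */
static void
query_reset_ref(uint32_t *results, uint32_t *availability,
                uint32_t index, uint32_t index_count)
{
   if (index < index_count) {
      results[index] = 0;      /* query value */
      availability[index] = 0; /* availability word */
   }
}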


@@ -6,6 +6,7 @@
#ifndef PCO_LIBCL_H
#define PCO_LIBCL_H
#include "common/pvr_iface.h"
#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "pco/pco_common.h"
@@ -58,4 +59,39 @@ uint3 nir_load_vtxin_pco__3(uint offset);
uint4 nir_load_vtxin_pco__4(uint offset);
#define nir_load_vtxin_pco(n, ...) CAT2(nir_load_vtxin_pco__, n)(__VA_ARGS__)
uint nir_load_coeff_pco__1(uint offset);
uint2 nir_load_coeff_pco__2(uint offset);
uint3 nir_load_coeff_pco__3(uint offset);
uint4 nir_load_coeff_pco__4(uint offset);
#define nir_load_coeff_pco(n, ...) CAT2(nir_load_coeff_pco__, n)(__VA_ARGS__)
uint nir_load_preamble__1(uint base, uint preamble_class);
uint4 nir_load_preamble__4(uint base, uint preamble_class);
#define nir_load_preamble(n, ...) CAT2(nir_load_preamble__, n)(__VA_ARGS__)
void nir_store_preamble(uint data, uint base, uint preamble_class);
uint nir_dma_ld_pco__1(uint2 addr);
uint2 nir_dma_ld_pco__2(uint2 addr);
uint3 nir_dma_ld_pco__3(uint2 addr);
uint4 nir_dma_ld_pco__4(uint2 addr);
#define nir_dma_ld_pco(n, ...) CAT2(nir_dma_ld_pco__, n)(__VA_ARGS__)
void nir_dma_st_pco__1(uint3 addr_data);
void nir_dma_st_pco__2(uint4 addr_data);
#define SELECT_ARGS_ST(addr, ...) \
((CAT2(uint, NUM_ARGS_PLUS_2(__VA_ARGS__)))(addr, __VA_ARGS__))
/* clang-format off */
#define nir_dma_st_pco(addr, ...) SELECT_NAME(nir_dma_st_pco, __, __VA_ARGS__)SELECT_ARGS_ST(addr, __VA_ARGS__)
/* clang-format on */
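The variadic store macro packs the address pair and the payload into one vector and dispatches on the payload size. NUM_ARGS_PLUS_2 and SELECT_NAME are defined outside this diff, so the expansions below are assumptions based on their names and on the call sites in the query kernels above:

/* Assumed expansions:
 * nir_dma_st_pco(avail_addr, ~0U)
 *    -> nir_dma_st_pco__1((uint3)(avail_addr, ~0U));
 * nir_dma_st_pco(dest_addr, result, 0)
 *    -> nir_dma_st_pco__2((uint4)(dest_addr, result, 0));
 */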
uint2 nir_uadd64_32(uint lo, uint hi, uint offset);
uint nir_imad(uint a, uint b, uint c);
uint2 nir_umad64_32(uint a, uint b, uint lo, uint hi);
#endif /* PCO_LIBCL_H */


@@ -104,7 +104,7 @@ static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
break;
case PVR_SUB_CMD_TYPE_COMPUTE:
case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
case PVR_SUB_CMD_TYPE_QUERY:
pvr_csb_finish(&sub_cmd->compute.control_stream);
break;
@@ -286,9 +286,9 @@ static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
break;
case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
case PVR_SUB_CMD_TYPE_QUERY:
case PVR_SUB_CMD_TYPE_TRANSFER:
/* Compute jobs are used for occlusion queries but to copy the results we
/* Compute jobs are used for queries but to copy the results we
* have to sync with transfer jobs because vkCmdCopyQueryPoolResults() is
* deemed as a transfer operation by the spec.
*/
@@ -674,7 +674,8 @@ static VkResult pvr_setup_texture_state_words(
pvr_csb_pack (&descriptor->sampler.words[1],
TEXSTATE_SAMPLER_WORD1,
sampler) {}
sampler) {
}
return VK_SUCCESS;
}
@@ -1086,7 +1087,7 @@ static void pvr_setup_pbe_state(
break;
}
#define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v) - 1 : 0)
#define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v)-1 : 0)
render_params.min_x_clip = MAX2(0, render_area->offset.x);
render_params.min_y_clip = MAX2(0, render_area->offset.y);
@@ -2205,7 +2206,7 @@ VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
query_pool = gfx_sub_cmd->query_pool;
}
gfx_sub_cmd->has_occlusion_query = true;
gfx_sub_cmd->has_query = true;
util_dynarray_clear(&state->query_indices);
}
@@ -2256,7 +2257,7 @@ VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
break;
}
case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
case PVR_SUB_CMD_TYPE_QUERY:
case PVR_SUB_CMD_TYPE_COMPUTE: {
struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
@@ -2331,7 +2332,7 @@ VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
.type = PVR_EVENT_TYPE_BARRIER,
.barrier = {
.wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_QUERY_BIT,
},
};
@@ -2487,7 +2488,7 @@ VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
util_dynarray_init(&sub_cmd->gfx.sec_query_indices, NULL);
break;
case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
case PVR_SUB_CMD_TYPE_QUERY:
case PVR_SUB_CMD_TYPE_COMPUTE:
pvr_csb_init(device,
PVR_CMD_STREAM_TYPE_COMPUTE,
@@ -3895,7 +3896,8 @@ static VkResult pvr_setup_descriptor_mappings(
pvr_csb_pack (&point_sampler_words[1],
TEXSTATE_SAMPLER_WORD1,
sampler) {}
sampler) {
}
struct pvr_suballoc_bo *point_sampler_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
@@ -3930,7 +3932,8 @@ static VkResult pvr_setup_descriptor_mappings(
pvr_csb_pack (&ia_sampler_words[1],
TEXSTATE_SAMPLER_WORD1,
sampler) {}
sampler) {
}
struct pvr_suballoc_bo *ia_sampler_bo;
result = pvr_cmd_buffer_upload_general(cmd_buffer,
@@ -7121,7 +7124,7 @@ static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
primary_sub_cmd->gfx = sec_sub_cmd->gfx;
break;
case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
case PVR_SUB_CMD_TYPE_QUERY:
case PVR_SUB_CMD_TYPE_COMPUTE:
primary_sub_cmd->compute = sec_sub_cmd->compute;
break;


@@ -88,7 +88,7 @@ enum pvr_sub_cmd_type {
PVR_SUB_CMD_TYPE_GRAPHICS,
PVR_SUB_CMD_TYPE_COMPUTE,
PVR_SUB_CMD_TYPE_TRANSFER,
PVR_SUB_CMD_TYPE_OCCLUSION_QUERY,
PVR_SUB_CMD_TYPE_QUERY,
PVR_SUB_CMD_TYPE_EVENT,
};
@@ -110,7 +110,7 @@ enum pvr_job_type {
PVR_JOB_TYPE_FRAG,
PVR_JOB_TYPE_COMPUTE,
PVR_JOB_TYPE_TRANSFER,
PVR_JOB_TYPE_OCCLUSION_QUERY,
PVR_JOB_TYPE_QUERY,
PVR_JOB_TYPE_MAX
};
@@ -128,8 +128,7 @@ enum pvr_pipeline_stage_bits {
/* Note that this doesn't map to VkPipelineStageFlagBits so be careful with
* this.
*/
PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT =
BITFIELD_BIT(PVR_JOB_TYPE_OCCLUSION_QUERY),
PVR_PIPELINE_STAGE_QUERY_BIT = BITFIELD_BIT(PVR_JOB_TYPE_QUERY),
};
#define PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS \


@@ -196,8 +196,8 @@ struct pvr_device {
/* Compute shaders for queries. */
struct pvr_compute_query_shader availability_shader;
struct pvr_compute_query_shader *copy_results_shaders;
struct pvr_compute_query_shader *reset_queries_shaders;
struct pvr_compute_query_shader reset_queries_shader;
struct pvr_compute_query_shader copy_results_shader;
struct pvr_suballocator suballoc_general;
struct pvr_suballocator suballoc_pds;
@@ -510,7 +510,7 @@ struct pvr_sub_cmd_gfx {
*/
bool frag_uses_texture_rw;
bool has_occlusion_query;
bool has_query;
bool wait_on_previous_transfer;


@@ -288,7 +288,7 @@ void pvr_CmdResetQueryPool(VkCommandBuffer commandBuffer,
.type = PVR_EVENT_TYPE_BARRIER,
.barrier = {
.wait_for_stage_mask = PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_QUERY_BIT,
},
};
@@ -307,7 +307,7 @@ void pvr_CmdResetQueryPool(VkCommandBuffer commandBuffer,
cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
.type = PVR_EVENT_TYPE_BARRIER,
.barrier = {
.wait_for_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
.wait_for_stage_mask = PVR_PIPELINE_STAGE_QUERY_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS,
},
};
@@ -371,7 +371,7 @@ void pvr_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
.type = PVR_EVENT_TYPE_BARRIER,
.barrier = {
.wait_for_stage_mask = PVR_PIPELINE_STAGE_TRANSFER_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_QUERY_BIT,
},
};
@@ -388,7 +388,7 @@ void pvr_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
.type = PVR_EVENT_TYPE_BARRIER,
.barrier = {
.wait_for_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
.wait_for_stage_mask = PVR_PIPELINE_STAGE_QUERY_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_TRANSFER_BIT,
},
};


@@ -28,19 +28,21 @@
#include <string.h>
#include <vulkan/vulkan.h>
#include "common/pvr_iface.h"
#include "hwdef/rogue_hw_utils.h"
#include "pco_uscgen_programs.h"
#include "pvr_bo.h"
#include "pvr_formats.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "usc/programs/pvr_shader_factory.h"
#include "usc/programs/pvr_static_shaders.h"
#include "pvr_tex_state.h"
#include "pvr_types.h"
#include "vk_alloc.h"
#include "vk_command_pool.h"
#include "vk_util.h"
/* TODO: multicore support/awareness. */
static inline void pvr_init_primary_compute_pds_program(
struct pvr_pds_compute_shader_program *program)
{
@@ -52,10 +54,10 @@ static inline void pvr_init_primary_compute_pds_program(
program->kick_usc = true;
}
static VkResult pvr_create_compute_secondary_prog(
struct pvr_device *device,
const struct pvr_shader_factory_info *shader_factory_info,
struct pvr_compute_query_shader *query_prog)
static VkResult
pvr_create_compute_secondary_prog(struct pvr_device *device,
unsigned const_shared_regs,
struct pvr_compute_query_shader *query_prog)
{
const size_t size =
pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
@@ -79,8 +81,8 @@ static VkResult pvr_create_compute_secondary_prog(
.buffer_id = 0,
.source_offset = 0,
.type = PVR_BUFFER_TYPE_COMPILE_TIME,
.size_in_dwords = shader_factory_info->const_shared_regs,
.destination = shader_factory_info->explicit_const_start_offset,
.size_in_dwords = const_shared_regs,
.destination = 0,
}
},
};
@@ -133,26 +135,24 @@ pvr_destroy_compute_secondary_prog(struct pvr_device *device,
vk_free(&device->vk.alloc, program->info.entries);
}
static VkResult pvr_create_compute_query_program(
static VkResult pvr_create_compute_query_precomp_program(
struct pvr_device *device,
const struct pvr_shader_factory_info *shader_factory_info,
enum pco_usclib_program common_program_index,
unsigned const_shared_regs,
struct pvr_compute_query_shader *query_prog)
{
const uint32_t cache_line_size =
rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
struct pvr_pds_compute_shader_program pds_primary_prog = { 0 };
const pco_precomp_data *precomp_data;
VkResult result;
memset(query_prog, 0, sizeof(*query_prog));
/* No support for query constant calc program. */
assert(shader_factory_info->const_calc_prog_inst_bytes == 0);
/* No support for query coefficient update program. */
assert(shader_factory_info->coeff_update_prog_start == PVR_INVALID_INST);
precomp_data = (pco_precomp_data *)pco_usclib_common[common_program_index];
result = pvr_gpu_upload_usc(device,
shader_factory_info->shader_code,
shader_factory_info->code_size,
precomp_data->binary,
precomp_data->size_dwords * sizeof(uint32_t),
cache_line_size,
&query_prog->usc_bo);
if (result != VK_SUCCESS)
@@ -162,7 +162,7 @@ static VkResult pvr_create_compute_query_program(
pvr_pds_setup_doutu(&pds_primary_prog.usc_task_control,
query_prog->usc_bo->dev_addr.addr,
shader_factory_info->temps_required,
precomp_data->temps,
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
false);
@@ -176,9 +176,8 @@ static VkResult pvr_create_compute_query_program(
query_prog->primary_data_size_dw = pds_primary_prog.data_size;
query_prog->primary_num_temps = pds_primary_prog.temps_used;
result = pvr_create_compute_secondary_prog(device,
shader_factory_info,
query_prog);
result =
pvr_create_compute_secondary_prog(device, const_shared_regs, query_prog);
if (result != VK_SUCCESS)
goto err_free_pds_prim_code_bo;
@@ -224,7 +223,9 @@ static VkResult pvr_write_compute_query_pds_data_section(
* not needed. If it's needed we should probably be using LITERAL entries for
* this instead.
*/
#if !defined(NDEBUG)
memset(dword_buffer, 0xFE, PVR_DW_TO_BYTES(info->data_size_in_dwords));
#endif /* !defined(NDEBUG) */
pipeline->pds_shared_update_data_size_dw = info->data_size_in_dwords;
@@ -321,7 +322,7 @@ static void pvr_write_private_compute_dispatch(
1,
};
assert(sub_cmd->type == PVR_SUB_CMD_TYPE_OCCLUSION_QUERY);
assert(sub_cmd->type == PVR_SUB_CMD_TYPE_QUERY);
pvr_compute_update_shared_private(cmd_buffer, &sub_cmd->compute, pipeline);
pvr_compute_update_kernel_private(cmd_buffer,
@@ -340,90 +341,41 @@ pvr_destroy_compute_query_program(struct pvr_device *device,
pvr_bo_suballoc_free(program->usc_bo);
}
static VkResult pvr_create_multibuffer_compute_query_program(
struct pvr_device *device,
const struct pvr_shader_factory_info *const *shader_factory_info,
struct pvr_compute_query_shader *query_programs)
{
const uint32_t core_count = device->pdevice->dev_runtime_info.core_count;
VkResult result;
uint32_t i;
for (i = 0; i < core_count; i++) {
result = pvr_create_compute_query_program(device,
shader_factory_info[i],
&query_programs[i]);
if (result != VK_SUCCESS)
goto err_destroy_compute_query_program;
}
return VK_SUCCESS;
err_destroy_compute_query_program:
for (uint32_t j = 0; j < i; j++)
pvr_destroy_compute_query_program(device, &query_programs[j]);
return result;
}
VkResult pvr_device_create_compute_query_programs(struct pvr_device *device)
{
const uint32_t core_count = device->pdevice->dev_runtime_info.core_count;
VkResult result;
result = pvr_create_compute_query_program(device,
&availability_query_write_info,
&device->availability_shader);
result = pvr_create_compute_query_precomp_program(
device,
CS_QUERY_AVAILABILITY_COMMON,
_PVR_QUERY_AVAILABILITY_DATA_COUNT,
&device->availability_shader);
if (result != VK_SUCCESS)
return result;
device->copy_results_shaders =
vk_alloc(&device->vk.alloc,
sizeof(*device->copy_results_shaders) * core_count,
8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!device->copy_results_shaders) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
result =
pvr_create_compute_query_precomp_program(device,
CS_QUERY_COPY_COMMON,
_PVR_QUERY_COPY_DATA_COUNT,
&device->copy_results_shader);
if (result != VK_SUCCESS)
goto err_destroy_availability_query_program;
}
result = pvr_create_multibuffer_compute_query_program(
device,
copy_query_results_collection,
device->copy_results_shaders);
result =
pvr_create_compute_query_precomp_program(device,
CS_QUERY_RESET_COMMON,
_PVR_QUERY_RESET_DATA_COUNT,
&device->reset_queries_shader);
if (result != VK_SUCCESS)
goto err_vk_free_copy_results_shaders;
device->reset_queries_shaders =
vk_alloc(&device->vk.alloc,
sizeof(*device->reset_queries_shaders) * core_count,
8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!device->reset_queries_shaders) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto err_destroy_copy_results_query_programs;
}
result = pvr_create_multibuffer_compute_query_program(
device,
reset_query_collection,
device->reset_queries_shaders);
if (result != VK_SUCCESS)
goto err_vk_free_reset_queries_shaders;
goto err_destroy_copy_results_query_program;
return VK_SUCCESS;
err_vk_free_reset_queries_shaders:
vk_free(&device->vk.alloc, device->reset_queries_shaders);
err_destroy_copy_results_query_programs:
for (uint32_t i = 0; i < core_count; i++) {
pvr_destroy_compute_query_program(device,
&device->copy_results_shaders[i]);
}
err_vk_free_copy_results_shaders:
vk_free(&device->vk.alloc, device->copy_results_shaders);
err_destroy_copy_results_query_program:
pvr_destroy_compute_query_program(device, &device->copy_results_shader);
err_destroy_availability_query_program:
pvr_destroy_compute_query_program(device, &device->availability_shader);
@@ -433,53 +385,9 @@ err_destroy_availability_query_program:
void pvr_device_destroy_compute_query_programs(struct pvr_device *device)
{
const uint32_t core_count = device->pdevice->dev_runtime_info.core_count;
pvr_destroy_compute_query_program(device, &device->availability_shader);
for (uint32_t i = 0; i < core_count; i++) {
pvr_destroy_compute_query_program(device,
&device->copy_results_shaders[i]);
pvr_destroy_compute_query_program(device,
&device->reset_queries_shaders[i]);
}
vk_free(&device->vk.alloc, device->copy_results_shaders);
vk_free(&device->vk.alloc, device->reset_queries_shaders);
}
static void pvr_init_tex_info(const struct pvr_device_info *dev_info,
struct pvr_texture_state_info *tex_info,
uint32_t width,
pvr_dev_addr_t addr)
{
const VkFormat vk_format = VK_FORMAT_R32_UINT;
const uint8_t *swizzle_arr = pvr_get_format_swizzle(vk_format);
bool is_view_1d = !PVR_HAS_FEATURE(dev_info, tpu_extended_integer_lookup) &&
!PVR_HAS_FEATURE(dev_info, tpu_image_state_v2);
*tex_info = (struct pvr_texture_state_info){
.format = vk_format,
.mem_layout = PVR_MEMLAYOUT_LINEAR,
.flags = PVR_TEXFLAGS_INDEX_LOOKUP,
.type = is_view_1d ? VK_IMAGE_VIEW_TYPE_1D : VK_IMAGE_VIEW_TYPE_2D,
.is_cube = false,
.tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
.extent = { .width = width, .height = 1, .depth = 0 },
.array_size = 1,
.base_level = 0,
.mip_levels = 1,
.mipmaps_present = false,
.sample_count = 1,
.stride = width,
.offset = 0,
.swizzle = { [0] = swizzle_arr[0],
[1] = swizzle_arr[1],
[2] = swizzle_arr[2],
[3] = swizzle_arr[3] },
.addr = addr,
};
pvr_destroy_compute_query_program(device, &device->copy_results_shader);
pvr_destroy_compute_query_program(device, &device->reset_queries_shader);
}
/* TODO: Split this function into per program type functions. */
@@ -487,33 +395,16 @@ VkResult pvr_add_query_program(struct pvr_cmd_buffer *cmd_buffer,
const struct pvr_query_info *query_info)
{
struct pvr_device *device = cmd_buffer->device;
const uint32_t core_count = device->pdevice->dev_runtime_info.core_count;
const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
const struct pvr_shader_factory_info *shader_factory_info;
uint64_t sampler_state[ROGUE_NUM_TEXSTATE_SAMPLER_WORDS];
struct pvr_image_descriptor image_descriptor;
const struct pvr_compute_query_shader *query_prog;
struct pvr_private_compute_pipeline pipeline;
const uint32_t buffer_count = core_count;
struct pvr_texture_state_info tex_info;
uint32_t num_query_indices;
uint32_t *const_buffer;
struct pvr_suballoc_bo *pvr_bo;
VkResult result;
pvr_csb_pack (&sampler_state[0U], TEXSTATE_SAMPLER_WORD0, reg) {
reg.addrmode_u = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
reg.addrmode_v = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
reg.addrmode_w = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
reg.minfilter = ROGUE_TEXSTATE_FILTER_POINT;
reg.magfilter = ROGUE_TEXSTATE_FILTER_POINT;
reg.non_normalized_coords = true;
reg.dadjust = ROGUE_TEXSTATE_DADJUST_ZERO_UINT;
}
/* clang-format off */
pvr_csb_pack (&sampler_state[1], TEXSTATE_SAMPLER_WORD1, sampler_word1) {}
/* clang-format on */
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_QUERY);
if (result != VK_SUCCESS)
return result;
switch (query_info->type) {
case PVR_QUERY_TYPE_AVAILABILITY_WRITE:
@@ -521,33 +412,28 @@ VkResult pvr_add_query_program(struct pvr_cmd_buffer *cmd_buffer,
* value in availability_bo at every index in index_bo.
*/
query_prog = &device->availability_shader;
shader_factory_info = &availability_query_write_info;
num_query_indices = query_info->availability_write.num_query_indices;
pipeline.const_shared_regs_count = _PVR_QUERY_AVAILABILITY_DATA_COUNT;
break;
case PVR_QUERY_TYPE_COPY_QUERY_RESULTS:
/* Adds a compute shader to copy availability and query value data. */
query_prog = &device->copy_results_shaders[buffer_count - 1];
shader_factory_info = copy_query_results_collection[buffer_count - 1];
query_prog = &device->copy_results_shader;
num_query_indices = query_info->copy_query_results.query_count;
pipeline.const_shared_regs_count = _PVR_QUERY_COPY_DATA_COUNT;
break;
case PVR_QUERY_TYPE_RESET_QUERY_POOL:
/* Adds a compute shader to reset availability and query value data. */
query_prog = &device->reset_queries_shaders[buffer_count - 1];
shader_factory_info = reset_query_collection[buffer_count - 1];
query_prog = &device->reset_queries_shader;
num_query_indices = query_info->reset_query_pool.query_count;
pipeline.const_shared_regs_count = _PVR_QUERY_RESET_DATA_COUNT;
break;
default:
UNREACHABLE("Invalid query type");
}
result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer,
PVR_SUB_CMD_TYPE_OCCLUSION_QUERY);
if (result != VK_SUCCESS)
return result;
pipeline.pds_code_offset = query_prog->pds_prim_code.code_offset;
pipeline.pds_data_offset = query_prog->pds_prim_code.data_offset;
@@ -556,82 +442,34 @@ VkResult pvr_add_query_program(struct pvr_cmd_buffer *cmd_buffer,
pipeline.pds_data_size_dw = query_prog->primary_data_size_dw;
pipeline.pds_temps_used = query_prog->primary_num_temps;
pipeline.coeff_regs_count = shader_factory_info->coeff_regs;
pipeline.unified_store_regs_count = shader_factory_info->input_regs;
pipeline.const_shared_regs_count = shader_factory_info->const_shared_regs;
/* TODO: set properly. */
pipeline.coeff_regs_count = 3;
pipeline.unified_store_regs_count = 8;
const_buffer =
vk_alloc(&cmd_buffer->vk.pool->alloc,
PVR_DW_TO_BYTES(shader_factory_info->const_shared_regs),
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
const_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
PVR_DW_TO_BYTES(pipeline.const_shared_regs_count),
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!const_buffer) {
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
/* clang-format off */
#define DRIVER_CONST(index) \
assert(shader_factory_info->driver_const_location_map[index] < \
shader_factory_info->const_shared_regs); \
const_buffer[shader_factory_info->driver_const_location_map[index]]
/* clang-format on */
switch (query_info->type) {
case PVR_QUERY_TYPE_AVAILABILITY_WRITE: {
uint64_t image_sampler_state[3][ROGUE_NUM_TEXSTATE_SAMPLER_WORDS];
uint32_t image_sampler_idx = 0;
uint64_t index_addr =
query_info->availability_write.index_bo->dev_addr.addr;
memcpy(&image_sampler_state[image_sampler_idx][0],
&sampler_state[0],
sizeof(sampler_state));
image_sampler_idx++;
uint64_t avail_addr =
query_info->availability_write.availability_bo->dev_addr.addr;
pvr_init_tex_info(dev_info,
&tex_info,
num_query_indices,
query_info->availability_write.index_bo->dev_addr);
const_buffer[PVR_QUERY_AVAILABILITY_DATA_INDEX_COUNT] = num_query_indices;
const_buffer[PVR_QUERY_AVAILABILITY_DATA_INDEX_BO_LO] = index_addr &
0xffffffff;
const_buffer[PVR_QUERY_AVAILABILITY_DATA_INDEX_BO_HI] = index_addr >> 32;
const_buffer[PVR_QUERY_AVAILABILITY_DATA_BO_LO] = avail_addr & 0xffffffff;
const_buffer[PVR_QUERY_AVAILABILITY_DATA_BO_HI] = avail_addr >> 32;
result = pvr_pack_tex_state(device, &tex_info, &image_descriptor);
memcpy(&image_sampler_state[image_sampler_idx][0],
image_descriptor.words,
sizeof(image_descriptor.words));
if (result != VK_SUCCESS) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
image_sampler_idx++;
pvr_init_tex_info(
dev_info,
&tex_info,
query_info->availability_write.num_queries,
query_info->availability_write.availability_bo->dev_addr);
result = pvr_pack_tex_state(device, &tex_info, &image_descriptor);
memcpy(&image_sampler_state[image_sampler_idx][0],
image_descriptor.words,
sizeof(image_descriptor.words));
if (result != VK_SUCCESS) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
image_sampler_idx++;
memcpy(&const_buffer[0],
&image_sampler_state[0][0],
sizeof(image_sampler_state));
/* Only PVR_QUERY_AVAILABILITY_WRITE_COUNT driver consts allowed. */
assert(shader_factory_info->num_driver_consts ==
PVR_QUERY_AVAILABILITY_WRITE_COUNT);
DRIVER_CONST(PVR_QUERY_AVAILABILITY_WRITE_INDEX_COUNT) =
num_query_indices;
break;
}
@@ -642,94 +480,44 @@ VkResult pvr_add_query_program(struct pvr_cmd_buffer *cmd_buffer,
PVR_FROM_HANDLE(pvr_buffer,
buffer,
query_info->copy_query_results.dst_buffer);
const uint32_t image_sampler_state_arr_size =
(buffer_count + 2) * ROGUE_NUM_TEXSTATE_SAMPLER_WORDS;
uint32_t image_sampler_idx = 0;
pvr_dev_addr_t addr;
uint64_t offset;
STACK_ARRAY(uint64_t, image_sampler_state, image_sampler_state_arr_size);
if (!image_sampler_state) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
#define SAMPLER_ARR_2D(_arr, _i, _j) \
_arr[_i * ROGUE_NUM_TEXSTATE_SAMPLER_WORDS + _j]
memcpy(&SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0),
&sampler_state[0],
sizeof(sampler_state));
image_sampler_idx++;
offset = query_info->copy_query_results.first_query * sizeof(uint32_t);
addr = PVR_DEV_ADDR_OFFSET(pool->availability_buffer->dev_addr, offset);
pvr_init_tex_info(dev_info, &tex_info, num_query_indices, addr);
result = pvr_pack_tex_state(device, &tex_info, &image_descriptor);
memcpy(&SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0),
image_descriptor.words,
sizeof(image_descriptor.words));
if (result != VK_SUCCESS) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
image_sampler_idx++;
for (uint32_t i = 0; i < buffer_count; i++) {
addr = PVR_DEV_ADDR_OFFSET(pool->result_buffer->dev_addr,
offset + i * pool->result_stride);
pvr_init_tex_info(dev_info, &tex_info, num_query_indices, addr);
result = pvr_pack_tex_state(device, &tex_info, &image_descriptor);
memcpy(&SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0),
image_descriptor.words,
sizeof(image_descriptor.words));
if (result != VK_SUCCESS) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
image_sampler_idx++;
}
memcpy(&const_buffer[0],
&SAMPLER_ARR_2D(image_sampler_state, 0, 0),
image_sampler_state_arr_size * sizeof(image_sampler_state[0]));
STACK_ARRAY_FINISH(image_sampler_state);
/* Only PVR_COPY_QUERY_POOL_RESULTS_COUNT driver consts allowed. */
assert(shader_factory_info->num_driver_consts ==
PVR_COPY_QUERY_POOL_RESULTS_COUNT);
pvr_dev_addr_t dev_addr;
/* Assert if no memory is bound to destination buffer. */
assert(buffer->dev_addr.addr);
addr = buffer->dev_addr;
addr.addr += query_info->copy_query_results.dst_offset;
uint64_t offset =
query_info->copy_query_results.first_query * sizeof(uint32_t);
DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_INDEX_COUNT) = num_query_indices;
DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_BASE_ADDRESS_LOW) = addr.addr &
0xFFFFFFFF;
DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_BASE_ADDRESS_HIGH) = addr.addr >>
32;
DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_DEST_STRIDE) =
dev_addr = PVR_DEV_ADDR_OFFSET(buffer->dev_addr,
query_info->copy_query_results.dst_offset);
uint64_t dest_addr = dev_addr.addr;
dev_addr =
PVR_DEV_ADDR_OFFSET(pool->availability_buffer->dev_addr, offset);
uint64_t avail_addr = dev_addr.addr;
dev_addr = PVR_DEV_ADDR_OFFSET(pool->result_buffer->dev_addr, offset);
uint64_t result_addr = dev_addr.addr;
const_buffer[PVR_QUERY_COPY_DATA_INDEX_COUNT] = num_query_indices;
const_buffer[PVR_QUERY_COPY_DATA_DEST_BO_LO] = dest_addr & 0xffffffff;
const_buffer[PVR_QUERY_COPY_DATA_DEST_BO_HI] = dest_addr >> 32;
const_buffer[PVR_QUERY_COPY_DATA_AVAILABILITY_BO_LO] = avail_addr &
0xffffffff;
const_buffer[PVR_QUERY_COPY_DATA_AVAILABILITY_BO_HI] = avail_addr >> 32;
const_buffer[PVR_QUERY_COPY_DATA_RESULT_BO_LO] = result_addr & 0xffffffff;
const_buffer[PVR_QUERY_COPY_DATA_RESULT_BO_HI] = result_addr >> 32;
const_buffer[PVR_QUERY_COPY_DATA_DEST_STRIDE] =
query_info->copy_query_results.stride;
DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_PARTIAL_RESULT_FLAG) =
query_info->copy_query_results.flags & VK_QUERY_RESULT_PARTIAL_BIT;
DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_64_BIT_FLAG) =
query_info->copy_query_results.flags & VK_QUERY_RESULT_64_BIT;
DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_WITH_AVAILABILITY_FLAG) =
query_info->copy_query_results.flags &
VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
const_buffer[PVR_QUERY_COPY_DATA_FLAGS] =
query_info->copy_query_results.flags;
break;
}
@@ -737,74 +525,26 @@ VkResult pvr_add_query_program(struct pvr_cmd_buffer *cmd_buffer,
PVR_FROM_HANDLE(pvr_query_pool,
pool,
query_info->reset_query_pool.query_pool);
const uint32_t image_sampler_state_arr_size =
(buffer_count + 2) * ROGUE_NUM_TEXSTATE_SAMPLER_WORDS;
uint32_t image_sampler_idx = 0;
pvr_dev_addr_t addr;
uint64_t offset;
STACK_ARRAY(uint64_t, image_sampler_state, image_sampler_state_arr_size);
if (!image_sampler_state) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
uint64_t offset =
query_info->reset_query_pool.first_query * sizeof(uint32_t);
return vk_command_buffer_set_error(&cmd_buffer->vk,
VK_ERROR_OUT_OF_HOST_MEMORY);
}
pvr_dev_addr_t dev_addr =
PVR_DEV_ADDR_OFFSET(pool->result_buffer->dev_addr, offset);
uint64_t result_addr = dev_addr.addr;
memcpy(&SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0),
&sampler_state[0],
sizeof(sampler_state));
image_sampler_idx++;
dev_addr =
PVR_DEV_ADDR_OFFSET(pool->availability_buffer->dev_addr, offset);
uint64_t avail_addr = dev_addr.addr;
offset = query_info->reset_query_pool.first_query * sizeof(uint32_t);
const_buffer[PVR_QUERY_RESET_DATA_INDEX_COUNT] = num_query_indices;
const_buffer[PVR_QUERY_RESET_DATA_RESULT_BO_LO] = result_addr &
0xffffffff;
const_buffer[PVR_QUERY_RESET_DATA_RESULT_BO_HI] = result_addr >> 32;
const_buffer[PVR_QUERY_RESET_DATA_AVAILABILITY_BO_LO] = avail_addr &
0xffffffff;
const_buffer[PVR_QUERY_RESET_DATA_AVAILABILITY_BO_HI] = avail_addr >> 32;
for (uint32_t i = 0; i < buffer_count; i++) {
addr = PVR_DEV_ADDR_OFFSET(pool->result_buffer->dev_addr,
offset + i * pool->result_stride);
pvr_init_tex_info(dev_info, &tex_info, num_query_indices, addr);
result = pvr_pack_tex_state(device, &tex_info, &image_descriptor);
memcpy(&SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0),
image_descriptor.words,
sizeof(image_descriptor.words));
if (result != VK_SUCCESS) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
image_sampler_idx++;
}
addr = PVR_DEV_ADDR_OFFSET(pool->availability_buffer->dev_addr, offset);
pvr_init_tex_info(dev_info, &tex_info, num_query_indices, addr);
result = pvr_pack_tex_state(device, &tex_info, &image_descriptor);
memcpy(&SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0),
image_descriptor.words,
sizeof(image_descriptor.words));
if (result != VK_SUCCESS) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
}
image_sampler_idx++;
#undef SAMPLER_ARR_2D
memcpy(&const_buffer[0],
&image_sampler_state[0],
image_sampler_state_arr_size * sizeof(image_sampler_state[0]));
STACK_ARRAY_FINISH(image_sampler_state);
/* Only PVR_RESET_QUERY_POOL_COUNT driver consts allowed. */
assert(shader_factory_info->num_driver_consts ==
PVR_RESET_QUERY_POOL_COUNT);
DRIVER_CONST(PVR_RESET_QUERY_POOL_INDEX_COUNT) = num_query_indices;
break;
}
@@ -812,21 +552,10 @@ VkResult pvr_add_query_program(struct pvr_cmd_buffer *cmd_buffer,
UNREACHABLE("Invalid query type");
}
#undef DRIVER_CONST
for (uint32_t i = 0; i < shader_factory_info->num_static_const; i++) {
const struct pvr_static_buffer *load =
&shader_factory_info->static_const_buffer[i];
/* Assert if static const is out of range. */
assert(load->dst_idx < shader_factory_info->const_shared_regs);
const_buffer[load->dst_idx] = load->value;
}
result = pvr_cmd_buffer_upload_general(
cmd_buffer,
const_buffer,
PVR_DW_TO_BYTES(shader_factory_info->const_shared_regs),
PVR_DW_TO_BYTES(pipeline.const_shared_regs_count),
&pvr_bo);
if (result != VK_SUCCESS) {
vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);


@@ -375,16 +375,15 @@ static VkResult pvr_process_transfer_cmds(struct pvr_device *device,
return result;
}
static VkResult
pvr_process_occlusion_query_cmd(struct pvr_device *device,
struct pvr_queue *queue,
struct pvr_sub_cmd_compute *sub_cmd)
static VkResult pvr_process_query_cmd(struct pvr_device *device,
struct pvr_queue *queue,
struct pvr_sub_cmd_compute *sub_cmd)
{
struct vk_sync *sync;
VkResult result;
/* TODO: Currently we add barrier event sub commands to handle the sync
* necessary for the different occlusion query types. Would we get any speed
* necessary for the different query types. Would we get any speed
* up in processing the queue by doing that sync here without using event sub
* commands?
*/
@@ -397,17 +396,17 @@ pvr_process_occlusion_query_cmd(struct pvr_device *device,
if (result != VK_SUCCESS)
return result;
result = pvr_compute_job_submit(
queue->query_ctx,
sub_cmd,
queue->next_job_wait_sync[PVR_JOB_TYPE_OCCLUSION_QUERY],
sync);
result =
pvr_compute_job_submit(queue->query_ctx,
sub_cmd,
queue->next_job_wait_sync[PVR_JOB_TYPE_QUERY],
sync);
if (result != VK_SUCCESS) {
vk_sync_destroy(&device->vk, sync);
return result;
}
pvr_update_job_syncs(device, queue, sync, PVR_JOB_TYPE_OCCLUSION_QUERY);
pvr_update_job_syncs(device, queue, sync, PVR_JOB_TYPE_QUERY);
return result;
}
@@ -423,10 +422,10 @@ pvr_process_event_cmd_barrier(struct pvr_device *device,
uint32_t src_wait_count = 0;
VkResult result;
assert(!(src_mask & ~(PVR_PIPELINE_STAGE_ALL_BITS |
PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT)));
assert(!(dst_mask & ~(PVR_PIPELINE_STAGE_ALL_BITS |
PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT)));
assert(!(src_mask &
~(PVR_PIPELINE_STAGE_ALL_BITS | PVR_PIPELINE_STAGE_QUERY_BIT)));
assert(!(dst_mask &
~(PVR_PIPELINE_STAGE_ALL_BITS | PVR_PIPELINE_STAGE_QUERY_BIT)));
u_foreach_bit (stage, src_mask) {
if (queue->last_job_signal_sync[stage]) {
@@ -494,7 +493,7 @@ pvr_process_event_cmd_set_or_reset(struct pvr_device *device,
const enum pvr_event_state new_event_state)
{
/* Not PVR_JOB_TYPE_MAX since that also includes
* PVR_JOB_TYPE_OCCLUSION_QUERY so no stage in the src mask.
* PVR_JOB_TYPE_QUERY so no stage in the src mask.
*/
struct vk_sync_wait waits[PVR_NUM_SYNC_PIPELINE_STAGES];
struct vk_sync_signal signal;
@@ -696,12 +695,12 @@ static VkResult pvr_process_cmd_buffer(struct pvr_device *device,
link) {
switch (sub_cmd->type) {
case PVR_SUB_CMD_TYPE_GRAPHICS: {
/* If the fragment job utilizes occlusion queries, for data integrity
* it needs to wait for the occlusion query to be processed.
/* If the fragment job utilizes queries, for data integrity
* it needs to wait for the query to be processed.
*/
if (sub_cmd->gfx.has_occlusion_query) {
if (sub_cmd->gfx.has_query) {
struct pvr_sub_cmd_event_barrier barrier = {
.wait_for_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
.wait_for_stage_mask = PVR_PIPELINE_STAGE_QUERY_BIT,
.wait_at_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
};
@@ -761,9 +760,8 @@ static VkResult pvr_process_cmd_buffer(struct pvr_device *device,
break;
}
case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
result =
pvr_process_occlusion_query_cmd(device, queue, &sub_cmd->compute);
case PVR_SUB_CMD_TYPE_QUERY:
result = pvr_process_query_cmd(device, queue, &sub_cmd->compute);
break;
case PVR_SUB_CMD_TYPE_EVENT:
@@ -847,11 +845,10 @@ static VkResult pvr_process_queue_signals(struct pvr_queue *queue,
uint32_t wait_count = 0;
for (uint32_t i = 0; i < PVR_JOB_TYPE_MAX; i++) {
/* Exception for occlusion query jobs since that's something internal,
/* Exception for query jobs since that's something internal,
* so the user provided syncs won't ever have it as a source stage.
*/
if (!(signal_stage_src & BITFIELD_BIT(i)) &&
i != PVR_JOB_TYPE_OCCLUSION_QUERY)
if (!(signal_stage_src & BITFIELD_BIT(i)) && i != PVR_JOB_TYPE_QUERY)
continue;
if (!queue->last_job_signal_sync[i])

View file

@@ -31,30 +31,6 @@
#include "util/bitscan.h"
#include "util/u_math.h"
/* Occlusion query availability writes. */
enum pvr_query_availability_write_pool_const {
PVR_QUERY_AVAILABILITY_WRITE_INDEX_COUNT,
PVR_QUERY_AVAILABILITY_WRITE_COUNT,
};
/* Copy query pool results. */
enum pvr_copy_query_pool_const {
PVR_COPY_QUERY_POOL_RESULTS_INDEX_COUNT,
PVR_COPY_QUERY_POOL_RESULTS_BASE_ADDRESS_LOW,
PVR_COPY_QUERY_POOL_RESULTS_BASE_ADDRESS_HIGH,
PVR_COPY_QUERY_POOL_RESULTS_DEST_STRIDE,
PVR_COPY_QUERY_POOL_RESULTS_PARTIAL_RESULT_FLAG,
PVR_COPY_QUERY_POOL_RESULTS_64_BIT_FLAG,
PVR_COPY_QUERY_POOL_RESULTS_WITH_AVAILABILITY_FLAG,
PVR_COPY_QUERY_POOL_RESULTS_COUNT,
};
/* Reset query pool. */
enum pvr_reset_query_pool_pool_const {
PVR_RESET_QUERY_POOL_INDEX_COUNT,
PVR_RESET_QUERY_POOL_COUNT,
};
/* ClearAttachments. */
enum pvr_clear_attachment_const {
PVR_CLEAR_ATTACHMENT_CONST_COMPONENT_0 = 0, /* Don't change. Indexes array.


@@ -55,110 +55,6 @@ struct pvr_shader_factory_info {
uint32_t msaa_sample_count;
};
static const uint8_t availability_query_write_shader[144] = { 0 };
static const uint32_t availability_query_write_location_map[1] = {
0,
};
static const struct pvr_static_buffer
availability_query_write_static_consts[3] = {
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
};
static const struct pvr_shader_factory_info availability_query_write_info = {
0,
0,
0,
0,
0,
sizeof(availability_query_write_shader),
availability_query_write_shader,
0,
0,
NULL,
PVR_INVALID_INST,
0,
availability_query_write_location_map,
0,
availability_query_write_static_consts,
0,
~0,
};
static const uint8_t copy_query_results_shader[384] = { 0 };
static const uint32_t copy_query_results_location_map[7] = {
0, 0, 0, 0, 0, 0, 0,
};
static const struct pvr_static_buffer copy_query_results_static_consts[2] = {
{ 0, 0 },
{ 0, 0 },
};
static const struct pvr_shader_factory_info copy_query_results_info = {
0,
0,
0,
0,
0,
sizeof(copy_query_results_shader),
copy_query_results_shader,
0,
0,
NULL,
PVR_INVALID_INST,
0,
copy_query_results_location_map,
0,
copy_query_results_static_consts,
0,
~0,
};
static const uint8_t reset_query_shader_code[136] = { 0 };
static const uint32_t reset_query_location_map[1] = {
0,
};
static const struct pvr_static_buffer reset_query_static_consts[2] = {
{ 0, 0 },
{ 0, 0 },
};
static const struct pvr_shader_factory_info reset_query_info = {
0,
0,
0,
0,
0,
sizeof(reset_query_shader_code),
reset_query_shader_code,
0,
0,
NULL,
PVR_INVALID_INST,
0,
reset_query_location_map,
0,
reset_query_static_consts,
0,
~0,
};
static const struct pvr_shader_factory_info
*const copy_query_results_collection[1] = {
&copy_query_results_info,
};
static const struct pvr_shader_factory_info *const reset_query_collection[1] = {
&reset_query_info,
};
static const uint8_t clear_attachments_1_dw_0_offt_out_reg_shader_code[8] = {
0
};