mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 04:38:03 +02:00
panvk: Implement indirect draw for Bifrost on JM
Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com> Reviewed-by: Olivia Lee <olivia.lee@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35724>
This commit is contained in:
parent
3be34989e0
commit
a382e97339
4 changed files with 300 additions and 18 deletions
|
|
@ -396,9 +396,6 @@ dEQP-VK.api.device_init.create_device_global_priority_khr.basic,Fail
|
|||
dEQP-VK.api.device_init.create_device_global_priority_query.basic,Fail
|
||||
dEQP-VK.api.device_init.create_device_global_priority_query_khr.basic,Fail
|
||||
|
||||
# CmdDrawIndirect not supported yet
|
||||
dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary,Crash
|
||||
|
||||
dEQP-VK.device_group.afr,Fail
|
||||
dEQP-VK.device_group.afr_dedicated,Fail
|
||||
dEQP-VK.device_group.afr_sys,Fail
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ afbcp-glx@glx-multithread-clearbuffer
|
|||
dEQP-VK.memory.pipeline_barrier.host_write_vertex_buffer.1048576_vertex_buffer_stride_2
|
||||
dEQP-VK.memory.pipeline_barrier.host_write_uniform_buffer.1048576
|
||||
dEQP-VK.memory.pipeline_barrier.host_write_uniform_texel_buffer.1048576
|
||||
dEQP-VK.memory.pipeline_barrier.host_write_index_buffer.1048576
|
||||
|
||||
# Sometime timeout
|
||||
dEQP-VK.memory.pipeline_barrier.host_write_storage_buffer.1048576
|
||||
|
|
|
|||
|
|
@ -73,19 +73,6 @@ shaders@glsl-bug-110796
|
|||
|
||||
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed*
|
||||
|
||||
# indirect draw not supported yet
|
||||
dEQP-VK.synchronization.*indirect*
|
||||
dEQP-VK.synchronization2.*indirect*
|
||||
dEQP-VK.draw.renderpass.indirect_draw.*
|
||||
dEQP-VK.draw.renderpass.*.draw_indirect*
|
||||
dEQP-VK.draw.renderpass.*.draw_indexed_indirect*
|
||||
dEQP-VK.draw.dynamic_rendering.*.draw_indirect*
|
||||
dEQP-VK.draw.dynamic_rendering.*.draw_indexed_indirect*
|
||||
dEQP-VK.draw.dynamic_rendering.*.indirect_draw*
|
||||
dEQP-VK.multiview.draw_indirect*
|
||||
dEQP-VK.multiview.dynamic_rendering.draw_indirect*
|
||||
dEQP-VK.multiview.renderpass2.draw_indirect*
|
||||
|
||||
# VKCTS bug?
|
||||
# Jump to 0x0 (XXX: need more research)
|
||||
dEQP-VK.glsl.shader_expect_assume.*
|
||||
|
|
@ -117,6 +104,9 @@ dEQP-VK.compute.pipeline.zero_initialize_workgroup_memory.types.f32mat4x4
|
|||
# Job timeout
|
||||
dEQP-VK.graphicsfuzz.spv-composites
|
||||
|
||||
# Cause OOM when running with alloacation tests
|
||||
dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary
|
||||
|
||||
# Slow tests (>= 30s)
|
||||
dEQP-VK.api.external.fence.sync_fd.export_multiple_times_temporary
|
||||
dEQP-VK.api.external.semaphore.sync_fd.export_multiple_times_temporary
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
#include "panvk_cmd_desc_state.h"
|
||||
#include "panvk_cmd_draw.h"
|
||||
#include "panvk_cmd_meta.h"
|
||||
#include "panvk_cmd_precomp.h"
|
||||
#include "panvk_device.h"
|
||||
#include "panvk_entrypoints.h"
|
||||
#include "panvk_image.h"
|
||||
|
|
@ -1550,6 +1551,264 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw)
|
|||
cmdbuf->state.gfx.vs.previous_draw_was_indirect = false;
|
||||
}
|
||||
|
||||
static void
|
||||
panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
|
||||
struct panvk_draw_data *draw)
|
||||
{
|
||||
const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader);
|
||||
VkResult result;
|
||||
|
||||
/* If there's no vertex shader, we can skip the draw. */
|
||||
if (!panvk_priv_mem_dev_addr(vs->rsd))
|
||||
return;
|
||||
|
||||
/* Needs to be done before get_fs() is called because it depends on
|
||||
* fs.required being initialized. */
|
||||
cmdbuf->state.gfx.fs.required =
|
||||
fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
|
||||
|
||||
result = prepare_draw(cmdbuf, draw);
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
|
||||
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||
const struct vk_input_assembly_state *ia =
|
||||
&cmdbuf->vk.dynamic_graphics_state.ia;
|
||||
const struct vk_vertex_input_state *vi =
|
||||
cmdbuf->vk.dynamic_graphics_state.vi;
|
||||
|
||||
unsigned copy_desc_job_id =
|
||||
draw->jobs.vertex_copy_desc.gpu
|
||||
? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false,
|
||||
0, 0, &draw->jobs.vertex_copy_desc, false)
|
||||
: 0;
|
||||
|
||||
if (draw->jobs.frag_copy_desc.gpu) {
|
||||
/* We don't need to add frag_copy_desc as a dependency because the
|
||||
* tiler job doesn't execute the fragment shader, the fragment job
|
||||
* will, and the tiler/fragment synchronization happens at the batch
|
||||
* level. */
|
||||
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0,
|
||||
&draw->jobs.frag_copy_desc, false);
|
||||
}
|
||||
|
||||
uint32_t view_mask = cmdbuf->state.gfx.render.view_mask;
|
||||
assert(view_mask == 0 || util_bitcount(view_mask) <= batch->fb.layer_count);
|
||||
uint32_t enabled_layer_count = view_mask
|
||||
? util_bitcount(view_mask)
|
||||
: cmdbuf->state.gfx.render.layer_count;
|
||||
const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmdbuf));
|
||||
|
||||
for (uint32_t i = 0; i < enabled_layer_count; i++) {
|
||||
/* Force a new push uniform block to be allocated */
|
||||
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
|
||||
|
||||
result = panvk_draw_prepare_varyings(cmdbuf, draw);
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
|
||||
draw->info.layer_id = (view_mask != 0) ? u_bit_scan(&view_mask) : i;
|
||||
if (draw->info.layer_id > 0) {
|
||||
cmdbuf->state.gfx.sysvals.layer_id = draw->info.layer_id;
|
||||
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
|
||||
}
|
||||
|
||||
result = panvk_per_arch(cmd_prepare_push_uniforms)(
|
||||
cmdbuf, vs, 1);
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
|
||||
if (fs) {
|
||||
result = panvk_per_arch(cmd_prepare_push_uniforms)(
|
||||
cmdbuf, fs, 1);
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
}
|
||||
|
||||
result = panvk_draw_prepare_tiler_context(cmdbuf, draw);
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
|
||||
if (vs->info.vs.idvs) {
|
||||
result = panvk_draw_prepare_idvs_job(cmdbuf, draw);
|
||||
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
} else {
|
||||
result = panvk_draw_prepare_vertex_job(cmdbuf, draw);
|
||||
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
|
||||
bool needs_tiling =
|
||||
!cmdbuf->vk.dynamic_graphics_state.rs.rasterizer_discard_enable ||
|
||||
cmdbuf->state.gfx.occlusion_query.mode !=
|
||||
MALI_OCCLUSION_MODE_DISABLED;
|
||||
|
||||
if (needs_tiling) {
|
||||
result = panvk_draw_prepare_tiler_job(cmdbuf, draw);
|
||||
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
assert(draw->info.indirect.buffer_dev_addr != 0 || draw->info.index.size);
|
||||
|
||||
uint32_t attrib_bufs_valid = vi->bindings_valid;
|
||||
uint32_t attribs_valid = vi->attributes_valid;
|
||||
uint64_t first_vertex_sysval = 0x8ull << 60;
|
||||
uint64_t first_instance_sysval = 0x8ull << 60;
|
||||
uint64_t raw_vertex_offset_sysval = 0x8ull << 60;
|
||||
if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
|
||||
first_vertex_sysval = cmdbuf->state.gfx.vs.push_uniforms +
|
||||
shader_remapped_sysval_offset(
|
||||
vs, sysval_offset(graphics, vs.first_vertex));
|
||||
}
|
||||
|
||||
if (shader_uses_sysval(vs, graphics, vs.base_instance)) {
|
||||
first_instance_sysval =
|
||||
cmdbuf->state.gfx.vs.push_uniforms +
|
||||
shader_remapped_sysval_offset(
|
||||
vs, sysval_offset(graphics, vs.base_instance));
|
||||
}
|
||||
|
||||
if (shader_uses_sysval(vs, graphics, vs.raw_vertex_offset)) {
|
||||
raw_vertex_offset_sysval =
|
||||
cmdbuf->state.gfx.vs.push_uniforms +
|
||||
shader_remapped_sysval_offset(
|
||||
vs, sysval_offset(graphics, vs.raw_vertex_offset));
|
||||
}
|
||||
|
||||
struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf);
|
||||
enum panlib_barrier indirect_barrier =
|
||||
PANLIB_BARRIER_JM_SUPPRESS_PREFETCH;
|
||||
struct panlib_precomp_grid indirect_grid =
|
||||
panlib_1d_with_jm_deps(1, 0, copy_desc_job_id);
|
||||
|
||||
if (draw->info.indirect.buffer_dev_addr != 0 && draw->info.index.size) {
|
||||
const struct panlib_draw_indexed_indirect_helper_args args = {
|
||||
.cmd = draw->info.indirect.buffer_dev_addr,
|
||||
.index_buffer_ptr = cmdbuf->state.gfx.ib.dev_addr,
|
||||
.index_size = draw->info.index.size,
|
||||
.primitive_vertex_count = primitive_vertex_count(
|
||||
translate_prim_topology(ia->primitive_topology)),
|
||||
.primitive_restart = ia->primitive_restart_enable,
|
||||
.varying_bufs_descs = draw->varying_bufs,
|
||||
.varying_bufs_info = draw->indirect_info.varying_bufs,
|
||||
.attrib_bufs_descs = draw->vs.attribute_bufs,
|
||||
.attrib_bufs_infos = draw->indirect_info.attrib_bufs,
|
||||
.attrib_bufs_valid = attrib_bufs_valid,
|
||||
.attribs_valid = attribs_valid,
|
||||
.attribs_descs = draw->vs.attributes,
|
||||
.attribs_infos = draw->indirect_info.attribs,
|
||||
.first_vertex_sysval = first_vertex_sysval,
|
||||
.first_instance_sysval = first_instance_sysval,
|
||||
.raw_vertex_offset_sysval = raw_vertex_offset_sysval,
|
||||
.idvs_job = vs->info.vs.idvs ? draw->jobs.idvs.gpu : 0,
|
||||
.vertex_job = draw->jobs.vertex.gpu,
|
||||
.tiler_job = draw->jobs.tiler.gpu,
|
||||
};
|
||||
panlib_draw_indexed_indirect_helper_struct(&precomp_ctx, indirect_grid,
|
||||
indirect_barrier, args);
|
||||
} else if (draw->info.indirect.buffer_dev_addr != 0) {
|
||||
const struct panlib_draw_indirect_helper_args args = {
|
||||
.cmd = draw->info.indirect.buffer_dev_addr,
|
||||
.primitive_vertex_count = primitive_vertex_count(
|
||||
translate_prim_topology(ia->primitive_topology)),
|
||||
.varying_bufs_descs = draw->varying_bufs,
|
||||
.varying_bufs_info = draw->indirect_info.varying_bufs,
|
||||
.attrib_bufs_descs = draw->vs.attribute_bufs,
|
||||
.attrib_bufs_infos = draw->indirect_info.attrib_bufs,
|
||||
.attrib_bufs_valid = attrib_bufs_valid,
|
||||
.attribs_valid = attribs_valid,
|
||||
.attribs_descs = draw->vs.attributes,
|
||||
.attribs_infos = draw->indirect_info.attribs,
|
||||
.first_vertex_sysval = first_vertex_sysval,
|
||||
.first_instance_sysval = first_instance_sysval,
|
||||
.raw_vertex_offset_sysval = raw_vertex_offset_sysval,
|
||||
.idvs_job = vs->info.vs.idvs ? draw->jobs.idvs.gpu : 0,
|
||||
.vertex_job = draw->jobs.vertex.gpu,
|
||||
.tiler_job = draw->jobs.tiler.gpu,
|
||||
};
|
||||
panlib_draw_indirect_helper_struct(&precomp_ctx, indirect_grid,
|
||||
indirect_barrier, args);
|
||||
} else {
|
||||
const struct panlib_draw_indexed_helper_args args = {
|
||||
.index_buffer_ptr = cmdbuf->state.gfx.ib.dev_addr,
|
||||
.index_size = draw->info.index.size,
|
||||
.first_index = draw->info.index.offset,
|
||||
.index_count = draw->info.vertex.count,
|
||||
.first_instance = draw->info.instance.base,
|
||||
.instance_count = draw->info.instance.count,
|
||||
.vertex_offset = draw->info.vertex.base,
|
||||
.primitive_restart = ia->primitive_restart_enable,
|
||||
.varying_bufs_descs = draw->varying_bufs,
|
||||
.varying_bufs_info = draw->indirect_info.varying_bufs,
|
||||
.attrib_bufs_descs = draw->vs.attribute_bufs,
|
||||
.attrib_bufs_infos = draw->indirect_info.attrib_bufs,
|
||||
.attrib_bufs_valid = attrib_bufs_valid,
|
||||
.attribs_valid = attribs_valid,
|
||||
.attribs_descs = draw->vs.attributes,
|
||||
.attribs_infos = draw->indirect_info.attribs,
|
||||
.first_vertex_sysval = first_vertex_sysval,
|
||||
.first_instance_sysval = first_instance_sysval,
|
||||
.raw_vertex_offset_sysval = raw_vertex_offset_sysval,
|
||||
.idvs_job = vs->info.vs.idvs ? draw->jobs.idvs.gpu : 0,
|
||||
.vertex_job = draw->jobs.vertex.gpu,
|
||||
.tiler_job = draw->jobs.tiler.gpu,
|
||||
.primitive_vertex_count = primitive_vertex_count(
|
||||
translate_prim_topology(ia->primitive_topology)),
|
||||
};
|
||||
panlib_draw_indexed_helper_struct(&precomp_ctx, indirect_grid,
|
||||
indirect_barrier, args);
|
||||
}
|
||||
|
||||
/* Grab the index of the indirect helper job */
|
||||
uint32_t prev_job = batch->vtc_jc.job_index;
|
||||
|
||||
if (vs->info.vs.idvs) {
|
||||
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX, false,
|
||||
false, 0, prev_job, &draw->jobs.idvs, false);
|
||||
} else {
|
||||
unsigned vjob_id =
|
||||
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_VERTEX, false, true, 0,
|
||||
prev_job, &draw->jobs.vertex, false);
|
||||
|
||||
if (draw->jobs.tiler.gpu != 0) {
|
||||
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_TILER, false, false,
|
||||
vjob_id, 0, &draw->jobs.tiler, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We split every ~1024 indirect draw.
|
||||
* This is here for multiple reasons:
|
||||
* - The indirect varying buffer offset need to be reset at some point to
|
||||
* avoid going outside of bounds.
|
||||
* - It is possible to always end up with timeouts for batches with 4k draws
|
||||
* (see "dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary") At
|
||||
* the same time, because of how TLS works on Mali, we should not split too
|
||||
* much as this will cause the TLS budget to go crazy.
|
||||
*/
|
||||
if (batch->vtc_jc.job_index > (5 * 1024)) {
|
||||
bool preload_fb =
|
||||
cmdbuf->cur_batch && cmdbuf->cur_batch->vtc_jc.first_tiler;
|
||||
|
||||
panvk_per_arch(cmd_close_batch)(cmdbuf);
|
||||
|
||||
if (preload_fb)
|
||||
panvk_per_arch(cmd_preload_fb_after_batch_split)(cmdbuf);
|
||||
|
||||
batch = panvk_per_arch(cmd_open_batch)(cmdbuf);
|
||||
cmdbuf->state.gfx.vs.indirect_varying_bufs_infos = 0;
|
||||
}
|
||||
|
||||
clear_dirty_after_draw(cmdbuf);
|
||||
cmdbuf->state.gfx.vs.previous_draw_was_indirect = true;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
padded_vertex_count(struct panvk_cmd_buffer *cmdbuf, uint32_t vertex_count,
|
||||
uint32_t instance_count)
|
||||
|
|
@ -1697,7 +1956,24 @@ panvk_per_arch(CmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer,
|
|||
VkDeviceSize offset, uint32_t drawCount,
|
||||
uint32_t stride)
|
||||
{
|
||||
panvk_stub();
|
||||
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
|
||||
VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
|
||||
|
||||
if (drawCount == 0)
|
||||
return;
|
||||
|
||||
/* We cannot support arbitrary draw count on JM */
|
||||
assert(drawCount == 1);
|
||||
|
||||
struct panvk_draw_data draw = {
|
||||
.info = {
|
||||
.indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
|
||||
.indirect.draw_count = drawCount,
|
||||
.indirect.stride = stride,
|
||||
},
|
||||
};
|
||||
|
||||
panvk_cmd_draw_indirect(cmdbuf, &draw);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
|
|
@ -1705,7 +1981,25 @@ panvk_per_arch(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer,
|
|||
VkBuffer _buffer, VkDeviceSize offset,
|
||||
uint32_t drawCount, uint32_t stride)
|
||||
{
|
||||
panvk_stub();
|
||||
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
|
||||
VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
|
||||
|
||||
if (drawCount == 0)
|
||||
return;
|
||||
|
||||
/* We cannot support arbitrary draw count on JM */
|
||||
assert(drawCount == 1);
|
||||
|
||||
struct panvk_draw_data draw = {
|
||||
.info = {
|
||||
.index.size = cmdbuf->state.gfx.ib.index_size,
|
||||
.indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
|
||||
.indirect.draw_count = drawCount,
|
||||
.indirect.stride = stride,
|
||||
},
|
||||
};
|
||||
|
||||
panvk_cmd_draw_indirect(cmdbuf, &draw);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue