mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-06 02:58:05 +02:00
v3dv: implement indirect compute dispatch
The hardware can't do this, so we need to record a CPU job that will map the indirect buffer at queue submission time, read the dispatch parameters and then submit a regular dispatch. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
This commit is contained in:
parent
1d6edcc3e8
commit
b356d3de8c
4 changed files with 234 additions and 46 deletions
|
|
@ -217,6 +217,14 @@ job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
|
|||
vk_free(&job->cmd_buffer->device->alloc, job->cpu.event_wait.events);
|
||||
}
|
||||
|
||||
static void
|
||||
job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job)
|
||||
{
|
||||
assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
|
||||
assert(job->cmd_buffer);
|
||||
v3dv_job_destroy(job->cpu.csd_indirect.csd_job);
|
||||
}
|
||||
|
||||
void
|
||||
v3dv_job_destroy(struct v3dv_job *job)
|
||||
{
|
||||
|
|
@ -240,6 +248,9 @@ v3dv_job_destroy(struct v3dv_job *job)
|
|||
case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
|
||||
job_destroy_cpu_wait_events_resources(job);
|
||||
break;
|
||||
case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
|
||||
job_destroy_cpu_csd_indirect_resources(job);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
@ -806,28 +817,6 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
|
|||
return job;
|
||||
}
|
||||
|
||||
static struct v3dv_job *
|
||||
cmd_buffer_start_compute_job(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
/* Compute jobs can only happen outside a render pass */
|
||||
assert(!cmd_buffer->state.job);
|
||||
assert(!cmd_buffer->state.pass);
|
||||
|
||||
struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
|
||||
sizeof(struct v3dv_job), 8,
|
||||
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
||||
cmd_buffer->state.job = job;
|
||||
|
||||
if (!job) {
|
||||
v3dv_flag_oom(cmd_buffer, NULL);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
static VkResult
|
||||
cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
VkCommandBufferResetFlags flags)
|
||||
|
|
@ -4589,32 +4578,83 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
|
|||
#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
|
||||
#define V3D_CSD_CFG5_THREADING (1 << 0)
|
||||
|
||||
static void
|
||||
cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
uint32_t group_count_x,
|
||||
uint32_t group_count_y,
|
||||
uint32_t group_count_z)
|
||||
void
|
||||
v3dv_cmd_buffer_rewrite_indirect_csd_job(
|
||||
struct v3dv_csd_indirect_cpu_job_info *info,
|
||||
const uint32_t *wg_counts)
|
||||
{
|
||||
if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
|
||||
return;
|
||||
assert(info->csd_job);
|
||||
struct v3dv_job *job = info->csd_job;
|
||||
|
||||
struct v3dv_job *job = cmd_buffer_start_compute_job(cmd_buffer);
|
||||
if (!job)
|
||||
return;
|
||||
assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
|
||||
assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);
|
||||
|
||||
struct drm_v3d_submit_csd *submit = &job->csd.submit;
|
||||
|
||||
job->csd.workgroup_count[0] = group_count_x;
|
||||
job->csd.workgroup_count[1] = group_count_y;
|
||||
job->csd.workgroup_count[2] = group_count_z;
|
||||
job->csd.wg_count[0] = wg_counts[0];
|
||||
job->csd.wg_count[1] = wg_counts[1];
|
||||
job->csd.wg_count[2] = wg_counts[2];
|
||||
|
||||
submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
|
||||
submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
|
||||
submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
|
||||
|
||||
submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
|
||||
(wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
|
||||
assert(submit->cfg[4] != ~0);
|
||||
|
||||
if (info->needs_wg_uniform_rewrite) {
|
||||
/* Make sure the GPU is not currently accessing the indirect CL for this
|
||||
* job, since we are about to overwrite some of the uniform data.
|
||||
*/
|
||||
const uint64_t infinite = 0xffffffffffffffffull;
|
||||
v3dv_bo_wait(job->device, job->indirect.bo, infinite);
|
||||
|
||||
for (uint32_t i = 0; i < 3; i++) {
|
||||
if (info->wg_uniform_offsets[i]) {
|
||||
/* Sanity check that our uniform pointers are within the allocated
|
||||
* BO space for our indirect CL.
|
||||
*/
|
||||
assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);
|
||||
assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);
|
||||
*(info->wg_uniform_offsets[i]) = wg_counts[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static struct v3dv_job *
|
||||
cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
uint32_t group_count_x,
|
||||
uint32_t group_count_y,
|
||||
uint32_t group_count_z,
|
||||
uint32_t **wg_uniform_offsets_out,
|
||||
uint32_t *wg_size_out)
|
||||
{
|
||||
struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
|
||||
assert(pipeline && pipeline->cs && pipeline->cs->nir);
|
||||
|
||||
struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
|
||||
sizeof(struct v3dv_job), 8,
|
||||
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
||||
if (!job) {
|
||||
v3dv_flag_oom(cmd_buffer, NULL);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
|
||||
cmd_buffer->state.job = job;
|
||||
|
||||
struct drm_v3d_submit_csd *submit = &job->csd.submit;
|
||||
|
||||
job->csd.wg_count[0] = group_count_x;
|
||||
job->csd.wg_count[1] = group_count_y;
|
||||
job->csd.wg_count[2] = group_count_z;
|
||||
|
||||
submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
|
||||
submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
|
||||
submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
|
||||
|
||||
struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
|
||||
assert(pipeline->cs && pipeline->cs->nir);
|
||||
|
||||
const struct nir_shader *cs = pipeline->cs->nir;
|
||||
|
||||
const uint32_t wgs_per_sg = 1; /* FIXME */
|
||||
|
|
@ -4625,6 +4665,8 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
|
|||
submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
|
||||
V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
|
||||
submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
|
||||
if (wg_size_out)
|
||||
*wg_size_out = wg_size;
|
||||
|
||||
uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16);
|
||||
submit->cfg[4] = batches_per_wg *
|
||||
|
|
@ -4646,16 +4688,40 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
|
|||
v3dv_bo_alloc(cmd_buffer->device,
|
||||
variant->prog_data.cs->shared_size * wgs_per_sg,
|
||||
"shared_vars", true);
|
||||
if (!job->csd.shared_memory) {
|
||||
v3dv_flag_oom(cmd_buffer, NULL);
|
||||
return job;
|
||||
}
|
||||
}
|
||||
|
||||
v3dv_job_add_bo(job, variant->assembly_bo);
|
||||
|
||||
struct v3dv_cl_reloc uniforms =
|
||||
v3dv_write_uniforms(cmd_buffer, pipeline->cs);
|
||||
v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline->cs,
|
||||
wg_uniform_offsets_out);
|
||||
submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
|
||||
|
||||
v3dv_job_add_bo(job, uniforms.bo);
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
static void
|
||||
cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
uint32_t group_count_x,
|
||||
uint32_t group_count_y,
|
||||
uint32_t group_count_z)
|
||||
{
|
||||
if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
|
||||
return;
|
||||
|
||||
struct v3dv_job *job =
|
||||
cmd_buffer_create_csd_job(cmd_buffer,
|
||||
group_count_x,
|
||||
group_count_y,
|
||||
group_count_z,
|
||||
NULL, NULL);
|
||||
|
||||
list_addtail(&job->list_link, &cmd_buffer->jobs);
|
||||
cmd_buffer->state.job = NULL;
|
||||
}
|
||||
|
|
@ -4672,12 +4738,64 @@ v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
|
|||
cmd_buffer_dispatch(cmd_buffer, groupCountX, groupCountY, groupCountZ);
|
||||
}
|
||||
|
||||
/* Records an indirect compute dispatch.
 *
 * The hardware cannot read the dispatch parameters from a buffer by itself,
 * so instead of a GPU job we record a CPU job that is resolved at queue
 * submission time (see handle_csd_indirect_cpu_job in the queue code).
 */
static void
cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
                             struct v3dv_buffer *buffer,
                             uint32_t offset)
{
   /* We can't do indirect dispatches, so instead we record a CPU job that,
    * when executed in the queue, will map the indirect buffer, read the
    * dispatch parameters, and submit a regular dispatch.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                     V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
                                     cmd_buffer, -1);
   v3dv_return_if_oom(cmd_buffer, NULL);

   /* We need to create a CSD job now, even if we still don't know the actual
    * dispatch parameters, because the job setup needs to be done using the
    * current command buffer state (i.e. pipeline, descriptor sets, push
    * constants, etc.). So we create the job with default dispatch parameters
    * and we will rewrite the parts we need at submit time if the indirect
    * parameters don't match the ones we used to setup the job.
    */
   struct v3dv_job *csd_job =
      cmd_buffer_create_csd_job(cmd_buffer,
                                1, 1, 1,
                                &job->cpu.csd_indirect.wg_uniform_offsets[0],
                                &job->cpu.csd_indirect.wg_size);
   v3dv_return_if_oom(cmd_buffer, NULL);
   assert(csd_job);

   /* Stash everything the submit-time handler needs: where to read the
    * actual parameters from, and the pre-recorded CSD job to patch.
    */
   job->cpu.csd_indirect.buffer = buffer;
   job->cpu.csd_indirect.offset = offset;
   job->cpu.csd_indirect.csd_job = csd_job;

   /* If the compute shader reads the workgroup sizes we will also need to
    * rewrite the corresponding uniforms.
    */
   job->cpu.csd_indirect.needs_wg_uniform_rewrite =
      job->cpu.csd_indirect.wg_uniform_offsets[0] ||
      job->cpu.csd_indirect.wg_uniform_offsets[1] ||
      job->cpu.csd_indirect.wg_uniform_offsets[2];

   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;
}
|
||||
|
||||
void
|
||||
v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
|
||||
VkBuffer buffer,
|
||||
VkBuffer _buffer,
|
||||
VkDeviceSize offset)
|
||||
{
|
||||
unreachable("vkCmdDispatchIndirect not implemented.");
|
||||
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
|
||||
|
||||
assert(offset <= UINT32_MAX);
|
||||
|
||||
cmd_buffer_emit_pre_dispatch(cmd_buffer);
|
||||
cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
|||
|
|
@ -676,6 +676,7 @@ enum v3dv_job_type {
|
|||
V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
|
||||
V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS,
|
||||
V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
|
||||
V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
|
||||
};
|
||||
|
||||
struct v3dv_reset_query_cpu_job_info {
|
||||
|
|
@ -733,6 +734,15 @@ struct v3dv_copy_buffer_to_image_cpu_job_info {
|
|||
uint32_t layer_count;
|
||||
};
|
||||
|
||||
/* State for a CPU job that implements an indirect compute dispatch:
 * at submit time the queue maps the indirect buffer, reads the workgroup
 * counts and patches/submits the pre-recorded CSD job.
 */
struct v3dv_csd_indirect_cpu_job_info {
   /* Indirect buffer holding the dispatch parameters (3 uint32 workgroup
    * counts), read at queue submission time.
    */
   struct v3dv_buffer *buffer;
   /* Byte offset of the parameters inside the buffer */
   uint32_t offset;
   /* CSD job pre-recorded with placeholder workgroup counts, patched at
    * submit time if the actual counts differ.
    */
   struct v3dv_job *csd_job;
   /* Workgroup size in invocations, used to recompute the batch count
    * (submit cfg[4]) when rewriting the job.
    */
   uint32_t wg_size;
   /* Pointers into the CSD job's indirect CL where the per-dimension
    * workgroup-count uniforms live (NULL if the shader doesn't read that
    * dimension).
    */
   uint32_t *wg_uniform_offsets[3];
   /* True if any wg_uniform_offsets entry is set, i.e. the uniforms must
    * be rewritten with the actual counts at submit time.
    */
   bool needs_wg_uniform_rewrite;
};
|
||||
|
||||
struct v3dv_job {
|
||||
struct list_head list_link;
|
||||
|
||||
|
|
@ -797,6 +807,7 @@ struct v3dv_job {
|
|||
struct v3dv_event_wait_cpu_job_info event_wait;
|
||||
struct v3dv_clear_attachments_cpu_job_info clear_attachments;
|
||||
struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
|
||||
struct v3dv_csd_indirect_cpu_job_info csd_indirect;
|
||||
} cpu;
|
||||
|
||||
/* Job specs for TFU jobs */
|
||||
|
|
@ -805,7 +816,7 @@ struct v3dv_job {
|
|||
/* Job specs for CSD jobs */
|
||||
struct {
|
||||
struct v3dv_bo *shared_memory;
|
||||
uint32_t workgroup_count[3];
|
||||
uint32_t wg_count[3];
|
||||
struct drm_v3d_submit_csd submit;
|
||||
} csd;
|
||||
};
|
||||
|
|
@ -1109,6 +1120,9 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
|
|||
void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
struct drm_v3d_submit_tfu *tfu);
|
||||
|
||||
void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info,
|
||||
const uint32_t *wg_counts);
|
||||
|
||||
void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
uint64_t obj,
|
||||
v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb);
|
||||
|
|
@ -1622,6 +1636,9 @@ void v3d_store_tiled_image(void *dst, uint32_t dst_stride,
|
|||
|
||||
struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
struct v3dv_pipeline_stage *p_stage);
|
||||
struct v3dv_cl_reloc v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
struct v3dv_pipeline_stage *p_stage,
|
||||
uint32_t **wg_count_offsets);
|
||||
|
||||
struct v3dv_shader_variant *
|
||||
v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
|
||||
|
|
|
|||
|
|
@ -421,6 +421,47 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
|
|||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
static VkResult
|
||||
handle_csd_job(struct v3dv_queue *queue,
|
||||
struct v3dv_job *job,
|
||||
bool do_wait);
|
||||
|
||||
static VkResult
|
||||
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
|
||||
struct v3dv_job *job,
|
||||
bool do_wait)
|
||||
{
|
||||
assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
|
||||
struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
|
||||
assert(info->csd_job);
|
||||
|
||||
/* Make sure the GPU is no longer using the indirect buffer*/
|
||||
assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
|
||||
const uint64_t infinite = 0xffffffffffffffffull;
|
||||
v3dv_bo_wait(queue->device, info->buffer->mem->bo, infinite);
|
||||
|
||||
/* Map the indirect buffer and read the dispatch parameters */
|
||||
assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
|
||||
struct v3dv_bo *bo = info->buffer->mem->bo;
|
||||
if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
|
||||
return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
|
||||
assert(bo->map);
|
||||
|
||||
const uint32_t offset = info->buffer->mem_offset + info->offset;
|
||||
const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
|
||||
if (group_counts[0] == 0 || group_counts[1] == 0|| group_counts[2] == 0)
|
||||
return VK_SUCCESS;
|
||||
|
||||
if (memcmp(group_counts, info->csd_job->csd.wg_count,
|
||||
sizeof(info->csd_job->csd.wg_count)) != 0) {
|
||||
v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
|
||||
}
|
||||
|
||||
handle_csd_job(queue, info->csd_job, do_wait);
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
static VkResult
|
||||
process_semaphores_to_signal(struct v3dv_device *device,
|
||||
uint32_t count, const VkSemaphore *sems)
|
||||
|
|
@ -646,6 +687,8 @@ queue_submit_job(struct v3dv_queue *queue,
|
|||
return handle_wait_events_cpu_job(job, do_wait, wait_thread);
|
||||
case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
|
||||
return handle_copy_buffer_to_image_cpu_job(job);
|
||||
case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
|
||||
return handle_csd_indirect_cpu_job(queue, job, do_wait);
|
||||
default:
|
||||
unreachable("Unhandled job type");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -253,8 +253,9 @@ get_texture_size(struct v3dv_cmd_buffer *cmd_buffer,
|
|||
}
|
||||
|
||||
struct v3dv_cl_reloc
|
||||
v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
struct v3dv_pipeline_stage *p_stage)
|
||||
v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
struct v3dv_pipeline_stage *p_stage,
|
||||
uint32_t **wg_count_offsets)
|
||||
{
|
||||
struct v3d_uniform_list *uinfo =
|
||||
&p_stage->current_variant->prog_data.base->uniforms;
|
||||
|
|
@ -336,8 +337,10 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
|
|||
|
||||
case QUNIFORM_NUM_WORK_GROUPS:
|
||||
assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
|
||||
assert(job->csd.workgroup_count[data] > 0);
|
||||
cl_aligned_u32(&uniforms, job->csd.workgroup_count[data]);
|
||||
assert(job->csd.wg_count[data] > 0);
|
||||
if (wg_count_offsets)
|
||||
wg_count_offsets[data] = (uint32_t *) uniforms;
|
||||
cl_aligned_u32(&uniforms, job->csd.wg_count[data]);
|
||||
break;
|
||||
|
||||
case QUNIFORM_SHARED_OFFSET:
|
||||
|
|
@ -355,3 +358,10 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
|
|||
|
||||
return uniform_stream;
|
||||
}
|
||||
|
||||
/* Convenience wrapper around v3dv_write_uniforms_wg_offsets for callers
 * that don't need the workgroup-count uniform offsets.
 */
struct v3dv_cl_reloc
v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
                    struct v3dv_pipeline_stage *p_stage)
{
   return v3dv_write_uniforms_wg_offsets(cmd_buffer, p_stage, NULL);
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue