v3dv: implement indirect compute dispatch

The hardware can't do this, so we need to record a CPU job that will
map the indirect buffer at queue submission time, read the dispatch
parameters and then submit a regular dispatch.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
This commit is contained in:
Iago Toral Quiroga 2020-06-19 11:56:20 +02:00 committed by Marge Bot
parent 1d6edcc3e8
commit b356d3de8c
4 changed files with 234 additions and 46 deletions

View file

@ -217,6 +217,14 @@ job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
vk_free(&job->cmd_buffer->device->alloc, job->cpu.event_wait.events);
}
static void
job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   assert(job->cmd_buffer);

   /* An indirect-dispatch CPU job owns the GPU CSD job that it patches and
    * submits at queue-submission time, so tear that job down with it.
    */
   struct v3dv_job *csd_job = job->cpu.csd_indirect.csd_job;
   v3dv_job_destroy(csd_job);
}
void
v3dv_job_destroy(struct v3dv_job *job)
{
@ -240,6 +248,9 @@ v3dv_job_destroy(struct v3dv_job *job)
case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
job_destroy_cpu_wait_events_resources(job);
break;
case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
job_destroy_cpu_csd_indirect_resources(job);
break;
default:
break;
}
@ -806,28 +817,6 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
return job;
}
static struct v3dv_job *
cmd_buffer_start_compute_job(struct v3dv_cmd_buffer *cmd_buffer)
{
/* Compute jobs can only happen outside a render pass */
assert(!cmd_buffer->state.job);
assert(!cmd_buffer->state.pass);
struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
sizeof(struct v3dv_job), 8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
cmd_buffer->state.job = job;
if (!job) {
v3dv_flag_oom(cmd_buffer, NULL);
return NULL;
}
v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
return job;
}
static VkResult
cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
VkCommandBufferResetFlags flags)
@ -4589,32 +4578,83 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
#define V3D_CSD_CFG5_THREADING (1 << 0)
static void
cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t group_count_x,
uint32_t group_count_y,
uint32_t group_count_z)
void
v3dv_cmd_buffer_rewrite_indirect_csd_job(
struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts)
{
if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
return;
assert(info->csd_job);
struct v3dv_job *job = info->csd_job;
struct v3dv_job *job = cmd_buffer_start_compute_job(cmd_buffer);
if (!job)
return;
assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);
struct drm_v3d_submit_csd *submit = &job->csd.submit;
job->csd.workgroup_count[0] = group_count_x;
job->csd.workgroup_count[1] = group_count_y;
job->csd.workgroup_count[2] = group_count_z;
job->csd.wg_count[0] = wg_counts[0];
job->csd.wg_count[1] = wg_counts[1];
job->csd.wg_count[2] = wg_counts[2];
submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
(wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
assert(submit->cfg[4] != ~0);
if (info->needs_wg_uniform_rewrite) {
/* Make sure the GPU is not currently accessing the indirect CL for this
* job, since we are about to overwrite some of the uniform data.
*/
const uint64_t infinite = 0xffffffffffffffffull;
v3dv_bo_wait(job->device, job->indirect.bo, infinite);
for (uint32_t i = 0; i < 3; i++) {
if (info->wg_uniform_offsets[i]) {
/* Sanity check that our uniform pointers are within the allocated
* BO space for our indirect CL.
*/
assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);
assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);
*(info->wg_uniform_offsets[i]) = wg_counts[i];
}
}
}
}
static struct v3dv_job *
cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t group_count_x,
uint32_t group_count_y,
uint32_t group_count_z,
uint32_t **wg_uniform_offsets_out,
uint32_t *wg_size_out)
{
struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
assert(pipeline && pipeline->cs && pipeline->cs->nir);
struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
sizeof(struct v3dv_job), 8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!job) {
v3dv_flag_oom(cmd_buffer, NULL);
return NULL;
}
v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
cmd_buffer->state.job = job;
struct drm_v3d_submit_csd *submit = &job->csd.submit;
job->csd.wg_count[0] = group_count_x;
job->csd.wg_count[1] = group_count_y;
job->csd.wg_count[2] = group_count_z;
submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
assert(pipeline->cs && pipeline->cs->nir);
const struct nir_shader *cs = pipeline->cs->nir;
const uint32_t wgs_per_sg = 1; /* FIXME */
@ -4625,6 +4665,8 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
if (wg_size_out)
*wg_size_out = wg_size;
uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16);
submit->cfg[4] = batches_per_wg *
@ -4646,16 +4688,40 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
v3dv_bo_alloc(cmd_buffer->device,
variant->prog_data.cs->shared_size * wgs_per_sg,
"shared_vars", true);
if (!job->csd.shared_memory) {
v3dv_flag_oom(cmd_buffer, NULL);
return job;
}
}
v3dv_job_add_bo(job, variant->assembly_bo);
struct v3dv_cl_reloc uniforms =
v3dv_write_uniforms(cmd_buffer, pipeline->cs);
v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline->cs,
wg_uniform_offsets_out);
submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
v3dv_job_add_bo(job, uniforms.bo);
return job;
}
/* Records a direct compute dispatch with the given workgroup counts.
 * A dispatch where any dimension is zero is a no-op (allowed by Vulkan).
 */
static void
cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
                    uint32_t group_count_x,
                    uint32_t group_count_y,
                    uint32_t group_count_z)
{
   if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
      return;

   struct v3dv_job *job =
      cmd_buffer_create_csd_job(cmd_buffer,
                                group_count_x,
                                group_count_y,
                                group_count_z,
                                NULL, NULL);

   /* cmd_buffer_create_csd_job returns NULL on allocation failure (after
    * flagging OOM on the command buffer); the original code dereferenced
    * it unconditionally below, which would crash on OOM.
    */
   if (!job)
      return;

   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;
}
@ -4672,12 +4738,64 @@ v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
cmd_buffer_dispatch(cmd_buffer, groupCountX, groupCountY, groupCountZ);
}
static void
cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_buffer *buffer,
uint32_t offset)
{
/* We can't do indirect dispatches, so instead we record a CPU job that,
* when executed in the queue, will map the indirect buffer, read the
* dispatch parameters, and submit a regular dispatch.
*/
struct v3dv_job *job =
v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
cmd_buffer, -1);
v3dv_return_if_oom(cmd_buffer, NULL);
/* We need to create a CSD job now, even if we still don't know the actual
* dispatch parameters, because the job setup needs to be done using the
* current command buffer state (i.e. pipeline, descriptor sets, push
* constants, etc.). So we create the job with default dispatch parameters
* and we will rewrite the parts we need at submit time if the indirect
* parameters don't match the ones we used to setup the job.
*/
struct v3dv_job *csd_job =
cmd_buffer_create_csd_job(cmd_buffer,
1, 1, 1,
&job->cpu.csd_indirect.wg_uniform_offsets[0],
&job->cpu.csd_indirect.wg_size);
v3dv_return_if_oom(cmd_buffer, NULL);
assert(csd_job);
job->cpu.csd_indirect.buffer = buffer;
job->cpu.csd_indirect.offset = offset;
job->cpu.csd_indirect.csd_job = csd_job;
/* If the compute shader reads the workgroup sizes we will also need to
* rewrite the corresponding uniforms.
*/
job->cpu.csd_indirect.needs_wg_uniform_rewrite =
job->cpu.csd_indirect.wg_uniform_offsets[0] ||
job->cpu.csd_indirect.wg_uniform_offsets[1] ||
job->cpu.csd_indirect.wg_uniform_offsets[2];
list_addtail(&job->list_link, &cmd_buffer->jobs);
cmd_buffer->state.job = NULL;
}
void
v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
VkBuffer buffer,
VkBuffer _buffer,
VkDeviceSize offset)
{
unreachable("vkCmdDispatchIndirect not implemented.");
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
assert(offset <= UINT32_MAX);
cmd_buffer_emit_pre_dispatch(cmd_buffer);
cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
}
void

View file

@ -676,6 +676,7 @@ enum v3dv_job_type {
V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS,
V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
};
struct v3dv_reset_query_cpu_job_info {
@ -733,6 +734,15 @@ struct v3dv_copy_buffer_to_image_cpu_job_info {
uint32_t layer_count;
};
/* State for a V3DV_JOB_TYPE_CPU_CSD_INDIRECT job: at queue-submission time
 * the CPU reads the real workgroup counts from 'buffer' and patches
 * 'csd_job' before submitting it as a regular compute dispatch.
 */
struct v3dv_csd_indirect_cpu_job_info {
   /* Indirect buffer holding the dispatch parameters. */
   struct v3dv_buffer *buffer;
   /* Byte offset of the dispatch parameters within 'buffer'. */
   uint32_t offset;
   /* GPU CSD job recorded with placeholder (1, 1, 1) workgroup counts,
    * rewritten at submit time if the indirect parameters differ.
    */
   struct v3dv_job *csd_job;
   /* Workgroup size, used to recompute the CSD cfg[4] batch count when the
    * job is rewritten.
    */
   uint32_t wg_size;
   /* CPU pointers into the job's indirect CL where the workgroup-count
    * uniforms live; NULL entries mean the shader does not read that count.
    */
   uint32_t *wg_uniform_offsets[3];
   /* True if any wg_uniform_offsets entry is non-NULL, i.e. the uniform
    * stream must also be patched with the real counts.
    */
   bool needs_wg_uniform_rewrite;
};
struct v3dv_job {
struct list_head list_link;
@ -797,6 +807,7 @@ struct v3dv_job {
struct v3dv_event_wait_cpu_job_info event_wait;
struct v3dv_clear_attachments_cpu_job_info clear_attachments;
struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
struct v3dv_csd_indirect_cpu_job_info csd_indirect;
} cpu;
/* Job specs for TFU jobs */
@ -805,7 +816,7 @@ struct v3dv_job {
/* Job specs for CSD jobs */
struct {
struct v3dv_bo *shared_memory;
uint32_t workgroup_count[3];
uint32_t wg_count[3];
struct drm_v3d_submit_csd submit;
} csd;
};
@ -1109,6 +1120,9 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
struct drm_v3d_submit_tfu *tfu);
void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts);
void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
uint64_t obj,
v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb);
@ -1622,6 +1636,9 @@ void v3d_store_tiled_image(void *dst, uint32_t dst_stride,
struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage);
struct v3dv_cl_reloc v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage,
uint32_t **wg_count_offsets);
struct v3dv_shader_variant *
v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,

View file

@ -421,6 +421,47 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
return VK_SUCCESS;
}
static VkResult
handle_csd_job(struct v3dv_queue *queue,
struct v3dv_job *job,
bool do_wait);
/* Executes a CPU indirect-dispatch job: waits for the GPU to be done with
 * the indirect buffer, maps it, reads the workgroup counts, patches the
 * pre-recorded CSD job if the counts differ from the placeholders, and then
 * submits it as a regular compute dispatch.
 */
static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            bool do_wait)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   /* Make sure the GPU is no longer using the indirect buffer */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   const uint64_t infinite = 0xffffffffffffffffull;
   v3dv_bo_wait(queue->device, info->buffer->mem->bo, infinite);

   /* Map the indirect buffer and read the dispatch parameters */
   struct v3dv_bo *bo = info->buffer->mem->bo;
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
   assert(bo->map);

   const uint32_t offset = info->buffer->mem_offset + info->offset;
   const uint32_t *group_counts = (uint32_t *) (bo->map + offset);

   /* A dispatch with any zero workgroup count is a no-op */
   if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
      return VK_SUCCESS;

   /* Only rewrite the CSD job if the real counts differ from the
    * placeholder values it was recorded with.
    */
   if (memcmp(group_counts, info->csd_job->csd.wg_count,
              sizeof(info->csd_job->csd.wg_count)) != 0) {
      v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
   }

   /* Propagate the submission result: the original code dropped the
    * VkResult from handle_csd_job and always returned VK_SUCCESS, silently
    * swallowing submission errors.
    */
   return handle_csd_job(queue, info->csd_job, do_wait);
}
static VkResult
process_semaphores_to_signal(struct v3dv_device *device,
uint32_t count, const VkSemaphore *sems)
@ -646,6 +687,8 @@ queue_submit_job(struct v3dv_queue *queue,
return handle_wait_events_cpu_job(job, do_wait, wait_thread);
case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
return handle_copy_buffer_to_image_cpu_job(job);
case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
return handle_csd_indirect_cpu_job(queue, job, do_wait);
default:
unreachable("Unhandled job type");
}

View file

@ -253,8 +253,9 @@ get_texture_size(struct v3dv_cmd_buffer *cmd_buffer,
}
struct v3dv_cl_reloc
v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage)
v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage,
uint32_t **wg_count_offsets)
{
struct v3d_uniform_list *uinfo =
&p_stage->current_variant->prog_data.base->uniforms;
@ -336,8 +337,10 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
case QUNIFORM_NUM_WORK_GROUPS:
assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
assert(job->csd.workgroup_count[data] > 0);
cl_aligned_u32(&uniforms, job->csd.workgroup_count[data]);
assert(job->csd.wg_count[data] > 0);
if (wg_count_offsets)
wg_count_offsets[data] = (uint32_t *) uniforms;
cl_aligned_u32(&uniforms, job->csd.wg_count[data]);
break;
case QUNIFORM_SHARED_OFFSET:
@ -355,3 +358,10 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
return uniform_stream;
}
struct v3dv_cl_reloc
v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage)
{
return v3dv_write_uniforms_wg_offsets(cmd_buffer, p_stage, NULL);
}