diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 2bf87155ab8..943f48d7ec1 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -217,6 +217,17 @@ job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
    vk_free(&job->cmd_buffer->device->alloc, job->cpu.event_wait.events);
 }
 
+/* Releases the resources owned by an indirect-dispatch CPU job: the CSD job
+ * that was pre-recorded for it at command buffer recording time.
+ */
+static void
+job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job)
+{
+   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
+   assert(job->cmd_buffer);
+   v3dv_job_destroy(job->cpu.csd_indirect.csd_job);
+}
+
 void
 v3dv_job_destroy(struct v3dv_job *job)
 {
@@ -240,6 +251,9 @@ v3dv_job_destroy(struct v3dv_job *job)
       case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
          job_destroy_cpu_wait_events_resources(job);
          break;
+      case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
+         job_destroy_cpu_csd_indirect_resources(job);
+         break;
       default:
          break;
       }
@@ -806,28 +820,6 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
    return job;
 }
 
-static struct v3dv_job *
-cmd_buffer_start_compute_job(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   /* Compute jobs can only happen outside a render pass */
-   assert(!cmd_buffer->state.job);
-   assert(!cmd_buffer->state.pass);
-
-   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
-                                    sizeof(struct v3dv_job), 8,
-                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
-   cmd_buffer->state.job = job;
-
-   if (!job) {
-      v3dv_flag_oom(cmd_buffer, NULL);
-      return NULL;
-   }
-
-   v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
-
-   return job;
-}
-
 static VkResult
 cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
                  VkCommandBufferResetFlags flags)
@@ -4589,32 +4581,97 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
 #define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
 #define V3D_CSD_CFG5_THREADING (1 << 0)
 
-static void
-cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
-                    uint32_t group_count_x,
-                    uint32_t group_count_y,
-                    uint32_t group_count_z)
+/* Patches a pre-recorded CSD job with the workgroup counts read from the
+ * indirect buffer at submit time: updates the job's wg_count, the CFG0-2 and
+ * CFG4 submit registers and, if the shader reads the workgroup counts, the
+ * uniform stream entries that hold them.
+ */
+void
+v3dv_cmd_buffer_rewrite_indirect_csd_job(
+   struct v3dv_csd_indirect_cpu_job_info *info,
+   const uint32_t *wg_counts)
 {
-   if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
-      return;
+   assert(info->csd_job);
+   struct v3dv_job *job = info->csd_job;
 
-   struct v3dv_job *job = cmd_buffer_start_compute_job(cmd_buffer);
-   if (!job)
-      return;
+   assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
+   assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);
 
    struct drm_v3d_submit_csd *submit = &job->csd.submit;
 
-   job->csd.workgroup_count[0] = group_count_x;
-   job->csd.workgroup_count[1] = group_count_y;
-   job->csd.workgroup_count[2] = group_count_z;
+   job->csd.wg_count[0] = wg_counts[0];
+   job->csd.wg_count[1] = wg_counts[1];
+   job->csd.wg_count[2] = wg_counts[2];
+
+   submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+   submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+   submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+
+   submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
+                    (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+   assert(submit->cfg[4] != ~0);
+
+   if (info->needs_wg_uniform_rewrite) {
+      /* Make sure the GPU is not currently accessing the indirect CL for this
+       * job, since we are about to overwrite some of the uniform data.
+       */
+      const uint64_t infinite = 0xffffffffffffffffull;
+      v3dv_bo_wait(job->device, job->indirect.bo, infinite);
+
+      for (uint32_t i = 0; i < 3; i++) {
+         if (info->wg_uniform_offsets[i]) {
+            /* Sanity check that our uniform pointers are within the allocated
+             * BO space for our indirect CL.
+             */
+            assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);
+            assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);
+            *(info->wg_uniform_offsets[i]) = wg_counts[i];
+         }
+      }
+   }
+}
+
+/* Allocates a CSD job and sets it up for the given workgroup counts using
+ * the compute pipeline currently bound in the command buffer.
+ *
+ * Returns NULL if the job itself can't be allocated. If the shared memory BO
+ * can't be allocated, the partially set up job is returned instead. In both
+ * cases v3dv_flag_oom() is called on the command buffer.
+ *
+ * wg_uniform_offsets_out and wg_size_out are optional and may be NULL.
+ */
+static struct v3dv_job *
+cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
+                          uint32_t group_count_x,
+                          uint32_t group_count_y,
+                          uint32_t group_count_z,
+                          uint32_t **wg_uniform_offsets_out,
+                          uint32_t *wg_size_out)
+{
+   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   assert(pipeline && pipeline->cs && pipeline->cs->nir);
+
+   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
+                                    sizeof(struct v3dv_job), 8,
+                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+   if (!job) {
+      v3dv_flag_oom(cmd_buffer, NULL);
+      return NULL;
+   }
+
+   v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
+   cmd_buffer->state.job = job;
+
+   struct drm_v3d_submit_csd *submit = &job->csd.submit;
+
+   job->csd.wg_count[0] = group_count_x;
+   job->csd.wg_count[1] = group_count_y;
+   job->csd.wg_count[2] = group_count_z;
 
    submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
    submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
    submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
 
-   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   assert(pipeline->cs && pipeline->cs->nir);
-
    const struct nir_shader *cs = pipeline->cs->nir;
 
    const uint32_t wgs_per_sg = 1; /* FIXME */
@@ -4625,6 +4682,8 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
    submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
                       V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
    submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
+   if (wg_size_out)
+      *wg_size_out = wg_size;
 
    uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16);
    submit->cfg[4] = batches_per_wg *
@@ -4646,16 +4705,43 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
          v3dv_bo_alloc(cmd_buffer->device,
                        variant->prog_data.cs->shared_size * wgs_per_sg,
                        "shared_vars", true);
+      if (!job->csd.shared_memory) {
+         v3dv_flag_oom(cmd_buffer, NULL);
+         return job;
+      }
    }
 
    v3dv_job_add_bo(job, variant->assembly_bo);
 
    struct v3dv_cl_reloc uniforms =
-      v3dv_write_uniforms(cmd_buffer, pipeline->cs);
+      v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline->cs,
+                                     wg_uniform_offsets_out);
    submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
 
    v3dv_job_add_bo(job, uniforms.bo);
 
+   return job;
+}
+
+static void
+cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
+                    uint32_t group_count_x,
+                    uint32_t group_count_y,
+                    uint32_t group_count_z)
+{
+   if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
+      return;
+
+   struct v3dv_job *job =
+      cmd_buffer_create_csd_job(cmd_buffer,
+                                group_count_x,
+                                group_count_y,
+                                group_count_z,
+                                NULL, NULL);
+   /* Job allocation failed: the OOM condition has already been flagged */
+   if (!job)
+      return;
+
    list_addtail(&job->list_link, &cmd_buffer->jobs);
    cmd_buffer->state.job = NULL;
 }
@@ -4672,12 +4758,64 @@ v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
    cmd_buffer_dispatch(cmd_buffer, groupCountX, groupCountY, groupCountZ);
 }
 
+static void
+cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
+                             struct v3dv_buffer *buffer,
+                             uint32_t offset)
+{
+   /* We can't do indirect dispatches, so instead we record a CPU job that,
+    * when executed in the queue, will map the indirect buffer, read the
+    * dispatch parameters, and submit a regular dispatch.
+    */
+   struct v3dv_job *job =
+      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                     V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
+                                     cmd_buffer, -1);
+   v3dv_return_if_oom(cmd_buffer, NULL);
+
+   /* We need to create a CSD job now, even if we still don't know the actual
+    * dispatch parameters, because the job setup needs to be done using the
+    * current command buffer state (i.e. pipeline, descriptor sets, push
+    * constants, etc.). So we create the job with default dispatch parameters
+    * and we will rewrite the parts we need at submit time if the indirect
+    * parameters don't match the ones we used to setup the job.
+    */
+   struct v3dv_job *csd_job =
+      cmd_buffer_create_csd_job(cmd_buffer,
+                                1, 1, 1,
+                                &job->cpu.csd_indirect.wg_uniform_offsets[0],
+                                &job->cpu.csd_indirect.wg_size);
+   v3dv_return_if_oom(cmd_buffer, NULL);
+   assert(csd_job);
+
+   job->cpu.csd_indirect.buffer = buffer;
+   job->cpu.csd_indirect.offset = offset;
+   job->cpu.csd_indirect.csd_job = csd_job;
+
+   /* If the compute shader reads the workgroup sizes we will also need to
+    * rewrite the corresponding uniforms.
+    */
+   job->cpu.csd_indirect.needs_wg_uniform_rewrite =
+      job->cpu.csd_indirect.wg_uniform_offsets[0] ||
+      job->cpu.csd_indirect.wg_uniform_offsets[1] ||
+      job->cpu.csd_indirect.wg_uniform_offsets[2];
+
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
+   cmd_buffer->state.job = NULL;
+}
+
 void
 v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
-                         VkBuffer buffer,
+                         VkBuffer _buffer,
                          VkDeviceSize offset)
 {
-   unreachable("vkCmdDispatchIndirect not implemented.");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
+
+   assert(offset <= UINT32_MAX);
+
+   cmd_buffer_emit_pre_dispatch(cmd_buffer);
+   cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
 }
 
 void
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index b3564241a4e..b21c11ec62f 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -676,6 +676,7 @@ enum v3dv_job_type {
    V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
    V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS,
    V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
+   V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
 };
 
 struct v3dv_reset_query_cpu_job_info {
@@ -733,6 +734,22 @@ struct v3dv_copy_buffer_to_image_cpu_job_info {
    uint32_t layer_count;
 };
 
+/* State for a CPU job that resolves an indirect compute dispatch at submit
+ * time: 'buffer' and 'offset' locate the dispatch parameters, 'csd_job' is
+ * the pre-recorded CSD job, 'wg_size' is the workgroup size it was set up
+ * with, and 'wg_uniform_offsets' point to the uniform stream words holding
+ * each workgroup count (left unset if the shader has no such uniform).
+ * 'needs_wg_uniform_rewrite' is set if any of them is non-NULL.
+ */
+struct v3dv_csd_indirect_cpu_job_info {
+   struct v3dv_buffer *buffer;
+   uint32_t offset;
+   struct v3dv_job *csd_job;
+   uint32_t wg_size;
+   uint32_t *wg_uniform_offsets[3];
+   bool needs_wg_uniform_rewrite;
+};
+
 struct v3dv_job {
    struct list_head list_link;
 
@@ -797,6 +814,7 @@ struct v3dv_job {
       struct v3dv_event_wait_cpu_job_info           event_wait;
       struct v3dv_clear_attachments_cpu_job_info    clear_attachments;
       struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
+      struct v3dv_csd_indirect_cpu_job_info         csd_indirect;
    } cpu;
 
    /* Job specs for TFU jobs */
@@ -805,7 +823,7 @@ struct v3dv_job {
    /* Job specs for CSD jobs */
    struct {
       struct v3dv_bo *shared_memory;
-      uint32_t workgroup_count[3];
+      uint32_t wg_count[3];
       struct drm_v3d_submit_csd submit;
    } csd;
 };
@@ -1109,6 +1127,9 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
 void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct drm_v3d_submit_tfu *tfu);
 
+void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info,
+                                              const uint32_t *wg_counts);
+
 void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                                      uint64_t obj,
                                      v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb);
@@ -1622,6 +1643,9 @@ void v3d_store_tiled_image(void *dst, uint32_t dst_stride,
 
 struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
                                          struct v3dv_pipeline_stage *p_stage);
+struct v3dv_cl_reloc v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
+                                                    struct v3dv_pipeline_stage *p_stage,
+                                                    uint32_t **wg_count_offsets);
 
 struct v3dv_shader_variant *
 v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index cadb9065f78..6a9cca8a91c 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -421,6 +421,50 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
    return VK_SUCCESS;
 }
 
+static VkResult
+handle_csd_job(struct v3dv_queue *queue,
+               struct v3dv_job *job,
+               bool do_wait);
+
+/* Executes an indirect compute dispatch: reads the workgroup counts from the
+ * indirect buffer, patches the pre-recorded CSD job if they differ from the
+ * ones it was set up with, and submits it. A dispatch with any workgroup
+ * count of zero is skipped.
+ */
+static VkResult
+handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
+                            struct v3dv_job *job,
+                            bool do_wait)
+{
+   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
+   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
+   assert(info->csd_job);
+
+   /* Make sure the GPU is no longer using the indirect buffer */
+   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
+   const uint64_t infinite = 0xffffffffffffffffull;
+   v3dv_bo_wait(queue->device, info->buffer->mem->bo, infinite);
+
+   /* Map the indirect buffer and read the dispatch parameters */
+   struct v3dv_bo *bo = info->buffer->mem->bo;
+   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
+      return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+   assert(bo->map);
+
+   const uint32_t offset = info->buffer->mem_offset + info->offset;
+   const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
+   if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
+      return VK_SUCCESS;
+
+   if (memcmp(group_counts, info->csd_job->csd.wg_count,
+              sizeof(info->csd_job->csd.wg_count)) != 0) {
+      v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
+   }
+
+   /* Propagate submission failures to the caller instead of dropping them */
+   return handle_csd_job(queue, info->csd_job, do_wait);
+}
+
 static VkResult
 process_semaphores_to_signal(struct v3dv_device *device,
                              uint32_t count, const VkSemaphore *sems)
@@ -646,6 +690,8 @@ queue_submit_job(struct v3dv_queue *queue,
       return handle_wait_events_cpu_job(job, do_wait, wait_thread);
    case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
       return handle_copy_buffer_to_image_cpu_job(job);
+   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
+      return handle_csd_indirect_cpu_job(queue, job, do_wait);
    default:
       unreachable("Unhandled job type");
    }
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index 36809b6bb66..c168498b8c4 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -253,8 +253,13 @@ get_texture_size(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
+/* Like v3dv_write_uniforms(), but if wg_count_offsets is not NULL it also
+ * stores, for each QUNIFORM_NUM_WORK_GROUPS uniform emitted, the address in
+ * the uniform stream where the workgroup count for that dimension is written.
+ */
 struct v3dv_cl_reloc
-v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
-                    struct v3dv_pipeline_stage *p_stage)
+v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
+                               struct v3dv_pipeline_stage *p_stage,
+                               uint32_t **wg_count_offsets)
 {
    struct v3d_uniform_list *uinfo =
       &p_stage->current_variant->prog_data.base->uniforms;
@@ -336,8 +341,10 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
 
       case QUNIFORM_NUM_WORK_GROUPS:
          assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
-         assert(job->csd.workgroup_count[data] > 0);
-         cl_aligned_u32(&uniforms, job->csd.workgroup_count[data]);
+         assert(job->csd.wg_count[data] > 0);
+         if (wg_count_offsets)
+            wg_count_offsets[data] = (uint32_t *) uniforms;
+         cl_aligned_u32(&uniforms, job->csd.wg_count[data]);
          break;
 
       case QUNIFORM_SHARED_OFFSET:
@@ -355,3 +362,10 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
 
    return uniform_stream;
 }
+
+struct v3dv_cl_reloc
+v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
+                    struct v3dv_pipeline_stage *p_stage)
+{
+   return v3dv_write_uniforms_wg_offsets(cmd_buffer, p_stage, NULL);
+}