v3dv: implement compute dispatch

For now this only implements regular dispatches, not indirect ones.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
Iago Toral Quiroga 2020-06-18 13:53:51 +02:00 committed by Marge Bot
parent d0b1bb3032
commit 7e990683fd
4 changed files with 252 additions and 7 deletions
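
For context: a regular dispatch records the workgroup counts at command-recording time, while an indirect dispatch reads them from a buffer at execution time. A minimal sketch of the two app-side entry points this distinction maps to (standard Vulkan API; the cmd and params_buf handles are hypothetical):

/* Regular dispatch: the 8x8x1 workgroup counts are baked into the command
 * buffer at record time. This is the path this commit implements. */
vkCmdDispatch(cmd, 8, 8, 1);

/* Indirect dispatch: the GPU reads a VkDispatchIndirectCommand (three
 * uint32_t counts) from params_buf at the given byte offset. Not
 * implemented by this commit. */
vkCmdDispatchIndirect(cmd, params_buf, 0);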

View file

@@ -195,6 +195,20 @@ job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
}
}
static void
job_destroy_gpu_csd_resources(struct v3dv_job *job)
{
assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
assert(job->cmd_buffer);
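/* CSD jobs only use the indirect CL (for uniforms), so there is no
 * bcl/rcl to clean up here. */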
v3dv_cl_destroy(&job->indirect);
_mesa_set_destroy(job->bos, NULL);
if (job->csd.shared_memory)
v3dv_bo_free(job->device, job->csd.shared_memory);
}
static void
job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
{
@@ -220,6 +234,9 @@ v3dv_job_destroy(struct v3dv_job *job)
case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
job_destroy_gpu_cl_resources(job);
break;
case V3DV_JOB_TYPE_GPU_CSD:
job_destroy_gpu_csd_resources(job);
break;
case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
job_destroy_cpu_wait_events_resources(job);
break;
@@ -716,19 +733,24 @@ v3dv_job_init(struct v3dv_job *job,
list_inithead(&job->list_link);
if (type == V3DV_JOB_TYPE_GPU_CL ||
- type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
+ type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
+ type == V3DV_JOB_TYPE_GPU_CSD) {
job->bos =
_mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
job->bo_count = 0;
- v3dv_cl_init(job, &job->bcl);
- v3dv_cl_init(job, &job->rcl);
v3dv_cl_init(job, &job->indirect);
if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)
job->always_flush = true;
}
+ if (type == V3DV_JOB_TYPE_GPU_CL ||
+ type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
+ v3dv_cl_init(job, &job->bcl);
+ v3dv_cl_init(job, &job->rcl);
+ }
if (cmd_buffer) {
/* Flag all state as dirty. Generally, we need to re-emit state for each
* new job.
@@ -784,6 +806,28 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
return job;
}
static struct v3dv_job *
cmd_buffer_start_compute_job(struct v3dv_cmd_buffer *cmd_buffer)
{
/* Compute jobs can only happen outside a render pass */
assert(!cmd_buffer->state.job);
assert(!cmd_buffer->state.pass);
struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
sizeof(struct v3dv_job), 8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
cmd_buffer->state.job = job;
if (!job) {
v3dv_flag_oom(cmd_buffer, NULL);
return NULL;
}
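/* The trailing -1 is the subpass index; compute jobs are not tied to a
 * render pass subpass. */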
v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
return job;
}
static VkResult
cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
VkCommandBufferResetFlags flags)
@@ -2677,6 +2721,33 @@ update_vs_variant(struct v3dv_cmd_buffer *cmd_buffer)
p_stage->current_variant = variant;
}
static void
update_cs_variant(struct v3dv_cmd_buffer *cmd_buffer)
{
struct v3dv_shader_variant *variant;
struct v3dv_pipeline_stage *p_stage = cmd_buffer->state.pipeline->cs;
struct v3d_key local_key;
/* We start with a copy of the original pipeline key */
memcpy(&local_key, &p_stage->key.base, sizeof(struct v3d_key));
cmd_buffer_populate_v3d_key(&local_key, cmd_buffer,
VK_PIPELINE_BIND_POINT_COMPUTE);
VkResult result;
variant = v3dv_get_shader_variant(p_stage, &local_key,
sizeof(struct v3d_key),
&cmd_buffer->device->alloc,
&result);
/* At this point we are not creating a vulkan object to return to the
 * API user, so we can't really return an OOM error.
 */
assert(variant);
assert(result == VK_SUCCESS);
p_stage->current_variant = variant;
}
/*
 * Some updates to the cmd buffer also require updates to the shader being
 * compiled in the pipeline. The poster boy here is textures, as the compiler
@@ -2690,8 +2761,13 @@ update_pipeline_variants(struct v3dv_cmd_buffer *cmd_buffer)
{
assert(cmd_buffer->state.pipeline);
- update_fs_variant(cmd_buffer);
- update_vs_variant(cmd_buffer);
+ if (v3dv_pipeline_get_binding_point(cmd_buffer->state.pipeline) ==
+ VK_PIPELINE_BIND_POINT_GRAPHICS) {
+ update_fs_variant(cmd_buffer);
+ update_vs_variant(cmd_buffer);
+ } else {
+ update_cs_variant(cmd_buffer);
+ }
}
static void
@@ -4471,13 +4547,120 @@ v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
unreachable("Timestamp queries are not supported.");
}
static void
cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
{
assert(cmd_buffer->state.pipeline);
assert(cmd_buffer->state.pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
/* We may need to compile shader variants based on bound textures */
uint32_t *dirty = &cmd_buffer->state.dirty;
if (*dirty & (V3DV_CMD_DIRTY_PIPELINE |
V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS)) {
update_pipeline_variants(cmd_buffer);
}
*dirty &= ~(V3DV_CMD_DIRTY_PIPELINE |
V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
}
#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
/* Allow this dispatch to start while the last one is still running. */
#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
/* Maximum supergroup ID. 6 bits. */
#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
/* Batches per supergroup minus 1. 8 bits. */
#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
/* Workgroups per supergroup, 0 means 16 */
#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0
#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
#define V3D_CSD_CFG5_THREADING (1 << 0)
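
As a worked example of the cfg[3] packing done below (hypothetical numbers: an 8x8x1 local size gives wg_size = 64, and wgs_per_sg is currently hardcoded to 1):

/* One workgroup per supergroup; batches are groups of 16 invocations,
 * so a supergroup needs DIV_ROUND_UP(1 * 64, 16) = 4 batches, stored
 * minus one. */
uint32_t cfg3 = 0;
cfg3 |= 1 << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;              /* 0x00100 */
cfg3 |= (4 - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT; /* 0x03000 */
cfg3 |= (64 & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;       /* 0x00040 */
/* cfg3 == 0x3140 */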
static void
cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t group_count_x,
uint32_t group_count_y,
uint32_t group_count_z)
{
if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
return;
struct v3dv_job *job = cmd_buffer_start_compute_job(cmd_buffer);
if (!job)
return;
struct drm_v3d_submit_csd *submit = &job->csd.submit;
job->csd.workgroup_count[0] = group_count_x;
job->csd.workgroup_count[1] = group_count_y;
job->csd.workgroup_count[2] = group_count_z;
submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
assert(pipeline->cs && pipeline->cs->nir);
const struct nir_shader *cs = pipeline->cs->nir;
const uint32_t wgs_per_sg = 1; /* FIXME */
const uint32_t wg_size = cs->info.cs.local_size[0] *
cs->info.cs.local_size[1] *
cs->info.cs.local_size[2];
submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
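/* cfg[4] holds the total number of 16-invocation batches in the whole
 * dispatch, minus one. */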
uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16);
submit->cfg[4] = batches_per_wg *
(group_count_x * group_count_y * group_count_z) - 1;
assert(submit->cfg[4] != ~0);
assert(pipeline->cs->current_variant &&
pipeline->cs->current_variant->assembly_bo);
const struct v3dv_shader_variant *variant = pipeline->cs->current_variant;
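/* cfg[5] packs the shader's start address with the threading/segment
 * flags set below. */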
submit->cfg[5] = variant->assembly_bo->offset;
submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (variant->prog_data.base->single_seg)
submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (variant->prog_data.base->threads == 4)
submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
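/* Allocate the BO backing shared variables, sized for the workgroups
 * of one supergroup. */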
if (variant->prog_data.cs->shared_size > 0) {
job->csd.shared_memory =
v3dv_bo_alloc(cmd_buffer->device,
variant->prog_data.cs->shared_size * wgs_per_sg,
"shared_vars", true);
}
v3dv_job_add_bo(job, variant->assembly_bo);
struct v3dv_cl_reloc uniforms =
v3dv_write_uniforms(cmd_buffer, pipeline->cs);
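/* cfg[6] is the address of the uniform stream just written. */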
submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
v3dv_job_add_bo(job, uniforms.bo);
list_addtail(&job->list_link, &cmd_buffer->jobs);
cmd_buffer->state.job = NULL;
}
void
v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
uint32_t groupCountX,
uint32_t groupCountY,
uint32_t groupCountZ)
{
unreachable("vkCmdDispatch not implemented.");
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer_emit_pre_dispatch(cmd_buffer);
cmd_buffer_dispatch(cmd_buffer, groupCountX, groupCountY, groupCountZ);
}
void

View file

@@ -798,8 +798,15 @@ struct v3dv_job {
struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
} cpu;
- /* Job spects for TFU jobs */
+ /* Job specs for TFU jobs */
struct drm_v3d_submit_tfu tfu;
/* Job specs for CSD jobs */
struct {
struct v3dv_bo *shared_memory;
uint32_t workgroup_count[3];
struct drm_v3d_submit_csd submit;
} csd;
};
void v3dv_job_init(struct v3dv_job *job,

View file

@@ -578,6 +578,47 @@ handle_tfu_job(struct v3dv_queue *queue,
return VK_SUCCESS;
}
static VkResult
handle_csd_job(struct v3dv_queue *queue,
struct v3dv_job *job,
bool do_wait)
{
struct v3dv_device *device = queue->device;
struct drm_v3d_submit_csd *submit = &job->csd.submit;
submit->bo_handle_count = job->bo_count;
uint32_t *bo_handles =
(uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
uint32_t bo_idx = 0;
set_foreach(job->bos, entry) {
struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
bo_handles[bo_idx++] = bo->handle;
}
assert(bo_idx == submit->bo_handle_count);
submit->bo_handles = (uintptr_t)(void *)bo_handles;
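/* Chain this job after the previously submitted one via the shared
 * device syncobj. */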
mtx_lock(&queue->device->mutex);
submit->in_sync = do_wait ? device->last_job_sync : 0;
submit->out_sync = device->last_job_sync;
int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_CSD, submit);
mtx_unlock(&queue->device->mutex);
static bool warned = false;
if (ret && !warned) {
fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
strerror(errno));
warned = true;
}
free(bo_handles);
if (ret)
return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
return VK_SUCCESS;
}
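
For reference, the kernel interface handle_csd_job fills in is the v3d DRM UAPI's CSD submit struct. A sketch of its layout, from memory of include/uapi/drm/v3d_drm.h (treat the header as authoritative):

struct drm_v3d_submit_csd {
   __u32 cfg[7];          /* CSD config registers: workgroup counts and
                           * offsets, supergroup setup, batch count,
                           * shader address, uniforms address */
   __u32 coef[4];         /* CSD coefficient registers */
   __u64 bo_handles;      /* userspace pointer to an array of GEM handles */
   __u32 bo_handle_count;
   __u32 in_sync;         /* syncobj to wait on before running */
   __u32 out_sync;        /* syncobj to signal on completion */
};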
static VkResult
queue_submit_job(struct v3dv_queue *queue,
struct v3dv_job *job,
@@ -591,6 +632,8 @@ queue_submit_job(struct v3dv_queue *queue,
return handle_cl_job(queue, job, do_wait);
case V3DV_JOB_TYPE_GPU_TFU:
return handle_tfu_job(queue, job, do_wait);
case V3DV_JOB_TYPE_GPU_CSD:
return handle_csd_job(queue, job, do_wait);
case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
return handle_reset_query_cpu_job(job);
case V3DV_JOB_TYPE_CPU_END_QUERY:

View file

@@ -327,6 +327,18 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
data));
break;
case QUNIFORM_NUM_WORK_GROUPS:
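/* data selects the X/Y/Z component of the dispatch size; this is the
 * uniform that backs gl_NumWorkGroups on the shader side. */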
assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
assert(job->csd.workgroup_count[data] > 0);
cl_aligned_u32(&uniforms, job->csd.workgroup_count[data]);
break;
case QUNIFORM_SHARED_OFFSET:
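/* Emit a relocation to the shared-memory BO allocated for this
 * dispatch in cmd_buffer_dispatch(). */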
assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
assert(job->csd.shared_memory);
cl_aligned_reloc(&job->indirect, &uniforms, job->csd.shared_memory, 0);
break;
default:
unreachable("unsupported quniform_contents uniform type\n");
}