anv: Add support for OpenCL-style kernel dispatch
v2: Use brw_cs_get_dispatch_info() (Lionel)
Merge barrier fixes (Lionel)
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16970>
parent eb249f125d
commit 6c76ceb613

3 changed files with 147 additions and 1 deletion
@@ -140,6 +140,12 @@ void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
                                 struct anv_address dst, struct anv_address src,
                                 uint32_t size);
 
+void genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
+                                      struct anv_kernel *kernel,
+                                      const uint32_t *global_size, /* NULL for indirect */
+                                      uint32_t arg_count,
+                                      const struct anv_kernel_arg *args);
+
 void genX(blorp_exec)(struct blorp_batch *batch,
                       const struct blorp_params *params);
 
@@ -3083,12 +3083,22 @@ anv_pipeline_finish(struct anv_pipeline *pipeline,
                     struct anv_device *device,
                     const VkAllocationCallbacks *pAllocator);
 
+struct anv_kernel_arg {
+   bool is_ptr;
+   uint16_t size;
+
+   union {
+      uint64_t u64;
+      void *ptr;
+   };
+};
+
 struct anv_kernel {
 #ifndef NDEBUG
    const char *name;
 #endif
    struct anv_shader_bin *bin;
-   const struct gen_l3_config *l3_config;
+   const struct intel_l3_config *l3_config;
 };
 
 struct anv_format_plane {
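(Not part of the diff: a rough caller-side sketch of how the new struct anv_kernel_arg might be filled, assuming anv_private.h is included; buffer_addr and the coeffs array are hypothetical values chosen purely for illustration.)

   /* One argument passed by value (a 64-bit GPU address held in the u64
    * member) and one passed through a host pointer that the dispatch code
    * copies into the kernel's indirect data.
    */
   uint64_t buffer_addr = 0x1000;                    /* hypothetical address */
   float coeffs[4] = { 0.0f, 0.25f, 0.5f, 1.0f };    /* hypothetical data */
   struct anv_kernel_arg args[2] = {
      { .is_ptr = false, .size = sizeof(buffer_addr), .u64 = buffer_addr },
      { .is_ptr = true,  .size = sizeof(coeffs),      .ptr = coeffs      },
   };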
@@ -5306,6 +5306,136 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
 }
 
+#if GFX_VERx10 >= 125
+void
+genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
+                                 struct anv_kernel *kernel,
+                                 const uint32_t *global_size,
+                                 uint32_t arg_count,
+                                 const struct anv_kernel_arg *args)
+{
+   const struct intel_device_info *devinfo = cmd_buffer->device->info;
+   const struct brw_cs_prog_data *cs_prog_data =
+      brw_cs_prog_data_const(kernel->bin->prog_data);
+
+   genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);
+
+   if (is_render_queue_cmd_buffer(cmd_buffer))
+      genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+   /* Apply any pending pipeline flushes we may have.  We want to apply them
+    * now because, if any of those flushes are for things like push constants,
+    * the GPU will read the state at weird times.
+    */
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
+   indirect_data_size += kernel->bin->bind_map.kernel_args_size;
+   indirect_data_size = ALIGN(indirect_data_size, 64);
+   struct anv_state indirect_data =
+      anv_state_stream_alloc(&cmd_buffer->general_state_stream,
+                             indirect_data_size, 64);
+   memset(indirect_data.map, 0, indirect_data.alloc_size);
+
+   struct brw_kernel_sysvals sysvals = {};
+   if (global_size != NULL) {
+      for (unsigned i = 0; i < 3; i++)
+         sysvals.num_work_groups[i] = global_size[i];
+   } else {
+      struct mi_builder b;
+      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+      struct anv_address sysvals_addr = {
+         .bo = cmd_buffer->device->general_state_pool.block_pool.bo,
+         .offset = indirect_data.offset,
+      };
+
+      mi_store(&b, mi_mem32(anv_address_add(sysvals_addr, 0)),
+                   mi_reg32(GPGPU_DISPATCHDIMX));
+      mi_store(&b, mi_mem32(anv_address_add(sysvals_addr, 4)),
+                   mi_reg32(GPGPU_DISPATCHDIMY));
+      mi_store(&b, mi_mem32(anv_address_add(sysvals_addr, 8)),
+                   mi_reg32(GPGPU_DISPATCHDIMZ));
+   }
+
+   memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
+
+   void *args_map = indirect_data.map + sizeof(sysvals);
+   for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
+      struct brw_kernel_arg_desc *arg_desc =
+         &kernel->bin->bind_map.kernel_args[i];
+      assert(i < arg_count);
+      const struct anv_kernel_arg *arg = &args[i];
+      if (arg->is_ptr) {
+         memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
+      } else {
+         assert(arg_desc->size <= sizeof(arg->u64));
+         memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
+      }
+   }
+
+   struct brw_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
+      const uint32_t subslices = MAX2(devinfo->subslice_total, 1);
+      cfe.MaximumNumberofThreads =
+         devinfo->max_cs_threads * subslices - 1;
+
+      if (cs_prog_data->base.total_scratch > 0) {
+         struct anv_bo *scratch_bo =
+            anv_scratch_pool_alloc(cmd_buffer->device,
+                                   &cmd_buffer->device->scratch_pool,
+                                   MESA_SHADER_COMPUTE,
+                                   cs_prog_data->base.total_scratch);
+         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                               cmd_buffer->batch.alloc,
+                               scratch_bo);
+         uint32_t scratch_surf =
+            anv_scratch_pool_get_surf(cmd_buffer->device,
+                                      &cmd_buffer->device->scratch_pool,
+                                      cs_prog_data->base.total_scratch);
+         cfe.ScratchSpaceBuffer = scratch_surf >> 4;
+      }
+   }
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
+      cw.PredicateEnable = false;
+      cw.SIMDSize = dispatch.simd_size / 16;
+      cw.IndirectDataStartAddress = indirect_data.offset;
+      cw.IndirectDataLength = indirect_data.alloc_size;
+      cw.LocalXMaximum = cs_prog_data->local_size[0] - 1;
+      cw.LocalYMaximum = cs_prog_data->local_size[1] - 1;
+      cw.LocalZMaximum = cs_prog_data->local_size[2] - 1;
+      cw.ExecutionMask = dispatch.right_mask;
+      cw.PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal;
+
+      if (global_size != NULL) {
+         cw.ThreadGroupIDXDimension = global_size[0];
+         cw.ThreadGroupIDYDimension = global_size[1];
+         cw.ThreadGroupIDZDimension = global_size[2];
+      } else {
+         cw.IndirectParameterEnable = true;
+      }
+
+      struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {};
+      {
+         idd.KernelStartPointer = kernel->bin->kernel.offset;
+         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
+         idd.SharedLocalMemorySize =
+            encode_slm_size(GFX_VER, cs_prog_data->base.total_shared);
+
+         if (GFX_VER > 12 || intel_device_info_is_dg2(devinfo))
+            idd.NumberOfBarriers = cs_prog_data->uses_barrier;
+         else
+            idd.BarrierEnable = cs_prog_data->uses_barrier;
+      }
+      cw.InterfaceDescriptor = idd;
+   }
+
+   /* We just blew away the compute pipeline state */
+   cmd_buffer->state.compute.pipeline_dirty = true;
+}
 
 static void
 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
 {
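(Also not part of the diff: a hypothetical call site for the new entry point, assuming cmd_buffer, kernel, and the args array from the earlier sketch already exist.)

   /* Dispatch the kernel over a fixed 64x1x1 grid of workgroups.  Passing
    * NULL for global_size would instead take the indirect path above, where
    * the dimensions are read back from the GPGPU_DISPATCHDIM* registers.
    */
   const uint32_t global_size[3] = { 64, 1, 1 };
   genX(cmd_buffer_dispatch_kernel)(cmd_buffer, kernel, global_size,
                                    2 /* arg_count */, args);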