From 6c76ceb6138c92f2c2f855f9a7ce2adca288ea3e Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Tue, 29 Sep 2020 18:47:09 -0500
Subject: [PATCH] anv: Add support for OpenCL-style kernel dispatch

v2: Use brw_cs_get_dispatch_info() (Lionel)
    Merge barrier fixes (Lionel)

Reviewed-by: Lionel Landwerlin
Acked-by: Caio Oliveira
Part-of:
---
 src/intel/vulkan/anv_genX.h        |   6 ++
 src/intel/vulkan/anv_private.h     |  12 ++-
 src/intel/vulkan/genX_cmd_buffer.c | 130 +++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index fc7c0cf26ab..73aa5ce7ca2 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -140,6 +140,12 @@ void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
                                 struct anv_address dst, struct anv_address src,
                                 uint32_t size);
 
+void genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
+                                      struct anv_kernel *kernel,
+                                      const uint32_t *global_size, /* NULL for indirect */
+                                      uint32_t arg_count,
+                                      const struct anv_kernel_arg *args);
+
 void genX(blorp_exec)(struct blorp_batch *batch,
                       const struct blorp_params *params);
 
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 8137c91b9d4..6be0b6a38ba 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -3083,12 +3083,22 @@ anv_pipeline_finish(struct anv_pipeline *pipeline,
                     struct anv_device *device,
                     const VkAllocationCallbacks *pAllocator);
 
+struct anv_kernel_arg {
+   bool is_ptr;
+   uint16_t size;
+
+   union {
+      uint64_t u64;
+      void *ptr;
+   };
+};
+
 struct anv_kernel {
 #ifndef NDEBUG
    const char *name;
 #endif
    struct anv_shader_bin *bin;
-   const struct gen_l3_config *l3_config;
+   const struct intel_l3_config *l3_config;
 };
 
 struct anv_format_plane {
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index d197bb3afac..3284b58a662 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -5306,6 +5306,136 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
 }
 
 #if GFX_VERx10 >= 125
+void
+genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
+                                 struct anv_kernel *kernel,
+                                 const uint32_t *global_size,
+                                 uint32_t arg_count,
+                                 const struct anv_kernel_arg *args)
+{
+   const struct intel_device_info *devinfo = cmd_buffer->device->info;
+   const struct brw_cs_prog_data *cs_prog_data =
+      brw_cs_prog_data_const(kernel->bin->prog_data);
+
+   genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);
+
+   if (is_render_queue_cmd_buffer(cmd_buffer))
+      genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+   /* Apply any pending pipeline flushes we may have.  We want to apply them
+    * now because, if any of those flushes are for things like push constants,
+    * the GPU will read the state at weird times.
+    */
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
+   indirect_data_size += kernel->bin->bind_map.kernel_args_size;
+   indirect_data_size = ALIGN(indirect_data_size, 64);
+   struct anv_state indirect_data =
+      anv_state_stream_alloc(&cmd_buffer->general_state_stream,
+                             indirect_data_size, 64);
+   memset(indirect_data.map, 0, indirect_data.alloc_size);
+
+   struct brw_kernel_sysvals sysvals = {};
+   if (global_size != NULL) {
+      for (unsigned i = 0; i < 3; i++)
+         sysvals.num_work_groups[i] = global_size[i];
+   } else {
+      struct mi_builder b;
+      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+      struct anv_address sysvals_addr = {
+         .bo = cmd_buffer->device->general_state_pool.block_pool.bo,
+         .offset = indirect_data.offset,
+      };
+
+      mi_store(&b, mi_mem32(anv_address_add(sysvals_addr, 0)),
+                   mi_reg32(GPGPU_DISPATCHDIMX));
+      mi_store(&b, mi_mem32(anv_address_add(sysvals_addr, 4)),
+                   mi_reg32(GPGPU_DISPATCHDIMY));
+      mi_store(&b, mi_mem32(anv_address_add(sysvals_addr, 8)),
+                   mi_reg32(GPGPU_DISPATCHDIMZ));
+   }
+
+   memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
+
+   void *args_map = indirect_data.map + sizeof(sysvals);
+   for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
+      struct brw_kernel_arg_desc *arg_desc =
+         &kernel->bin->bind_map.kernel_args[i];
+      assert(i < arg_count);
+      const struct anv_kernel_arg *arg = &args[i];
+      if (arg->is_ptr) {
+         memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
+      } else {
+         assert(arg_desc->size <= sizeof(arg->u64));
+         memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
+      }
+   }
+
+   struct brw_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
+      const uint32_t subslices = MAX2(devinfo->subslice_total, 1);
+      cfe.MaximumNumberofThreads =
+         devinfo->max_cs_threads * subslices - 1;
+
+      if (cs_prog_data->base.total_scratch > 0) {
+         struct anv_bo *scratch_bo =
+            anv_scratch_pool_alloc(cmd_buffer->device,
+                                   &cmd_buffer->device->scratch_pool,
+                                   MESA_SHADER_COMPUTE,
+                                   cs_prog_data->base.total_scratch);
+         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                               cmd_buffer->batch.alloc,
+                               scratch_bo);
+         uint32_t scratch_surf =
+            anv_scratch_pool_get_surf(cmd_buffer->device,
+                                      &cmd_buffer->device->scratch_pool,
+                                      cs_prog_data->base.total_scratch);
+         cfe.ScratchSpaceBuffer = scratch_surf >> 4;
+      }
+   }
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
+      cw.PredicateEnable = false;
+      cw.SIMDSize = dispatch.simd_size / 16;
+      cw.IndirectDataStartAddress = indirect_data.offset;
+      cw.IndirectDataLength = indirect_data.alloc_size;
+      cw.LocalXMaximum = cs_prog_data->local_size[0] - 1;
+      cw.LocalYMaximum = cs_prog_data->local_size[1] - 1;
+      cw.LocalZMaximum = cs_prog_data->local_size[2] - 1;
+      cw.ExecutionMask = dispatch.right_mask;
+      cw.PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal;
+
+      if (global_size != NULL) {
+         cw.ThreadGroupIDXDimension = global_size[0];
+         cw.ThreadGroupIDYDimension = global_size[1];
+         cw.ThreadGroupIDZDimension = global_size[2];
+      } else {
+         cw.IndirectParameterEnable = true;
+      }
+
+      struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {};
+      {
+         idd.KernelStartPointer = kernel->bin->kernel.offset;
+         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
+         idd.SharedLocalMemorySize =
+            encode_slm_size(GFX_VER, cs_prog_data->base.total_shared);
+
+         if (GFX_VER > 12 || intel_device_info_is_dg2(devinfo))
+            idd.NumberOfBarriers = cs_prog_data->uses_barrier;
+         else
+            idd.BarrierEnable = cs_prog_data->uses_barrier;
+      }
+      cw.InterfaceDescriptor = idd;
+   }
+
+   /* We just blew away the compute pipeline state */
+   cmd_buffer->state.compute.pipeline_dirty = true;
+}
+
 static void
 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
 {
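
For reference, a caller drives the new entry point by filling a flat array of
anv_kernel_arg values and passing either an explicit global size or NULL to
take the indirect (MI-generated) dispatch path. A minimal sketch, not part of
the patch: the wrapper name, the single u64 argument, and the 16x1x1 grid are
made up for illustration, and ARRAY_SIZE is Mesa's util macro; only the types
and the entry point declared above come from the patch itself.

/* Illustrative only -- not part of the patch.  Dispatches "kernel" over a
 * 16x1x1 grid with a single 64-bit scalar argument.
 */
static void
example_dispatch_kernel(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_kernel *kernel,
                        uint64_t dst_addr)
{
   const struct anv_kernel_arg args[] = {
      { .is_ptr = false, .size = sizeof(uint64_t), .u64 = dst_addr },
   };
   const uint32_t global_size[3] = { 16, 1, 1 };

   /* Pass NULL instead of global_size to dispatch indirectly from the
    * GPGPU_DISPATCHDIM* registers, as handled above.
    */
   genX(cmd_buffer_dispatch_kernel)(cmd_buffer, kernel, global_size,
                                    ARRAY_SIZE(args), args);
}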