From b741a9a851ca3747aa92ce0d6611b488c6e0e07b Mon Sep 17 00:00:00 2001
From: Ian Romanick
Date: Mon, 25 Sep 2023 09:16:55 -0700
Subject: [PATCH] anv: Set PIPELINE_SELECT systolic mode enable flag

Set the flag on compute shaders when the application has enabled the
cooperative matrix feature. We might still want to enable this only when
DPAS is actually used. The current method is based on many suggestions
from Lionel.

Reviewed-by: Lionel Landwerlin
Part-of:
---
 src/intel/genxml/gen125.xml        |  1 +
 src/intel/vulkan/anv_genX.h        |  3 ++-
 src/intel/vulkan/genX_cmd_buffer.c | 15 ++++++++++++---
 src/intel/vulkan/genX_gpu_memcpy.c |  2 +-
 src/intel/vulkan/genX_init_state.c |  8 ++++----
 5 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/intel/genxml/gen125.xml b/src/intel/genxml/gen125.xml
index bbf0a257992..b2b6567cf77 100644
--- a/src/intel/genxml/gen125.xml
+++ b/src/intel/genxml/gen125.xml
@@ -1764,6 +1764,7 @@
+
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 94cb208a103..477f1688353 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -90,7 +90,8 @@ void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
 void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
 void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
 
-void genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline);
+void genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
+                                const struct anv_device *device);
 
 void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
 
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 71ba9f0a46c..e8b992d020a 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -6815,14 +6815,23 @@ genX(CmdTraceRaysIndirect2KHR)(
  * flush_pipeline_select()
  */
 void
-genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline)
+genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
+                           const struct anv_device *device)
 {
    anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) {
-      ps.MaskBits = GFX_VER == 12 ? 0x13 : 0x3;
+      ps.MaskBits = GFX_VERx10 >= 125 ? 0x93 : GFX_VER >= 12 ? 0x13 : 0x3;
 #if GFX_VER == 12
       ps.MediaSamplerDOPClockGateEnable = true;
 #endif
       ps.PipelineSelection = pipeline;
+#if GFX_VERx10 == 125
+      /* It might still be better to only enable this when the compute
+       * pipeline will have DPAS instructions.
+       */
+      ps.SystolicModeEnable = pipeline == GPGPU &&
+         device->vk.enabled_extensions.KHR_cooperative_matrix &&
+         device->vk.enabled_features.cooperativeMatrix;
+#endif
    }
 }
 
@@ -6972,7 +6981,7 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
    }
 #endif
 
-   genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline);
+   genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline, cmd_buffer->device);
 
 #if GFX_VER == 9
    if (devinfo->platform == INTEL_PLATFORM_GLK) {
diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c
index 742aea1fa83..5e78d2ab387 100644
--- a/src/intel/vulkan/genX_gpu_memcpy.c
+++ b/src/intel/vulkan/genX_gpu_memcpy.c
@@ -256,7 +256,7 @@ genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
    const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
    genX(emit_l3_config)(batch, device, cfg);
 
-   genX(emit_pipeline_select)(batch, _3D);
+   genX(emit_pipeline_select)(batch, _3D, device);
 
    emit_common_so_memcpy(batch, device, cfg);
 }
diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c
index b4757376b7e..f4892d687b4 100644
--- a/src/intel/vulkan/genX_init_state.c
+++ b/src/intel/vulkan/genX_init_state.c
@@ -352,7 +352,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
    };
    GENX(VERTEX_ELEMENT_STATE_pack)(NULL, device->empty_vs_input, &empty_ve);
 
-   genX(emit_pipeline_select)(&batch, _3D);
+   genX(emit_pipeline_select)(&batch, _3D, device);
 
 #if GFX_VER == 9
    anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
@@ -595,7 +595,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
                                       ANV_NULL_ADDRESS,
                                       0,
                                       ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
-   genX(emit_pipeline_select)(&batch, GPGPU);
+   genX(emit_pipeline_select)(&batch, GPGPU, device);
    anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
       cfe.MaximumNumberofThreads =
          devinfo->max_cs_threads * devinfo->subslice_total;
@@ -604,7 +604,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
                                       ANV_NULL_ADDRESS,
                                       0,
                                       ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
-   genX(emit_pipeline_select)(&batch, _3D);
+   genX(emit_pipeline_select)(&batch, _3D, device);
 #endif
 
    anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
@@ -628,7 +628,7 @@ init_compute_queue_state(struct anv_queue *queue)
       .end = (void *) cmds + sizeof(cmds),
    };
 
-   genX(emit_pipeline_select)(&batch, GPGPU);
+   genX(emit_pipeline_select)(&batch, GPGPU, queue->device);
 
 #if GFX_VER == 12
    if (queue->device->info->has_aux_map) {