anv: try to keep the pipeline in GPGPU mode when doing buffer transfer ops

To avoid ping-ponging between 3D & GPGPU in the following sequence:

  vkCmdDispatch(...)
  vkCmdCopyBuffer(...)
  vkCmdDispatch(...)

We can try to keep the pipeline in GPGPU mode when doing blorp buffer
operations (we have blorp support for the CCS and can use the same
shaders on RCS).

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27956>
This commit is contained in:
Lionel Landwerlin 2024-03-01 12:39:03 +02:00 committed by Marge Bot
parent 194afe8416
commit 6823ffe70e
4 changed files with 27 additions and 3 deletions

View file

@@ -130,6 +130,10 @@ anv_blorp_batch_init(struct anv_cmd_buffer *cmd_buffer,
unreachable("unknown queue family");
}
/* Can't have both flags at the same time. */
assert((flags & BLORP_BATCH_USE_BLITTER) == 0 ||
(flags & BLORP_BATCH_USE_COMPUTE) == 0);
blorp_batch_init(&cmd_buffer->device->blorp, batch, cmd_buffer, flags);
}
@@ -1030,7 +1034,10 @@ void anv_CmdCopyBuffer2(
ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
struct blorp_batch batch;
anv_blorp_batch_init(cmd_buffer, &batch, 0);
anv_blorp_batch_init(cmd_buffer, &batch,
cmd_buffer->state.current_pipeline ==
cmd_buffer->device->physical->gpgpu_pipeline_value ?
BLORP_BATCH_USE_COMPUTE : 0);
for (unsigned r = 0; r < pCopyBufferInfo->regionCount; r++) {
copy_buffer(cmd_buffer->device, &batch, src_buffer, dst_buffer,
@@ -1054,7 +1061,10 @@ void anv_CmdUpdateBuffer(
ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
struct blorp_batch batch;
anv_blorp_batch_init(cmd_buffer, &batch, 0);
anv_blorp_batch_init(cmd_buffer, &batch,
cmd_buffer->state.current_pipeline ==
cmd_buffer->device->physical->gpgpu_pipeline_value ?
BLORP_BATCH_USE_COMPUTE : 0);
/* We can't quite grab a full block because the state stream needs a
* little data at the top to build its linked list.
@@ -1118,7 +1128,10 @@ anv_cmd_buffer_fill_area(struct anv_cmd_buffer *cmd_buffer,
struct isl_surf isl_surf;
struct blorp_batch batch;
anv_blorp_batch_init(cmd_buffer, &batch, 0);
anv_blorp_batch_init(cmd_buffer, &batch,
cmd_buffer->state.current_pipeline ==
cmd_buffer->device->physical->gpgpu_pipeline_value ?
BLORP_BATCH_USE_COMPUTE : 0);
/* First, we compute the biggest format that can be used with the
* given offsets and size.

View file

@@ -1201,6 +1201,9 @@ struct anv_physical_device {
void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address,
enum anv_timestamp_capture_type, void *);
struct intel_measure_device measure_device;
/* Value of PIPELINE_SELECT::PipelineSelection == GPGPU */
uint32_t gpgpu_pipeline_value;
};
static inline uint32_t

View file

@@ -3390,6 +3390,12 @@ anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
} else {
/* We can use the data port when trying to stay in compute mode on
* the RCS.
*/
pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
/* Most operations are done through RT/depth writes */
pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
}

View file

@@ -770,6 +770,8 @@ genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
#endif
pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
pdevice->gpgpu_pipeline_value = GPGPU;
}
VkResult