anv: Add support for a transfer queue on Alchemist

Alchemist has an improved blitter that's sufficiently powerful to
implement a transfer queue. Tigerlake's blitter lacks compression
handling and other features we need, unfortunately.

Rework (Sagar):
- Check blitter command buffer in EndCommandBuffer

v2: (Lionel)
- Look at image, buffer and memory barriers as well
- Flush cache if there is queue ownership transfer

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18325>
Author: Kenneth Graunke, 2022-06-16 09:05:30 -07:00 (committed by Marge Bot)
parent 5112b42146
commit 17b8b2cffd
5 changed files with 138 additions and 5 deletions
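
For context, the transfer-only queue family this series adds is visible to applications through the standard Vulkan queue-family query: it advertises VK_QUEUE_TRANSFER_BIT without the graphics or compute bits. A minimal sketch (standard Vulkan API; the helper name is illustrative, not part of this change):

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Return the index of a transfer-only queue family, such as the one this
 * series exposes, or UINT32_MAX if the device has none. */
static uint32_t
find_transfer_only_family(VkPhysicalDevice pdev)
{
   uint32_t count = 0;
   vkGetPhysicalDeviceQueueFamilyProperties(pdev, &count, NULL);

   VkQueueFamilyProperties props[8];
   if (count > 8)
      count = 8;
   vkGetPhysicalDeviceQueueFamilyProperties(pdev, &count, props);

   for (uint32_t i = 0; i < count; i++) {
      const VkQueueFlags flags = props[i].queueFlags;
      /* A dedicated transfer family has TRANSFER but neither GRAPHICS
       * nor COMPUTE. */
      if ((flags & VK_QUEUE_TRANSFER_BIT) &&
          !(flags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT)))
         return i;
   }
   return UINT32_MAX;
}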

--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c

@@ -112,9 +112,16 @@ static void
 anv_blorp_batch_init(struct anv_cmd_buffer *cmd_buffer,
                      struct blorp_batch *batch, enum blorp_batch_flags flags)
 {
-   if (!(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT)) {
-      assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_COMPUTE_BIT);
+   VkQueueFlags queue_flags = cmd_buffer->queue_family->queueFlags;
+
+   if (queue_flags & VK_QUEUE_GRAPHICS_BIT) {
+      /* blorp runs on render engine by default */
+   } else if (queue_flags & VK_QUEUE_COMPUTE_BIT) {
       flags |= BLORP_BATCH_USE_COMPUTE;
+   } else if (queue_flags & VK_QUEUE_TRANSFER_BIT) {
+      flags |= BLORP_BATCH_USE_BLITTER;
+   } else {
+      unreachable("unknown queue family");
    }
 
    blorp_batch_init(&cmd_buffer->device->blorp, batch, cmd_buffer, flags);

--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c

@@ -1145,6 +1145,12 @@ anv_physical_device_init_queue_families(struct anv_physical_device *pdevice)
    enum intel_engine_class compute_class =
       c_count < 1 ? INTEL_ENGINE_CLASS_RENDER : INTEL_ENGINE_CLASS_COMPUTE;
 
+   int blit_count = 0;
+   if (debug_get_bool_option("INTEL_COPY_CLASS", false)) {
+      blit_count = intel_engines_count(pdevice->engine_info,
+                                       INTEL_ENGINE_CLASS_COPY);
+   }
+
    anv_override_engine_counts(&gc_count, &g_count, &c_count, &v_count);
 
    if (gc_count > 0) {
@@ -1192,6 +1198,13 @@ anv_physical_device_init_queue_families(struct anv_physical_device *pdevice)
          .engine_class = INTEL_ENGINE_CLASS_VIDEO,
       };
    }
+   if (blit_count > 0) {
+      pdevice->queue.families[family_count++] = (struct anv_queue_family) {
+         .queueFlags = VK_QUEUE_TRANSFER_BIT,
+         .queueCount = blit_count,
+         .engine_class = INTEL_ENGINE_CLASS_COPY,
+      };
+   }
 
    /* Increase count below when other families are added as a reminder to
     * increase the ANV_MAX_QUEUE_FAMILIES value.
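
With the family above advertised (gated behind the INTEL_COPY_CLASS debug option in this hunk), an application requests a queue from it at device creation. A hedged sketch, assuming the same <vulkan/vulkan.h> include as the earlier example; transfer_family is the index found by that query:

/* Sketch: create a logical device with one queue from the transfer-only
 * family. Illustrative helper, not part of this change. */
static VkResult
create_device_with_transfer_queue(VkPhysicalDevice pdev,
                                  uint32_t transfer_family,
                                  VkDevice *out_device, VkQueue *out_queue)
{
   const float prio = 1.0f;
   const VkDeviceQueueCreateInfo queue_info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
      .queueFamilyIndex = transfer_family,
      .queueCount = 1,
      .pQueuePriorities = &prio,
   };
   const VkDeviceCreateInfo device_info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
      .queueCreateInfoCount = 1,
      .pQueueCreateInfos = &queue_info,
   };
   VkResult res = vkCreateDevice(pdev, &device_info, NULL, out_device);
   if (res == VK_SUCCESS)
      vkGetDeviceQueue(*out_device, transfer_family, 0, out_queue);
   return res;
}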

--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c

@@ -414,6 +414,18 @@ blorp_exec_on_compute(struct blorp_batch *batch,
    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
 }
 
+static void
+blorp_exec_on_blitter(struct blorp_batch *batch,
+                      const struct blorp_params *params)
+{
+   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
+
+   struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+   assert(cmd_buffer->queue_family->queueFlags == VK_QUEUE_TRANSFER_BIT);
+
+   blorp_exec(batch, params);
+}
+
 void
 genX(blorp_exec)(struct blorp_batch *batch,
                  const struct blorp_params *params)
@@ -430,7 +442,9 @@ genX(blorp_exec)(struct blorp_batch *batch,
       genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
    }
 
-   if (batch->flags & BLORP_BATCH_USE_COMPUTE)
+   if (batch->flags & BLORP_BATCH_USE_BLITTER)
+      blorp_exec_on_blitter(batch, params);
+   else if (batch->flags & BLORP_BATCH_USE_COMPUTE)
       blorp_exec_on_compute(batch, params);
    else
       blorp_exec_on_render(batch, params);

--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c

@@ -1808,6 +1808,9 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
    else if (bits == 0)
       return;
 
+   if (anv_cmd_buffer_is_blitter_queue(cmd_buffer))
+      return;
+
    const bool trace_flush =
       (bits & (ANV_PIPE_FLUSH_BITS |
                ANV_PIPE_STALL_BITS |
@@ -3390,7 +3393,8 @@ genX(BeginCommandBuffer)(
 
    trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
 
-   if (anv_cmd_buffer_is_video_queue(cmd_buffer))
+   if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
+       anv_cmd_buffer_is_blitter_queue(cmd_buffer))
       return VK_SUCCESS;
 
    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
@@ -3560,7 +3564,8 @@ end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
 
    anv_measure_endcommandbuffer(cmd_buffer);
 
-   if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+   if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
+       anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
       trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
       anv_cmd_buffer_end_batch_buffer(cmd_buffer);
       return VK_SUCCESS;
@@ -3947,6 +3952,88 @@ cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
       }
    }
 
+static void
+cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
+                           const VkDependencyInfo *dep_info)
+{
+#if GFX_VERx10 >= 125
+   assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+
+   /* The blitter requires an MI_FLUSH_DW command when a buffer transitions
+    * from being a destination to a source.
+    */
+   bool flush_llc = false;
+   bool flush_ccs = false;
+   for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
+      const VkImageMemoryBarrier2 *img_barrier =
+         &dep_info->pImageMemoryBarriers[i];
+
+      ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
+      const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
+
+      /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
+       * memory barrier defines a queue family ownership transfer operation.
+       */
+      if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
+         flush_llc = true;
+
+      /* Flush the cache if a transfer command reads the output of a
+       * previous transfer command. Ideally we would just wait for
+       * completion, but for now flush the cache to make the data visible.
+       */
+      if ((img_barrier->oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ||
+           img_barrier->oldLayout == VK_IMAGE_LAYOUT_GENERAL) &&
+          (img_barrier->newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL ||
+           img_barrier->newLayout == VK_IMAGE_LAYOUT_GENERAL)) {
+         flush_llc = true;
+      }
+
+      VkImageAspectFlags img_aspects =
+         vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+      anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
+         const uint32_t plane =
+            anv_image_aspect_to_plane(image, 1UL << aspect_bit);
+         if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
+            flush_ccs = true;
+         }
+      }
+   }
+
+   for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
+      /* Flush the cache if something written by a transfer command is used
+       * by any stage other than the transfer stage, or if
+       * srcQueueFamilyIndex differs from dstQueueFamilyIndex, in which case
+       * the barrier defines a queue family ownership transfer operation.
+       */
+      if ((stage_is_transfer(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
+           mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask)) ||
+          (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
+           dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
+         flush_llc = true;
+         break;
+      }
+   }
+
+   for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
+      /* Flush the cache if something written by a transfer command is used
+       * by any stage other than the transfer stage.
+       */
+      if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
+          mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask)) {
+         flush_llc = true;
+         break;
+      }
+   }
+
+   if (flush_ccs || flush_llc) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
+         fd.FlushCCS = flush_ccs;
+         fd.FlushLLC = flush_llc;
+      }
+   }
+#endif
+}
+
 static void
 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
                    const VkDependencyInfo *dep_info,
@@ -3957,6 +4044,11 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
       return;
    }
 
+   if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+      cmd_buffer_barrier_blitter(cmd_buffer, dep_info);
+      return;
+   }
+
    struct anv_device *device = cmd_buffer->device;
 
    /* XXX: Right now, we're really dumb and just flush whatever categories
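
The queue-ownership-transfer case that sets flush_llc above corresponds to the application recording a release barrier on the transfer queue, roughly as follows (a sketch using the standard synchronization2 API; the helper, handles, and family indices are illustrative). Because srcQueueFamilyIndex and dstQueueFamilyIndex differ, cmd_buffer_barrier_blitter() emits MI_FLUSH_DW with FlushLLC set:

/* Sketch: the release half of a queue family ownership transfer, recorded
 * on the transfer queue. Requires VK_KHR_synchronization2 or Vulkan 1.3. */
static void
release_buffer_to_graphics(VkCommandBuffer transfer_cmd_buf, VkBuffer buffer,
                           uint32_t transfer_family, uint32_t graphics_family)
{
   const VkBufferMemoryBarrier2 release = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COPY_BIT,
      .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_NONE,   /* release half of a QFOT */
      .dstAccessMask = VK_ACCESS_2_NONE,
      .srcQueueFamilyIndex = transfer_family,
      .dstQueueFamilyIndex = graphics_family,
      .buffer = buffer,
      .offset = 0,
      .size = VK_WHOLE_SIZE,
   };
   const VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .bufferMemoryBarrierCount = 1,
      .pBufferMemoryBarriers = &release,
   };
   vkCmdPipelineBarrier2(transfer_cmd_buf, &dep);
}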

--- a/src/intel/vulkan/genX_state.c
+++ b/src/intel/vulkan/genX_state.c

@@ -711,6 +711,13 @@ genX(init_device_state)(struct anv_device *device)
    case INTEL_ENGINE_CLASS_VIDEO:
       res = VK_SUCCESS;
       break;
+   case INTEL_ENGINE_CLASS_COPY:
+      /* Execute the RCS init batch on the companion RCS command buffer by
+       * default, in order to support MSAA copy/clear operations on the
+       * copy queue.
+       */
+      res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
+      break;
    default:
       res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
       break;