anv: make device initialization more asynchronous

With this change, the engine initialization batches are built and
submitted at vkCreateDevice() but the function doesn't wait for them
to complete. Instead we wait at vkDestroyDevice() or whenever another
submission happens on the queue, we check whether the initialization
batch has completed (without waiting) and free it if completed.

Seems to give about a 25% reduction in vkCreateDevice() time.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28975>
This commit is contained in:
Lionel Landwerlin 2024-04-29 17:40:56 +03:00 committed by Marge Bot
parent 729c0b54b6
commit 49d2d25e24
13 changed files with 188 additions and 292 deletions

View file

@ -1561,6 +1561,21 @@ anv_queue_submit_cmd_buffers_locked(struct anv_queue *queue,
return VK_SUCCESS;
}
/* Opportunistically reclaim a queue's device-initialization submissions.
 *
 * The init batches are submitted asynchronously at device creation; here we
 * only check (without waiting) whether they have completed and, if so, free
 * them and clear the queue's reference so this becomes a no-op next time.
 */
static inline void
anv_queue_free_initial_submission(struct anv_queue *queue)
{
   struct anv_async_submit *submit = queue->init_submit;
   if (submit != NULL && anv_async_submit_done(submit)) {
      anv_async_submit_destroy(submit);
      queue->init_submit = NULL;
   }

   /* Same treatment for the companion RCS engine's init batch, if any. */
   submit = queue->init_companion_submit;
   if (submit != NULL && anv_async_submit_done(submit)) {
      anv_async_submit_destroy(submit);
      queue->init_companion_submit = NULL;
   }
}
VkResult
anv_queue_submit(struct vk_queue *vk_queue,
struct vk_queue_submit *submit)
@ -1569,6 +1584,8 @@ anv_queue_submit(struct vk_queue *vk_queue,
struct anv_device *device = queue->device;
VkResult result;
anv_queue_free_initial_submission(queue);
if (queue->device->info->no_hw) {
for (uint32_t i = 0; i < submit->signal_count; i++) {
result = vk_sync_signal(&device->vk,
@ -1615,59 +1632,6 @@ anv_queue_submit(struct vk_queue *vk_queue,
return result;
}
/*
 * Synchronously execute a small, self-contained batch on @queue.
 *
 * The batch commands are copied into a BO from the device's batch BO pool,
 * handed to the kernel-mode backend, and this function blocks until the
 * backend reports completion. Only used during device initialization, so
 * the queue is assumed empty (no submit-thread contention).
 *
 * @queue                  target queue
 * @batch                  CPU-built command batch to execute
 * @is_companion_rcs_batch true to execute on the queue's companion RCS
 *                         engine rather than the queue's own engine
 *
 * Returns the batch's recorded error if it is already broken, VK_SUCCESS
 * immediately under no_hw, otherwise the backend's execution result.
 */
VkResult
anv_queue_submit_simple_batch(struct anv_queue *queue,
                              struct anv_batch *batch,
                              bool is_companion_rcs_batch)
{
   struct anv_device *device = queue->device;
   VkResult result = VK_SUCCESS;

   /* Propagate any error recorded while the batch was being emitted. */
   if (anv_batch_has_error(batch))
      return batch->status;

   /* Nothing to execute when running without hardware. */
   if (queue->device->info->no_hw)
      return VK_SUCCESS;

   /* This is only used by device init so we can assume the queue is empty and
    * we aren't fighting with a submit thread.
    */
   assert(vk_queue_is_empty(&queue->vk));

   /* Submission sizes are 8-byte aligned. */
   uint32_t batch_size = align(batch->next - batch->start, 8);

   struct anv_bo *batch_bo = NULL;
   result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo);
   if (result != VK_SUCCESS)
      return result;

   memcpy(batch_bo->map, batch->start, batch_size);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
   /* On integrated GPUs whose BO mappings are not coherent, flush the CPU
    * cache range so the GPU observes the freshly written commands.
    */
   if (device->physical->memory.need_flush &&
       anv_bo_needs_host_cache_flush(batch_bo->alloc_flags))
      intel_flush_range(batch_bo->map, batch_size);
#endif

   /* Dump the batch when INTEL_DEBUG=batch is active for this frame. */
   if (INTEL_DEBUG(DEBUG_BATCH) &&
       intel_debug_batch_in_range(device->debug_frame_desc->frame_id)) {
      int render_queue_idx =
         anv_get_first_render_queue_index(device->physical);
      /* Companion RCS batches decode with a render-engine decoder. */
      struct intel_batch_decode_ctx *ctx = is_companion_rcs_batch ?
                                           &device->decoder[render_queue_idx] :
                                           queue->decoder;
      intel_print_batch(ctx, batch_bo->map, batch_bo->size, batch_bo->offset,
                        false);
   }

   /* Backend submits and waits for completion. */
   result = device->kmd_backend->execute_simple_batch(queue, batch_bo,
                                                      batch_size,
                                                      is_companion_rcs_batch);

   anv_bo_pool_free(&device->batch_bo_pool, batch_bo);

   return result;
}
void
anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
uint32_t num_cmd_buffers)

View file

@ -3421,24 +3421,9 @@ VkResult anv_CreateDevice(
goto fail_context_id;
}
device->queue_count = 0;
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
const VkDeviceQueueCreateInfo *queueCreateInfo =
&pCreateInfo->pQueueCreateInfos[i];
for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) {
result = anv_queue_init(device, &device->queues[device->queue_count],
queueCreateInfo, j);
if (result != VK_SUCCESS)
goto fail_queues;
device->queue_count++;
}
}
if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) {
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_queues;
goto fail_queues_alloc;
}
/* keep the page with address zero out of the allocator */
@ -3873,12 +3858,6 @@ VkResult anv_CreateDevice(
goto fail_internal_cache;
}
result = anv_device_init_rt_shaders(device);
if (result != VK_SUCCESS) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_print;
}
#if DETECT_OS_ANDROID
device->u_gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
#endif
@ -3903,7 +3882,7 @@ VkResult anv_CreateDevice(
&pool_info, NULL,
&device->companion_rcs_cmd_pool);
if (result != VK_SUCCESS) {
goto fail_internal_cache;
goto fail_print;
}
}
@ -3911,6 +3890,12 @@ VkResult anv_CreateDevice(
if (result != VK_SUCCESS)
goto fail_companion_cmd_pool;
result = anv_device_init_rt_shaders(device);
if (result != VK_SUCCESS) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_trtt;
}
anv_device_init_blorp(device);
anv_device_init_border_colors(device);
@ -3921,8 +3906,6 @@ VkResult anv_CreateDevice(
anv_device_perf_init(device);
anv_device_utrace_init(device);
anv_device_init_embedded_samplers(device);
BITSET_ONES(device->gfx_dirty_state);
@ -3955,22 +3938,43 @@ VkResult anv_CreateDevice(
if (device->info->ver > 9)
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PMA_FIX);
device->queue_count = 0;
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
const VkDeviceQueueCreateInfo *queueCreateInfo =
&pCreateInfo->pQueueCreateInfos[i];
for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) {
result = anv_queue_init(device, &device->queues[device->queue_count],
queueCreateInfo, j);
if (result != VK_SUCCESS)
goto fail_queues;
device->queue_count++;
}
}
anv_device_utrace_init(device);
result = anv_genX(device->info, init_device_state)(device);
if (result != VK_SUCCESS)
goto fail_inits;
goto fail_utrace;
*pDevice = anv_device_to_handle(device);
return VK_SUCCESS;
fail_inits:
anv_device_finish_trtt(device);
anv_device_finish_embedded_samplers(device);
fail_utrace:
anv_device_utrace_finish(device);
fail_queues:
for (uint32_t i = 0; i < device->queue_count; i++)
anv_queue_finish(&device->queues[i]);
anv_device_finish_embedded_samplers(device);
anv_device_finish_blorp(device);
anv_device_finish_rt_shaders(device);
anv_device_finish_astc_emu(device);
anv_device_finish_internal_kernels(device);
anv_device_finish_rt_shaders(device);
fail_trtt:
anv_device_finish_trtt(device);
fail_companion_cmd_pool:
if (device->info->verx10 >= 125) {
vk_common_DestroyCommandPool(anv_device_to_handle(device),
@ -4051,9 +4055,7 @@ VkResult anv_CreateDevice(
util_vma_heap_finish(&device->vma_hi);
util_vma_heap_finish(&device->vma_lo);
pthread_mutex_destroy(&device->vma_mutex);
fail_queues:
for (uint32_t i = 0; i < device->queue_count; i++)
anv_queue_finish(&device->queues[i]);
fail_queues_alloc:
vk_free(&device->vk.alloc, device->queues);
fail_context_id:
anv_device_destroy_context_or_vm(device);
@ -4087,12 +4089,12 @@ void anv_DestroyDevice(
/* Do TRTT batch garbage collection before destroying queues. */
anv_device_finish_trtt(device);
anv_device_utrace_finish(device);
for (uint32_t i = 0; i < device->queue_count; i++)
anv_queue_finish(&device->queues[i]);
vk_free(&device->vk.alloc, device->queues);
anv_device_utrace_finish(device);
anv_device_finish_blorp(device);
anv_device_finish_rt_shaders(device);

View file

@ -58,13 +58,6 @@ stub_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
offset);
}
/* Stub backend has no kernel driver to execute on; always fail with an
 * unknown error so callers can surface the problem.
 */
static VkResult
stub_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
                          uint32_t batch_bo_size, bool is_companion_rcs_batch)
{
   return VK_ERROR_UNKNOWN;
}
static VkResult
stub_queue_exec_locked(struct anv_queue *queue,
uint32_t wait_count,
@ -172,7 +165,6 @@ const struct anv_kmd_backend *anv_stub_kmd_backend_get(void)
.vm_bind = stub_vm_bind,
.vm_bind_bo = stub_vm_bind_bo,
.vm_unbind_bo = stub_vm_bind_bo,
.execute_simple_batch = stub_execute_simple_batch,
.queue_exec_locked = stub_queue_exec_locked,
.queue_exec_async = stub_queue_exec_async,
.bo_alloc_flags_to_bo_flags = stub_bo_alloc_flags_to_bo_flags,

View file

@ -106,10 +106,6 @@ struct anv_kmd_backend {
VkResult (*vm_bind_bo)(struct anv_device *device, struct anv_bo *bo);
VkResult (*vm_unbind_bo)(struct anv_device *device, struct anv_bo *bo);
VkResult (*execute_simple_batch)(struct anv_queue *queue,
struct anv_bo *batch_bo,
uint32_t batch_bo_size,
bool is_companion_rcs_batch);
/* The caller is expected to hold device->mutex when calling this vfunc.
*/
VkResult (*queue_exec_locked)(struct anv_queue *queue,

View file

@ -1319,6 +1319,9 @@ struct anv_queue {
struct vk_sync *companion_sync;
struct intel_ds_queue ds;
struct anv_async_submit *init_submit;
struct anv_async_submit *init_companion_submit;
};
struct nir_xfb_info;
@ -2157,9 +2160,6 @@ void anv_queue_finish(struct anv_queue *queue);
VkResult anv_queue_submit(struct vk_queue *queue,
struct vk_queue_submit *submit);
VkResult anv_queue_submit_simple_batch(struct anv_queue *queue,
struct anv_batch *batch,
bool is_companion_rcs_batch);
void anv_queue_trace(struct anv_queue *queue, const char *label,
bool frame, bool begin);

View file

@ -119,6 +119,15 @@ anv_queue_init(struct anv_device *device, struct anv_queue *queue,
void
anv_queue_finish(struct anv_queue *queue)
{
if (queue->init_submit) {
anv_async_submit_wait(queue->init_submit);
anv_async_submit_destroy(queue->init_submit);
}
if (queue->init_companion_submit) {
anv_async_submit_wait(queue->init_companion_submit);
anv_async_submit_destroy(queue->init_companion_submit);
}
if (queue->sync)
vk_sync_destroy(&queue->device->vk, queue->sync);

View file

@ -343,18 +343,21 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
{
struct anv_device *device = queue->device;
UNUSED const struct intel_device_info *devinfo = queue->device->info;
uint32_t cmds[256];
struct anv_batch batch = {
.start = cmds,
.next = cmds,
.end = (void *) cmds + sizeof(cmds),
};
struct anv_async_submit *submit;
VkResult result = anv_async_submit_create(queue,
&device->batch_bo_pool,
is_companion_rcs_batch,
true, &submit);
if (result != VK_SUCCESS)
return result;
genX(emit_pipeline_select)(&batch, _3D, device);
struct anv_batch *batch = &submit->batch;
genX(emit_pipeline_select)(batch, _3D, device);
#if GFX_VER == 9
anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
anv_batch_write_reg(batch, GENX(CACHE_MODE_1), cm1) {
cm1.FloatBlendOptimizationEnable = true;
cm1.FloatBlendOptimizationEnableMask = true;
cm1.MSCRAWHazardAvoidanceBit = true;
@ -364,9 +367,9 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
}
#endif
anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
anv_batch_emit(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
anv_batch_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
rect.ClippedDrawingRectangleYMin = 0;
rect.ClippedDrawingRectangleXMin = 0;
rect.ClippedDrawingRectangleYMax = UINT16_MAX;
@ -375,7 +378,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
rect.DrawingRectangleOriginX = 0;
}
anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
anv_batch_emit(batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
/* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
*
@ -384,7 +387,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
*
* Emit this before 3DSTATE_WM_HZ_OP below.
*/
anv_batch_emit(&batch, GENX(3DSTATE_RASTER), rast) {
anv_batch_emit(batch, GENX(3DSTATE_RASTER), rast) {
rast.APIMode = DX101;
}
@ -396,7 +399,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
*
* Emit this before 3DSTATE_WM_HZ_OP below.
*/
anv_batch_emit(&batch, GENX(3DSTATE_MULTISAMPLE), ms);
anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms);
/* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
* section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
@ -406,9 +409,9 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
* zeroed. Do it ourselves just in case. We've observed this to prevent a
* number of GPU hangs on ICL.
*/
anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp);
anv_batch_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
genX(emit_sample_pattern)(&batch, NULL);
genX(emit_sample_pattern)(batch, NULL);
#if GFX_VER == 11
/* The default behavior of bit 5 "Headerless Message for Pre-emptable
@ -416,7 +419,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
* headerless sampler messages are not allowed for pre-emptable
* contexts. Set the bit 5 to 1 to allow them.
*/
anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) {
anv_batch_write_reg(batch, GENX(SAMPLER_MODE), sm) {
sm.HeaderlessMessageforPreemptableContexts = true;
sm.HeaderlessMessageforPreemptableContextsMask = true;
}
@ -424,26 +427,26 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
/* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
* HALF_SLICE_CHICKEN7 register.
*/
anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
anv_batch_write_reg(batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
hsc7.EnabledTexelOffsetPrecisionFix = true;
hsc7.EnabledTexelOffsetPrecisionFixMask = true;
}
anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) {
anv_batch_write_reg(batch, GENX(TCCNTLREG), tcc) {
tcc.L3DataPartialWriteMergingEnable = true;
tcc.ColorZPartialWriteMergingEnable = true;
tcc.URBPartialWriteMergingEnable = true;
tcc.TCDisable = true;
}
#endif
genX(emit_slice_hashing_state)(device, &batch);
genX(emit_slice_hashing_state)(device, batch);
#if GFX_VER >= 11
/* hardware specification recommends disabling repacking for
* the compatibility with decompression mechanism in display controller.
*/
if (device->info->disable_ccs_repack) {
anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) {
anv_batch_write_reg(batch, GENX(CACHE_MODE_0), cm0) {
cm0.DisableRepackingforCompression = true;
cm0.DisableRepackingforCompressionMask = true;
}
@ -454,7 +457,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
* to command buffer level preemption to avoid rendering
* corruption.
*/
anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) {
anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
cc1.ReplayMode = MidcmdbufferPreemption;
cc1.ReplayModeMask = true;
@ -469,14 +472,14 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
* depth buffer is D16_UNORM. We've found the WA to help with more depth
* buffer configurations however, so we always disable it just to be safe.
*/
anv_batch_write_reg(&batch, GENX(HIZ_CHICKEN), reg) {
anv_batch_write_reg(batch, GENX(HIZ_CHICKEN), reg) {
reg.HZDepthTestLEGEOptimizationDisable = true;
reg.HZDepthTestLEGEOptimizationDisableMask = true;
}
#endif
#if GFX_VER == 12
anv_batch_write_reg(&batch, GENX(FF_MODE2), reg) {
anv_batch_write_reg(batch, GENX(FF_MODE2), reg) {
/* On Alchemist, the FF_MODE2 docs for the GS timer say:
*
* "The timer value must be set to 224."
@ -511,7 +514,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
* We implement global disabling of the optimization here and we toggle it
* in anv_image_ccs_op().
*/
anv_batch_write_reg(&batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
anv_batch_write_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
c1.RCCRHWOOptimizationDisable = true;
c1.RCCRHWOOptimizationDisableMask = true;
}
@ -526,7 +529,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
/* Enable the new line drawing algorithm that produces higher quality
* lines.
*/
anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) {
anv_batch_write_reg(batch, AA_LINE_QUALITY_REG, c3) {
c3.AALineQualityFix = true;
c3.AALineQualityFixMask = true;
}
@ -536,11 +539,11 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
if (device->info->has_aux_map) {
uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
assert(aux_base_addr % (32 * 1024) == 0);
anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
lri.DataDWord = aux_base_addr & 0xffffffff;
}
anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
lri.DataDWord = aux_base_addr >> 32;
}
@ -548,7 +551,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
#endif
#if GFX_VERx10 == 125
anv_batch_write_reg(&batch, GENX(CHICKEN_RASTER_2), reg) {
anv_batch_write_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
reg.TBIMRBatchSizeOverride = true;
reg.TBIMROpenBatchEnable = true;
reg.TBIMRFastClip = true;
@ -564,21 +567,21 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
* This is only safe on kernels with context isolation support.
*/
assert(device->physical->info.has_context_isolation);
anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) {
anv_batch_write_reg(batch, GENX(CS_DEBUG_MODE2), csdm2) {
csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
}
init_common_queue_state(queue, &batch);
init_common_queue_state(queue, batch);
/* Because 3DSTATE_CPS::CoarsePixelShadingStateArrayPointer is relative to
* the dynamic state base address we need to emit this instruction after
* STATE_BASE_ADDRESS in init_common_queue_state().
*/
#if GFX_VER == 11
anv_batch_emit(&batch, GENX(3DSTATE_CPS), cps);
anv_batch_emit(batch, GENX(3DSTATE_CPS), cps);
#elif GFX_VER >= 12
anv_batch_emit(&batch, GENX(3DSTATE_CPS_POINTERS), cps) {
anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) {
assert(device->cps_states.alloc_size != 0);
/* Offset 0 is the disabled state */
cps.CoarsePixelShadingStateArrayPointer =
@ -587,27 +590,27 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
#endif
#if GFX_VERx10 >= 125
anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), cm) {
anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
cm.Mask1 = 0xffff;
#if GFX_VERx10 >= 200
cm.Mask2 = 0xffff;
#endif
}
anv_batch_emit(&batch, GENX(3DSTATE_MESH_CONTROL), zero);
anv_batch_emit(&batch, GENX(3DSTATE_TASK_CONTROL), zero);
anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
/* We no longer required to explicitly flush or invalidate caches since the
* PIPELINE_SELECT is getting deprecated on Xe2+.
*/
#if GFX_VER < 20
genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
genx_batch_emit_pipe_control_write(batch, device->info, _3D, NoWrite,
ANV_NULL_ADDRESS,
0,
ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
#endif
genX(emit_pipeline_select)(&batch, GPGPU, device);
anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
genX(emit_pipeline_select)(batch, GPGPU, device);
anv_batch_emit(batch, GENX(CFE_STATE), cfe) {
cfe.MaximumNumberofThreads =
devinfo->max_cs_threads * devinfo->subslice_total;
}
@ -616,48 +619,66 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
* PIPELINE_SELECT is getting deprecated on Xe2+.
*/
#if GFX_VER < 20
genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
genx_batch_emit_pipe_control_write(batch, device->info, _3D, NoWrite,
ANV_NULL_ADDRESS,
0,
ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
#endif
genX(emit_pipeline_select)(&batch, _3D, device);
genX(emit_pipeline_select)(batch, _3D, device);
#endif
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
assert(batch.next <= batch.end);
result = batch->status;
if (result != VK_SUCCESS) {
anv_async_submit_destroy(submit);
return result;
}
result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
if (result != VK_SUCCESS) {
anv_async_submit_destroy(submit);
return result;
}
if (!device->trtt.queue)
device->trtt.queue = queue;
return anv_queue_submit_simple_batch(queue, &batch, is_companion_rcs_batch);
if (is_companion_rcs_batch)
queue->init_companion_submit = submit;
else
queue->init_submit = submit;
return VK_SUCCESS;
}
static VkResult
init_compute_queue_state(struct anv_queue *queue)
{
UNUSED const struct intel_device_info *devinfo = queue->device->info;
uint32_t cmds[64];
struct anv_batch batch = {
.start = cmds,
.next = cmds,
.end = (void *) cmds + sizeof(cmds),
};
struct anv_device *device = queue->device;
UNUSED const struct intel_device_info *devinfo = device->info;
struct anv_async_submit *submit;
VkResult result = anv_async_submit_create(queue,
&device->batch_bo_pool,
false, true, &submit);
if (result != VK_SUCCESS)
return result;
genX(emit_pipeline_select)(&batch, GPGPU, queue->device);
struct anv_batch *batch = &submit->batch;
genX(emit_pipeline_select)(batch, GPGPU, queue->device);
#if GFX_VER == 12
if (queue->device->info->has_aux_map) {
uint64_t aux_base_addr =
intel_aux_map_get_base(queue->device->aux_map_ctx);
assert(aux_base_addr % (32 * 1024) == 0);
anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
lri.DataDWord = aux_base_addr & 0xffffffff;
}
anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num) + 4;
lri.DataDWord = aux_base_addr >> 32;
}
@ -672,7 +693,7 @@ init_compute_queue_state(struct anv_queue *queue)
*/
if (intel_needs_workaround(devinfo, 14015782607) &&
queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
genx_batch_emit_pipe_control(&batch, devinfo, GPGPU,
genx_batch_emit_pipe_control(batch, devinfo, GPGPU,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
@ -685,7 +706,7 @@ init_compute_queue_state(struct anv_queue *queue)
if (intel_device_info_is_atsm(devinfo) &&
queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
genx_batch_emit_pipe_control
(&batch, devinfo, GPGPU,
(batch, devinfo, GPGPU,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
@ -695,7 +716,7 @@ init_compute_queue_state(struct anv_queue *queue)
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
}
anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), cm) {
anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
#if GFX_VER < 20
cm.PixelAsyncComputeThreadLimit = 4;
cm.PixelAsyncComputeThreadLimitMask = 0x7;
@ -703,36 +724,51 @@ init_compute_queue_state(struct anv_queue *queue)
}
#endif
init_common_queue_state(queue, &batch);
init_common_queue_state(queue, batch);
#if GFX_VERx10 >= 125
anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
anv_batch_emit(batch, GENX(CFE_STATE), cfe) {
cfe.MaximumNumberofThreads =
devinfo->max_cs_threads * devinfo->subslice_total;
}
#endif
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
assert(batch.next <= batch.end);
result = batch->status;
if (result != VK_SUCCESS) {
anv_async_submit_destroy(submit);
return result;
}
return anv_queue_submit_simple_batch(queue, &batch,
false /* is_companion_rcs_batch */);
result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
if (result != VK_SUCCESS) {
anv_async_submit_destroy(submit);
return result;
}
queue->init_submit = submit;
return VK_SUCCESS;
}
static VkResult
init_copy_video_queue_state(struct anv_queue *queue)
{
#if GFX_VER >= 12
UNUSED const struct intel_device_info *devinfo = queue->device->info;
uint32_t cmds[64];
UNUSED struct anv_batch batch = {
.start = cmds,
.next = cmds,
.end = (void *) cmds + sizeof(cmds),
};
struct anv_device *device = queue->device;
const struct intel_device_info *devinfo = device->info;
if (devinfo->has_aux_map) {
struct anv_async_submit *submit;
VkResult result = anv_async_submit_create(queue,
&device->batch_bo_pool,
false, true, &submit);
if (result != VK_SUCCESS)
return result;
struct anv_batch *batch = &submit->batch;
if (queue->device->info->has_aux_map) {
uint64_t reg = GENX(VD0_AUX_TABLE_BASE_ADDR_num);
if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY) {
@ -744,20 +780,30 @@ init_copy_video_queue_state(struct anv_queue *queue)
uint64_t aux_base_addr =
intel_aux_map_get_base(queue->device->aux_map_ctx);
assert(aux_base_addr % (32 * 1024) == 0);
anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = reg;
lri.DataDWord = aux_base_addr & 0xffffffff;
}
anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = reg + 4;
lri.DataDWord = aux_base_addr >> 32;
}
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
assert(batch.next <= batch.end);
anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
return anv_queue_submit_simple_batch(queue, &batch,
false /* is_companion_rcs_batch */);
result = batch->status;
if (result != VK_SUCCESS) {
anv_async_submit_destroy(submit);
return result;
}
result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
if (result != VK_SUCCESS) {
anv_async_submit_destroy(submit);
return result;
}
queue->init_submit = submit;
}
#else
assert(!queue->device->info->has_aux_map);

View file

@ -1004,50 +1004,3 @@ i915_queue_exec_locked(struct anv_queue *queue,
return result;
}
/*
 * i915 backend: synchronously execute a single-BO batch on @queue.
 *
 * Builds a minimal execbuffer2 submission containing only @batch_bo,
 * submits it on the queue's context (or the companion RCS context when
 * @is_companion_rcs_batch), then blocks until the BO is idle. Any
 * submission or wait failure marks the logical device lost.
 */
VkResult
i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
                          uint32_t batch_bo_size, bool is_companion_rcs_batch)
{
   struct anv_device *device = queue->device;
   struct anv_execbuf execbuf = {
      .alloc = &queue->device->vk.alloc,
      .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
   };

   /* The batch BO is the only object in this submission. */
   VkResult result = anv_execbuf_add_bo(device, &execbuf, batch_bo, NULL, 0);
   if (result != VK_SUCCESS)
      goto fail;

   /* Companion RCS submissions require VM control support. */
   assert(!is_companion_rcs_batch || device->physical->has_vm_control);
   uint64_t exec_flags = 0;
   uint32_t context_id;
   get_context_and_exec_flags(queue, is_companion_rcs_batch, &exec_flags,
                              &context_id);

   execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf.objects,
      .buffer_count = execbuf.bo_count,
      .batch_start_offset = 0,
      .batch_len = batch_bo_size,
      .flags = I915_EXEC_HANDLE_LUT | exec_flags | I915_EXEC_NO_RELOC,
      .rsvd1 = context_id,
      .rsvd2 = 0,
   };

   ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);

   if (anv_gem_execbuffer(device, &execbuf.execbuf)) {
      result = vk_device_set_lost(&device->vk, "anv_gem_execbuffer failed: %m");
      goto fail;
   }

   /* Block until the batch BO becomes idle (i.e. execution finished). */
   result = anv_device_wait(device, batch_bo, INT64_MAX);
   if (result != VK_SUCCESS)
      result = vk_device_set_lost(&device->vk,
                                  "anv_device_wait failed: %m");

fail:
   anv_execbuf_finish(&execbuf);
   return result;
}

View file

@ -30,7 +30,6 @@
#include "vk_sync.h"
struct anv_queue;
struct anv_bo;
struct anv_cmd_buffer;
struct anv_query_pool;
struct anv_async_submit;
@ -43,10 +42,6 @@ i915_queue_exec_async(struct anv_async_submit *submit,
uint32_t signal_count,
const struct vk_sync_signal *signals);
VkResult
i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
uint32_t batch_bo_size, bool is_companion_rcs_batch);
VkResult
i915_queue_exec_locked(struct anv_queue *queue,
uint32_t wait_count,

View file

@ -296,7 +296,6 @@ anv_i915_kmd_backend_get(void)
.vm_bind = i915_vm_bind,
.vm_bind_bo = i915_vm_bind_bo,
.vm_unbind_bo = i915_vm_bind_bo,
.execute_simple_batch = i915_execute_simple_batch,
.queue_exec_locked = i915_queue_exec_locked,
.queue_exec_async = i915_queue_exec_async,
.bo_alloc_flags_to_bo_flags = i915_bo_alloc_flags_to_bo_flags,

View file

@ -29,61 +29,6 @@
#include "drm-uapi/xe_drm.h"
/*
 * Xe backend: synchronously execute a single-BO batch on @queue.
 *
 * Creates a temporary syncobj signaled by the exec, submits the batch on
 * the queue's exec queue (or its companion RCS exec queue when
 * @is_companion_rcs_batch), waits on the syncobj, and destroys it.
 * The submission also waits on the device's bind timeline so VM binds
 * complete before execution. Failures mark the logical device lost.
 */
VkResult
xe_execute_simple_batch(struct anv_queue *queue,
                        struct anv_bo *batch_bo,
                        uint32_t batch_bo_size,
                        bool is_companion_rcs_batch)
{
   struct anv_device *device = queue->device;
   uint32_t exec_queue_id = is_companion_rcs_batch ?
                            queue->companion_rcs_id :
                            queue->exec_queue_id;
   struct drm_syncobj_create syncobj_create = {};
   struct drm_syncobj_destroy syncobj_destroy = {};
   struct drm_xe_sync syncs[2] = {};
   VkResult result = VK_SUCCESS;

   if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &syncobj_create))
      return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create sync obj");

   /* syncs[0]: completion fence signaled by the kernel when the exec ends. */
   syncs[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
   syncs[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
   syncs[0].handle = syncobj_create.handle;

   /* vm bind sync */
   syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
   syncs[1].handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
   syncs[1].timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);

   struct drm_xe_exec exec = {
      .exec_queue_id = exec_queue_id,
      .num_batch_buffer = 1,
      .address = batch_bo->offset,
      .num_syncs = ARRAY_SIZE(syncs),
      .syncs = (uintptr_t)syncs,
   };

   if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
      result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
      goto exec_error;
   }

   /* Block until the completion fence signals. */
   struct drm_syncobj_wait wait = {
      .handles = (uintptr_t)&syncobj_create.handle,
      .timeout_nsec = INT64_MAX,
      .count_handles = 1,
   };
   if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait))
      result = vk_device_set_lost(&device->vk, "DRM_IOCTL_SYNCOBJ_WAIT failed: %m");

exec_error:
   /* Always reclaim the temporary syncobj, even on the error paths. */
   syncobj_destroy.handle = syncobj_create.handle;
   intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &syncobj_destroy);

   return result;
}
#define TYPE_SIGNAL true
#define TYPE_WAIT false

View file

@ -37,10 +37,6 @@ struct anv_query_pool;
struct anv_async_submit;
struct anv_utrace_submit;
VkResult
xe_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
uint32_t batch_bo_size, bool is_companion_rcs_batch);
VkResult
xe_queue_exec_locked(struct anv_queue *queue,
uint32_t wait_count,

View file

@ -345,7 +345,6 @@ anv_xe_kmd_backend_get(void)
.vm_bind = xe_vm_bind,
.vm_bind_bo = xe_vm_bind_bo,
.vm_unbind_bo = xe_vm_unbind_bo,
.execute_simple_batch = xe_execute_simple_batch,
.queue_exec_locked = xe_queue_exec_locked,
.queue_exec_async = xe_queue_exec_async,
.bo_alloc_flags_to_bo_flags = xe_bo_alloc_flags_to_bo_flags,