From 5ca224aa0c35bdd1acc5b9d93100fc8ec27b0f3f Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Tue, 25 Jun 2024 14:36:35 -0700 Subject: [PATCH] anv/trtt: make all contexts have the same TR-TT programming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Gen12 (the oldest we support on Mesa right now for TR-TT) we started having per-engine TR-TT registers and we are supposed to make all contexts share the same TR-TT programming. On LNL+, this is documented in the BSpec page for the TRTT_CNTRL register (68417), with more details in HSDs 14020454786 and 16022013154. On Gen12 platforms this information is a little harder to find and there's a whole trail of HSDs leading up to 1209977595, which links to the documents that describe the programming. BSpec for TR-TT on Gen12 is very confusing as it still contains registers and other information from Gen11 that were not removed. Regarding the additional BLT and COMP registers, please notice that on the BSpec pages for the TR-TT registers, the "Register Instance" section only lists the GFX registers as non-privileged. However, the "User Mode Privileged Commands" section lists the other instances of the TR-TT Registers as non-privileged, which matches what we see: there's no need to put these addresses in the FORCE_TO_NONPRIV registers. Notice that for now, when TR-TT is being used we only expose a single queue, so this change effectively does nothing until we start exposing extra queues. I left that part for later to help bisectability. 
v2: - s/trtt_init_context_state/trtt_init_queues_state/ (José) - pass device as the argument to init_queues_state (José) v3: - use async_submit_end (José) Reviewed-by: José Roberto de Souza Signed-off-by: Paulo Zanoni Part-of: --- src/intel/vulkan/anv_genX.h | 3 +- src/intel/vulkan/anv_sparse.c | 60 +++++++++++++++++------------- src/intel/vulkan/genX_init_state.c | 47 ++++++++++++++++++++--- 3 files changed, 77 insertions(+), 33 deletions(-) diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index 3f1d359c4b9..5bd07650217 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -354,8 +354,7 @@ genX(simple_shader_push_state_address)(struct anv_simple_shader *state, void genX(emit_simple_shader_end)(struct anv_simple_shader *state); -VkResult genX(init_trtt_context_state)(struct anv_device *device, - struct anv_async_submit *submit); +VkResult genX(init_trtt_context_state)(struct anv_async_submit *submit); void genX(write_trtt_entries)(struct anv_async_submit *submit, struct anv_trtt_bind *l3l2_binds, diff --git a/src/intel/vulkan/anv_sparse.c b/src/intel/vulkan/anv_sparse.c index 3c5c8e751d0..b80b681d44d 100644 --- a/src/intel/vulkan/anv_sparse.c +++ b/src/intel/vulkan/anv_sparse.c @@ -405,9 +405,8 @@ trtt_get_page_table_bo(struct anv_device *device, struct anv_bo **bo, } static VkResult -anv_trtt_init_context_state(struct anv_queue *queue) +anv_trtt_init_queues_state(struct anv_device *device) { - struct anv_device *device = queue->device; struct anv_trtt *trtt = &device->trtt; struct anv_bo *l3_bo; @@ -417,43 +416,52 @@ anv_trtt_init_context_state(struct anv_queue *queue) trtt->l3_mirror = vk_zalloc(&device->vk.alloc, 4096, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - if (!trtt->l3_mirror) { - result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - return result; - } + if (!trtt->l3_mirror) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* L3 has 512 entries, so we can have up to 512 L2 tables. 
*/ trtt->l2_mirror = vk_zalloc(&device->vk.alloc, 512 * 4096, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!trtt->l2_mirror) { - result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_free_l3; + vk_free(&device->vk.alloc, trtt->l3_mirror); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } + struct anv_async_submit submits[device->queue_count]; + int submits_used = 0; + for (uint32_t i = 0; i < device->queue_count; i++) { + struct anv_queue *q = &device->queues[i]; - struct anv_async_submit submit; - result = anv_async_submit_init(&submit, queue, &device->batch_bo_pool, - false, true); - if (result != VK_SUCCESS) - return result; + result = anv_async_submit_init(&submits[submits_used], q, + &device->batch_bo_pool, false, true); + if (result != VK_SUCCESS) + break; - result = anv_genX(device->info, init_trtt_context_state)(device, &submit); - if (result != VK_SUCCESS) - goto fail_fini_submit; + struct anv_async_submit *submit = &submits[submits_used++]; - anv_genX(device->info, async_submit_end)(&submit); + result = anv_genX(device->info, init_trtt_context_state)(submit); + if (result != VK_SUCCESS) { + anv_async_submit_fini(submit); + submits_used--; + break; + } - result = device->kmd_backend->queue_exec_async(&submit, 0, NULL, 1, - &submit.signal); + anv_genX(device->info, async_submit_end)(submit); - anv_async_submit_wait(&submit); + result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 1, + &submit->signal); + if (result != VK_SUCCESS) { + anv_async_submit_fini(submit); + submits_used--; + break; + } + } -fail_fini_submit: - anv_async_submit_fini(&submit); - return result; + for (uint32_t i = 0; i < submits_used; i++) { + anv_async_submit_wait(&submits[i]); + anv_async_submit_fini(&submits[i]); + } -fail_free_l3: - vk_free(&device->vk.alloc, trtt->l3_mirror); return result; } @@ -645,7 +653,7 @@ anv_sparse_bind_trtt(struct anv_device *device, * submission. 
*/ if (!trtt->l3_addr) { - result = anv_trtt_init_context_state(sparse_submit->queue); + result = anv_trtt_init_queues_state(device); if (result != VK_SUCCESS) goto error_add_bind; } diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c index e15729253aa..7f5ff759830 100644 --- a/src/intel/vulkan/genX_init_state.c +++ b/src/intel/vulkan/genX_init_state.c @@ -1442,10 +1442,11 @@ genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer) } VkResult -genX(init_trtt_context_state)(struct anv_device *device, - struct anv_async_submit *submit) +genX(init_trtt_context_state)(struct anv_async_submit *submit) { #if GFX_VER >= 12 + struct anv_queue *queue = submit->queue; + struct anv_device *device = queue->device; struct anv_trtt *trtt = &device->trtt; struct anv_batch *batch = &submit->batch; @@ -1462,25 +1463,61 @@ genX(init_trtt_context_state)(struct anv_device *device, anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_HIGH), trtt_base_high) trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high; + anv_batch_write_reg(batch, GENX(BLT_TRTT_INVAL), trtt_inval) + trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL; + anv_batch_write_reg(batch, GENX(BLT_TRTT_NULL), trtt_null) + trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL; + anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_LOW), trtt_base_low) + trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low; + anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_HIGH), trtt_base_high) + trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high; + + anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_INVAL), trtt_inval) + trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL; + anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_NULL), trtt_null) + trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL; + anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_LOW), trtt_base_low) + trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low; + 
anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_HIGH), trtt_base_high) + trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high; + #if GFX_VER >= 20 uint32_t trva_base = device->physical->va.trtt.addr >> 44; anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) trtt_va_range.TRVABase = trva_base; + anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range) + trtt_va_range.TRVABase = trva_base; + anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range) + trtt_va_range.TRVABase = trva_base; #else anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) { trtt_va_range.TRVAMaskValue = 0xF; trtt_va_range.TRVADataValue = 0xF; } + anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range) { + trtt_va_range.TRVAMaskValue = 0xF; + trtt_va_range.TRVADataValue = 0xF; + } + anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range) { + trtt_va_range.TRVAMaskValue = 0xF; + trtt_va_range.TRVADataValue = 0xF; + } #endif /* Enabling TR-TT needs to be done after setting up the other registers. */ anv_batch_write_reg(batch, GENX(GFX_TRTT_CR), trtt_cr) trtt_cr.TRTTEnable = true; + anv_batch_write_reg(batch, GENX(BLT_TRTT_CR), trtt_cr) + trtt_cr.TRTTEnable = true; + anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_CR), trtt_cr) + trtt_cr.TRTTEnable = true; - genx_batch_emit_pipe_control(batch, device->info, _3D, - ANV_PIPE_CS_STALL_BIT | - ANV_PIPE_TLB_INVALIDATE_BIT); + if (queue->family->engine_class != INTEL_ENGINE_CLASS_COPY) { + genx_batch_emit_pipe_control(batch, device->info, _3D, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_TLB_INVALIDATE_BIT); + } #endif return VK_SUCCESS; }