diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index 004977f3f44..489c4346c58 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -1668,37 +1668,6 @@ anv_queue_submit_simple_batch(struct anv_queue *queue, return result; } -VkResult -anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit, - struct anv_batch *batch) -{ - struct anv_queue *queue = submit->queue; - struct anv_device *device = queue->device; - VkResult result = VK_SUCCESS; - - uint32_t batch_size = align(batch->next - batch->start, 8); - struct anv_trtt_batch_bo *trtt_bbo; - result = anv_trtt_batch_bo_new(device, batch_size, &trtt_bbo); - if (result != VK_SUCCESS) - return result; - - memcpy(trtt_bbo->bo->map, batch->start, trtt_bbo->size); -#ifdef SUPPORT_INTEL_INTEGRATED_GPUS - if (device->physical->memory.need_flush && - anv_bo_needs_host_cache_flush(trtt_bbo->bo->alloc_flags)) - intel_flush_range(trtt_bbo->bo->map, trtt_bbo->size); -#endif - - if (INTEL_DEBUG(DEBUG_BATCH)) { - intel_print_batch(queue->decoder, trtt_bbo->bo->map, trtt_bbo->bo->size, - trtt_bbo->bo->offset, false); - } - - result = device->kmd_backend->execute_trtt_batch(submit, trtt_bbo); - - return result; -} - void anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers, uint32_t num_cmd_buffers) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 889f114644a..d2e9f09b785 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -3246,14 +3246,25 @@ anv_device_destroy_context_or_vm(struct anv_device *device) } } -static void +static VkResult anv_device_init_trtt(struct anv_device *device) { struct anv_trtt *trtt = &device->trtt; + VkResult result = + vk_sync_create(&device->vk, + &device->physical->sync_syncobj_type, + VK_SYNC_IS_TIMELINE, + 0 /* initial_value */, + &trtt->timeline); + if (result != VK_SUCCESS) + return result; + simple_mtx_init(&trtt->mutex, mtx_plain); list_inithead(&trtt->in_flight_batches); + + return VK_SUCCESS; } static void @@ -3261,31 +3272,9 @@ anv_device_finish_trtt(struct anv_device *device) { struct anv_trtt *trtt = &device->trtt; - if (trtt->timeline_val > 0) { - struct drm_syncobj_timeline_wait wait = { - .handles = (uintptr_t)&trtt->timeline_handle, - .points = (uintptr_t)&trtt->timeline_val, - .timeout_nsec = INT64_MAX, - .count_handles = 1, - .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, - .first_signaled = false, - }; - if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wait)) - fprintf(stderr, "TR-TT syncobj wait failed!\n"); + anv_sparse_trtt_garbage_collect_batches(device, true); - list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo, - &trtt->in_flight_batches, link) - anv_trtt_batch_bo_free(device, trtt_bbo); - - } - - if (trtt->timeline_handle > 0) { - struct drm_syncobj_destroy destroy = { - .handle = trtt->timeline_handle, - }; - if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &destroy)) - fprintf(stderr, "TR-TT syncobj destroy failed!\n"); - } + vk_sync_destroy(&device->vk, trtt->timeline); simple_mtx_destroy(&trtt->mutex); @@ -3915,6 +3904,10 @@ VkResult anv_CreateDevice( } } + result = anv_device_init_trtt(device); + if (result != VK_SUCCESS) + goto fail_companion_cmd_pool; + anv_device_init_blorp(device); anv_device_init_border_colors(device); @@ -3929,8 +3922,6 @@ VkResult anv_CreateDevice( anv_device_init_embedded_samplers(device); - anv_device_init_trtt(device); - BITSET_ONES(device->gfx_dirty_state); BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_INDEX_BUFFER); BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SO_DECL_LIST); @@ -3963,13 +3954,13 @@ VkResult anv_CreateDevice( result = anv_genX(device->info, init_device_state)(device); if (result != VK_SUCCESS) - goto fail_companion_cmd_pool; + goto fail_inits; *pDevice = anv_device_to_handle(device); return VK_SUCCESS; - fail_companion_cmd_pool: + fail_inits: anv_device_finish_trtt(device); anv_device_finish_embedded_samplers(device); anv_device_utrace_finish(device); @@ -3977,7 +3968,7 @@ VkResult anv_CreateDevice( anv_device_finish_rt_shaders(device); anv_device_finish_astc_emu(device); anv_device_finish_internal_kernels(device); - + fail_companion_cmd_pool: if (device->info->verx10 >= 125) { vk_common_DestroyCommandPool(anv_device_to_handle(device), device->companion_rcs_cmd_pool, NULL); @@ -4089,6 +4080,7 @@ void anv_DestroyDevice( struct anv_physical_device *pdevice = device->physical; + /* Do TRTT batch garbage collection before destroying queues. */ anv_device_finish_trtt(device); for (uint32_t i = 0; i < device->queue_count; i++) diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c index 6f170e29c5d..cdddeeb9054 100644 --- a/src/intel/vulkan/anv_gem_stubs.c +++ b/src/intel/vulkan/anv_gem_stubs.c @@ -65,13 +65,6 @@ stub_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, return VK_ERROR_UNKNOWN; } -static VkResult -stub_execute_trtt_batch(struct anv_sparse_submission *submit, - struct anv_trtt_batch_bo *trtt_bbo) -{ - return VK_ERROR_UNKNOWN; -} - static VkResult stub_queue_exec_locked(struct anv_queue *queue, uint32_t wait_count, @@ -180,7 +173,6 @@ const struct anv_kmd_backend *anv_stub_kmd_backend_get(void) .vm_bind_bo = stub_vm_bind_bo, .vm_unbind_bo = stub_vm_bind_bo, .execute_simple_batch = stub_execute_simple_batch, - .execute_trtt_batch = stub_execute_trtt_batch, .queue_exec_locked = stub_queue_exec_locked, .queue_exec_async = stub_queue_exec_async, .bo_alloc_flags_to_bo_flags = stub_bo_alloc_flags_to_bo_flags, diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index af82d30ddfb..c469bb57910 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -38,8 +38,10 @@ struct intel_sample_positions; struct intel_urb_config; +struct anv_async_submit; struct anv_embedded_sampler; struct anv_pipeline_embedded_sampler_binding; +struct anv_trtt_bind; typedef struct nir_builder nir_builder; typedef struct nir_shader nir_shader; @@ -351,9 +353,16 @@ genX(simple_shader_push_state_address)(struct anv_simple_shader *state, void genX(emit_simple_shader_end)(struct anv_simple_shader *state); -VkResult genX(init_trtt_context_state)(struct anv_queue *queue); +VkResult genX(init_trtt_context_state)(struct anv_device *device, + struct anv_async_submit *submit); -VkResult genX(write_trtt_entries)(struct anv_trtt_submission *submit); +void genX(write_trtt_entries)(struct anv_async_submit *submit, + struct anv_trtt_bind *l3l2_binds, + uint32_t n_l3l2_binds, + struct anv_trtt_bind *l1_binds, + uint32_t n_l1_binds); + +void genX(async_submit_end)(struct anv_async_submit *submit); void genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer, diff --git a/src/intel/vulkan/anv_kmd_backend.h b/src/intel/vulkan/anv_kmd_backend.h index 6177709fd48..fd0d006887d 100644 --- a/src/intel/vulkan/anv_kmd_backend.h +++ b/src/intel/vulkan/anv_kmd_backend.h @@ -40,7 +40,6 @@ struct anv_query_pool; struct anv_async_submit; struct anv_utrace_submit; struct anv_sparse_submission; -struct anv_trtt_batch_bo; enum anv_vm_bind_op { /* bind vma specified in anv_vm_bind */ @@ -113,8 +112,6 @@ struct anv_kmd_backend { bool is_companion_rcs_batch); /* The caller is expected to hold device->mutex when calling this vfunc. */ - VkResult (*execute_trtt_batch)(struct anv_sparse_submission *submit, - struct anv_trtt_batch_bo *trtt_bbo); VkResult (*queue_exec_locked)(struct anv_queue *queue, uint32_t wait_count, const struct vk_sync_wait *waits, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 660169593ec..6a5462ee6ad 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -768,35 +768,6 @@ struct anv_state_stream { struct util_dynarray all_blocks; }; -struct anv_sparse_submission { - struct anv_queue *queue; - - struct anv_vm_bind *binds; - int binds_len; - int binds_capacity; - - uint32_t wait_count; - uint32_t signal_count; - - struct vk_sync_wait *waits; - struct vk_sync_signal *signals; -}; - -struct anv_trtt_bind { - uint64_t pte_addr; - uint64_t entry_addr; -}; - -struct anv_trtt_submission { - struct anv_sparse_submission *sparse; - - struct anv_trtt_bind *l3l2_binds; - struct anv_trtt_bind *l1_binds; - - int l3l2_binds_len; - int l1_binds_len; -}; - /* The block_pool functions exported for testing only. The block pool should * only be used via a state pool (see below). */ @@ -1788,19 +1759,6 @@ struct anv_device_astc_emu { VkPipeline pipeline; }; -struct anv_trtt_batch_bo { - struct anv_bo *bo; - uint32_t size; - - /* Once device->trtt.timeline_handle signals timeline_val as complete we - * can free this struct and its members. - */ - uint64_t timeline_val; - - /* Part of device->trtt.in_flight_batches. */ - struct list_head link; -}; - struct anv_device { struct vk_device vk; @@ -2028,12 +1986,11 @@ struct anv_device { struct anv_bo *cur_page_table_bo; uint64_t next_page_table_bo_offset; - /* Timeline syncobj used to track completion of the TR-TT batch BOs. */ - uint32_t timeline_handle; + struct vk_sync *timeline; uint64_t timeline_val; - /* List of struct anv_trtt_batch_bo batches that are in flight and can - * be freed once their timeline gets signaled. + /* List of struct anv_trtt_submission that are in flight and can be + * freed once their vk_sync gets signaled. */ struct list_head in_flight_batches; } trtt; @@ -2203,17 +2160,6 @@ VkResult anv_queue_submit(struct vk_queue *queue, VkResult anv_queue_submit_simple_batch(struct anv_queue *queue, struct anv_batch *batch, bool is_companion_rcs_batch); -VkResult anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit, - struct anv_batch *batch); - -static inline void -anv_trtt_batch_bo_free(struct anv_device *device, - struct anv_trtt_batch_bo *trtt_bbo) -{ - anv_bo_pool_free(&device->batch_bo_pool, trtt_bbo->bo); - list_del(&trtt_bbo->link); - vk_free(&device->vk.alloc, trtt_bbo); -} void anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin); @@ -2521,6 +2467,32 @@ anv_async_submit_done(struct anv_async_submit *submit); bool anv_async_submit_wait(struct anv_async_submit *submit); +struct anv_sparse_submission { + struct anv_queue *queue; + + struct anv_vm_bind *binds; + int binds_len; + int binds_capacity; + + uint32_t wait_count; + uint32_t signal_count; + + struct vk_sync_wait *waits; + struct vk_sync_signal *signals; +}; + +struct anv_trtt_bind { + uint64_t pte_addr; + uint64_t entry_addr; +}; + +struct anv_trtt_submission { + struct anv_async_submit base; + + struct anv_sparse_submission *sparse; + + struct list_head link; +}; struct anv_device_memory { struct vk_device_memory vk; @@ -3217,6 +3189,9 @@ VkResult anv_sparse_bind_image_memory(struct anv_queue *queue, VkResult anv_sparse_bind(struct anv_device *device, struct anv_sparse_submission *sparse_submit); +VkResult anv_sparse_trtt_garbage_collect_batches(struct anv_device *device, + bool wait_completion); + VkSparseImageFormatProperties anv_sparse_calc_image_format_properties(struct anv_physical_device *pdevice, VkImageAspectFlags aspect, @@ -3236,8 +3211,6 @@ VkResult anv_sparse_image_check_support(struct anv_physical_device *pdevice, VkSampleCountFlagBits samples, VkImageType type, VkFormat format); -VkResult anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size, - struct anv_trtt_batch_bo **out_trtt_bbo); struct anv_buffer { struct vk_buffer vk; diff --git a/src/intel/vulkan/anv_sparse.c b/src/intel/vulkan/anv_sparse.c index 0e95d665434..086f12baec7 100644 --- a/src/intel/vulkan/anv_sparse.c +++ b/src/intel/vulkan/anv_sparse.c @@ -396,20 +396,11 @@ trtt_get_page_table_bo(struct anv_device *device, struct anv_bo **bo, } static VkResult -anv_trtt_init_context_state(struct anv_queue *queue) +anv_trtt_init_context_state(struct anv_device *device, + struct anv_async_submit *submit) { - struct anv_device *device = queue->device; struct anv_trtt *trtt = &device->trtt; - struct drm_syncobj_create create = { - .handle = 0, - .flags = 0, - }; - if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &create)) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - assert(create.handle != 0); - trtt->timeline_handle = create.handle; - struct anv_bo *l3_bo; VkResult result = trtt_get_page_table_bo(device, &l3_bo, &trtt->l3_addr); if (result != VK_SUCCESS) @@ -430,7 +421,7 @@ anv_trtt_init_context_state(struct anv_queue *queue) goto fail_free_l3; } - result = anv_genX(device->info, init_trtt_context_state)(queue); + result = anv_genX(device->info, init_trtt_context_state)(device, submit); return result; @@ -439,17 +430,6 @@ fail_free_l3: return result; } -static void -anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, int *binds_len, - uint64_t pte_addr, uint64_t entry_addr) -{ - binds[*binds_len] = (struct anv_trtt_bind) { - .pte_addr = pte_addr, - .entry_addr = entry_addr, - }; - (*binds_len)++; -} - /* For L3 and L2 pages, null and invalid entries are indicated by bits 1 and 0 * respectively. For L1 entries, the hardware compares the addresses against * what we program to the GFX_TRTT_NULL and GFX_TRTT_INVAL registers. @@ -457,13 +437,27 @@ anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, int *binds_len, #define ANV_TRTT_L3L2_NULL_ENTRY (1 << 1) #define ANV_TRTT_L3L2_INVALID_ENTRY (1 << 0) +static void +anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, uint32_t *binds_len, + uint64_t pte_addr, uint64_t entry_addr) +{ + binds[*binds_len] = (struct anv_trtt_bind) { + .pte_addr = pte_addr, + .entry_addr = entry_addr, + }; + (*binds_len)++; +} + /* Adds elements to the anv_trtt_bind structs passed. This doesn't write the * entries to the HW yet. */ static VkResult anv_trtt_bind_add(struct anv_device *device, uint64_t trtt_addr, uint64_t dest_addr, - struct anv_trtt_submission *s) + struct anv_trtt_bind *l3l2_binds, + uint32_t *n_l3l2_binds, + struct anv_trtt_bind *l1_binds, + uint32_t *n_l1_binds) { VkResult result = VK_SUCCESS; struct anv_trtt *trtt = &device->trtt; @@ -480,9 +474,10 @@ anv_trtt_bind_add(struct anv_device *device, if (is_null_bind) { trtt->l3_mirror[l3_index] = ANV_TRTT_L3L2_NULL_ENTRY; - anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len, - trtt->l3_addr + l3_index * sizeof(uint64_t), - ANV_TRTT_L3L2_NULL_ENTRY); + anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds, + trtt->l3_addr + l3_index * + sizeof(uint64_t), + ANV_TRTT_L3L2_NULL_ENTRY); return VK_SUCCESS; } @@ -494,8 +489,9 @@ anv_trtt_bind_add(struct anv_device *device, trtt->l3_mirror[l3_index] = l2_addr; - anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len, - trtt->l3_addr + l3_index * sizeof(uint64_t), l2_addr); + anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds, + trtt->l3_addr + l3_index * + sizeof(uint64_t), l2_addr); } assert(l2_addr != 0 && l2_addr != ANV_TRTT_L3L2_NULL_ENTRY); @@ -508,9 +504,9 @@ anv_trtt_bind_add(struct anv_device *device, trtt->l2_mirror[l3_index * 512 + l2_index] = ANV_TRTT_L3L2_NULL_ENTRY; - anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len, - l2_addr + l2_index * sizeof(uint64_t), - ANV_TRTT_L3L2_NULL_ENTRY); + anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds, + l2_addr + l2_index * sizeof(uint64_t), + ANV_TRTT_L3L2_NULL_ENTRY); return VK_SUCCESS; } @@ -522,13 +518,65 @@ anv_trtt_bind_add(struct anv_device *device, trtt->l2_mirror[l3_index * 512 + l2_index] = l1_addr; - anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len, - l2_addr + l2_index * sizeof(uint64_t), l1_addr); + anv_trtt_bind_list_add_entry(l3l2_binds, n_l3l2_binds, + l2_addr + l2_index * sizeof(uint64_t), + l1_addr); } assert(l1_addr != 0 && l1_addr != ANV_TRTT_L3L2_NULL_ENTRY); - anv_trtt_bind_list_add_entry(s->l1_binds, &s->l1_binds_len, - l1_addr + l1_index * sizeof(uint32_t), dest_addr); + anv_trtt_bind_list_add_entry(l1_binds, n_l1_binds, + l1_addr + l1_index * sizeof(uint32_t), + dest_addr); + + return VK_SUCCESS; +} + +VkResult +anv_sparse_trtt_garbage_collect_batches(struct anv_device *device, + bool wait_completion) +{ + struct anv_trtt *trtt = &device->trtt; + + uint64_t last_value; + if (!wait_completion) { + VkResult result = + vk_sync_get_value(&device->vk, trtt->timeline, &last_value); + if (result != VK_SUCCESS) + return result; + } else { + last_value = trtt->timeline_val; + } + + list_for_each_entry_safe(struct anv_trtt_submission, submit, + &trtt->in_flight_batches, link) { + if (submit->base.signal.signal_value <= last_value) { + list_del(&submit->link); + anv_async_submit_fini(&submit->base); + vk_free(&device->vk.alloc, submit); + continue; + } + + if (!wait_completion) + break; + + VkResult result = vk_sync_wait( + &device->vk, + submit->base.signal.sync, + submit->base.signal.signal_value, + VK_SYNC_WAIT_COMPLETE, + os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE)); + if (result == VK_SUCCESS) { + list_del(&submit->link); + anv_async_submit_fini(&submit->base); + vk_free(&device->vk.alloc, submit); + continue; + } + + /* If the wait failed but the caller wanted completion, return the + * error. + */ + return result; + } return VK_SUCCESS; } @@ -545,6 +593,35 @@ anv_sparse_bind_trtt(struct anv_device *device, if (!sparse_submit->queue) sparse_submit->queue = trtt->queue; + struct anv_trtt_submission *submit = + vk_zalloc(&device->vk.alloc, sizeof(*submit), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (submit == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_async_submit_init(&submit->base, sparse_submit->queue, + &device->batch_bo_pool, + false, false); + if (result != VK_SUCCESS) + goto error_async; + + simple_mtx_lock(&trtt->mutex); + + anv_sparse_trtt_garbage_collect_batches(device, false); + + submit->base.signal = (struct vk_sync_signal) { + .sync = trtt->timeline, + .signal_value = ++trtt->timeline_val, + }; + + /* If the TRTT L3 table was never set, initialize it as part of this + * submission. + */ + if (!trtt->l3_addr) + anv_trtt_init_context_state(device, &submit->base); + + assert(trtt->l3_addr); + /* These capacities are conservative estimations. For L1 binds the * number will match exactly unless we skip NULL binds due to L2 already * being NULL. For L3/L2 things are harder to estimate, but the resulting @@ -561,26 +638,15 @@ anv_sparse_bind_trtt(struct anv_device *device, l3l2_binds_capacity += (pages / 1024 + 1) * 2; } + /* Turn a series of virtual address maps, into a list of L3/L2/L1 TRTT page + * table updates. + */ STACK_ARRAY(struct anv_trtt_bind, l3l2_binds, l3l2_binds_capacity); STACK_ARRAY(struct anv_trtt_bind, l1_binds, l1_binds_capacity); - struct anv_trtt_submission trtt_submit = { - .sparse = sparse_submit, - .l3l2_binds = l3l2_binds, - .l1_binds = l1_binds, - .l3l2_binds_len = 0, - .l1_binds_len = 0, - }; - - simple_mtx_lock(&trtt->mutex); - - if (!trtt->l3_addr) - anv_trtt_init_context_state(sparse_submit->queue); - - assert(trtt->l3_addr); - - for (int b = 0; b < sparse_submit->binds_len; b++) { + uint32_t n_l3l2_binds = 0, n_l1_binds = 0; + for (int b = 0; b < sparse_submit->binds_len && result == VK_SUCCESS; b++) { struct anv_vm_bind *vm_bind = &sparse_submit->binds[b]; - for (size_t i = 0; i < vm_bind->size; i += 64 * 1024) { + for (size_t i = 0; i < vm_bind->size && result == VK_SUCCESS; i += 64 * 1024) { uint64_t trtt_addr = vm_bind->address + i; uint64_t dest_addr = (vm_bind->op == ANV_VM_BIND && vm_bind->bo) ? @@ -588,29 +654,74 @@ anv_sparse_bind_trtt(struct anv_device *device, ANV_TRTT_L1_NULL_TILE_VAL; result = anv_trtt_bind_add(device, trtt_addr, dest_addr, - &trtt_submit); - if (result != VK_SUCCESS) - goto out; + l3l2_binds, &n_l3l2_binds, + l1_binds, &n_l1_binds); } } - assert(trtt_submit.l3l2_binds_len <= l3l2_binds_capacity); - assert(trtt_submit.l1_binds_len <= l1_binds_capacity); + assert(n_l3l2_binds <= l3l2_binds_capacity); + assert(n_l1_binds <= l1_binds_capacity); - sparse_debug("trtt_binds: num_vm_binds:%02d l3l2:%04d l1:%04d\n", - sparse_submit->binds_len, trtt_submit.l3l2_binds_len, - trtt_submit.l1_binds_len); + /* Convert the L3/L2/L1 TRTT page table updates in anv_trtt_bind elements + * into MI commands. + */ + if (result == VK_SUCCESS) { + sparse_debug("trtt_binds: num_vm_binds:%02d l3l2:%04d l1:%04d\n", + sparse_submit->binds_len, n_l3l2_binds, n_l1_binds); - if (trtt_submit.l3l2_binds_len || trtt_submit.l1_binds_len) - result = anv_genX(device->info, write_trtt_entries)(&trtt_submit); + if (n_l3l2_binds || n_l1_binds) { + anv_genX(device->info, write_trtt_entries)( + &submit->base, l3l2_binds, n_l3l2_binds, l1_binds, n_l1_binds); + } + } - if (result == VK_SUCCESS) - ANV_RMV(vm_binds, device, sparse_submit->binds, sparse_submit->binds_len); - -out: - simple_mtx_unlock(&trtt->mutex); STACK_ARRAY_FINISH(l1_binds); STACK_ARRAY_FINISH(l3l2_binds); + + anv_genX(device->info, async_submit_end)(&submit->base); + + if (submit->base.batch.status != VK_SUCCESS) { + result = submit->base.batch.status; + goto error_add_bind; + } + + /* Add all the BOs backing TRTT page tables to the reloc list. + * + * TODO: we could narrow down the list by using anv_address structures in + * anv_trtt_bind for the pte_addr. + */ + if (device->physical->uses_relocs) { + for (int i = 0; i < trtt->num_page_table_bos; i++) { + result = anv_reloc_list_add_bo(&submit->base.relocs, + trtt->page_table_bos[i]); + if (result != VK_SUCCESS) + goto error_add_bind; + } + } + + result = + device->kmd_backend->queue_exec_async(&submit->base, + sparse_submit->wait_count, + sparse_submit->waits, + sparse_submit->signal_count, + sparse_submit->signals); + if (result != VK_SUCCESS) + goto error_add_bind; + + + list_addtail(&submit->link, &trtt->in_flight_batches); + + simple_mtx_unlock(&trtt->mutex); + + ANV_RMV(vm_binds, device, sparse_submit->binds, sparse_submit->binds_len); + + return VK_SUCCESS; + + error_add_bind: + simple_mtx_unlock(&trtt->mutex); + anv_async_submit_fini(&submit->base); + error_async: + vk_free(&device->vk.alloc, submit); return result; } @@ -1299,65 +1410,3 @@ anv_sparse_image_check_support(struct anv_physical_device *pdevice, return VK_SUCCESS; } - -static VkResult -anv_trtt_garbage_collect_batches(struct anv_device *device) -{ - struct anv_trtt *trtt = &device->trtt; - - if (trtt->timeline_val % 8 != 7) - return VK_SUCCESS; - - uint64_t cur_timeline_val = 0; - struct drm_syncobj_timeline_array array = { - .handles = (uintptr_t)&trtt->timeline_handle, - .points = (uintptr_t)&cur_timeline_val, - .count_handles = 1, - .flags = 0, - }; - if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_QUERY, &array)) - return vk_error(device, VK_ERROR_UNKNOWN); - - list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo, - &trtt->in_flight_batches, link) { - if (trtt_bbo->timeline_val > cur_timeline_val) - return VK_SUCCESS; - - anv_trtt_batch_bo_free(device, trtt_bbo); - } - - return VK_SUCCESS; -} - -VkResult -anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size, - struct anv_trtt_batch_bo **out_trtt_bbo) -{ - struct anv_trtt *trtt = &device->trtt; - VkResult result; - - anv_trtt_garbage_collect_batches(device); - - struct anv_trtt_batch_bo *trtt_bbo = - vk_alloc(&device->vk.alloc, sizeof(*trtt_bbo), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - if (!trtt_bbo) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, - &trtt_bbo->bo); - if (result != VK_SUCCESS) - goto out; - - trtt_bbo->size = batch_size; - trtt_bbo->timeline_val = ++trtt->timeline_val; - - list_addtail(&trtt_bbo->link, &trtt->in_flight_batches); - - *out_trtt_bbo = trtt_bbo; - - return VK_SUCCESS; -out: - vk_free(&device->vk.alloc, trtt_bbo); - return result; -} diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 0270dd351c7..d4f5fdf439a 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -6094,22 +6094,17 @@ genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer, #endif } -VkResult -genX(write_trtt_entries)(struct anv_trtt_submission *submit) +void +genX(write_trtt_entries)(struct anv_async_submit *submit, + struct anv_trtt_bind *l3l2_binds, + uint32_t n_l3l2_binds, + struct anv_trtt_bind *l1_binds, + uint32_t n_l1_binds) { #if GFX_VER >= 12 const struct intel_device_info *devinfo = - submit->sparse->queue->device->info; - - size_t batch_size = submit->l3l2_binds_len * 20 + - submit->l1_binds_len * 16 + - GENX(PIPE_CONTROL_length) * sizeof(uint32_t) + 8; - STACK_ARRAY(uint32_t, cmds, batch_size); - struct anv_batch batch = { - .start = cmds, - .next = cmds, - .end = (void *)cmds + batch_size, - }; + submit->queue->device->info; + struct anv_batch *batch = &submit->batch; /* BSpec says: * "DWord Length programmed must not exceed 0x3FE." @@ -6127,90 +6122,86 @@ genX(write_trtt_entries)(struct anv_trtt_submission *submit) * contiguous addresses. */ - for (int i = 0; i < submit->l3l2_binds_len; i++) { + for (uint32_t i = 0; i < n_l3l2_binds; i++) { int extra_writes = 0; - for (int j = i + 1; - j < submit->l3l2_binds_len && - extra_writes <= max_qword_extra_writes; + for (uint32_t j = i + 1; + j < n_l3l2_binds && extra_writes <= max_qword_extra_writes; j++) { - if (submit->l3l2_binds[i].pte_addr + (j - i) * 8 == - submit->l3l2_binds[j].pte_addr) { + if (l3l2_binds[i].pte_addr + (j - i) * 8 == l3l2_binds[j].pte_addr) { extra_writes++; } else { break; } } - bool is_last_write = submit->l1_binds_len == 0 && - i + extra_writes + 1 == submit->l3l2_binds_len; + bool is_last_write = n_l1_binds == 0 && + i + extra_writes + 1 == n_l3l2_binds; uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) + qword_write_len + (extra_writes * 2); uint32_t *dw; - dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM), + dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM), .ForceWriteCompletionCheck = is_last_write, .StoreQword = true, - .Address = anv_address_from_u64(submit->l3l2_binds[i].pte_addr), + .Address = anv_address_from_u64(l3l2_binds[i].pte_addr), ); dw += 3; - for (int j = 0; j < extra_writes + 1; j++) { - uint64_t entry_addr_64b = submit->l3l2_binds[i + j].entry_addr; + for (uint32_t j = 0; j < extra_writes + 1; j++) { + uint64_t entry_addr_64b = l3l2_binds[i + j].entry_addr; *dw = entry_addr_64b & 0xFFFFFFFF; dw++; *dw = (entry_addr_64b >> 32) & 0xFFFFFFFF; dw++; } - assert(dw == batch.next); + assert(dw == batch->next); i += extra_writes; } - for (int i = 0; i < submit->l1_binds_len; i++) { + for (uint32_t i = 0; i < n_l1_binds; i++) { int extra_writes = 0; - for (int j = i + 1; - j < submit->l1_binds_len && extra_writes <= max_dword_extra_writes; + for (uint32_t j = i + 1; + j < n_l1_binds && extra_writes <= max_dword_extra_writes; j++) { - if (submit->l1_binds[i].pte_addr + (j - i) * 4 == - submit->l1_binds[j].pte_addr) { + if (l1_binds[i].pte_addr + (j - i) * 4 == + l1_binds[j].pte_addr) { extra_writes++; } else { break; } } - bool is_last_write = i + extra_writes + 1 == submit->l1_binds_len; + bool is_last_write = i + extra_writes + 1 == n_l1_binds; uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) + dword_write_len + extra_writes; uint32_t *dw; - dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM), + dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM), .ForceWriteCompletionCheck = is_last_write, - .Address = anv_address_from_u64(submit->l1_binds[i].pte_addr), + .Address = anv_address_from_u64(l1_binds[i].pte_addr), ); dw += 3; - for (int j = 0; j < extra_writes + 1; j++) { - *dw = (submit->l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF; + for (uint32_t j = 0; j < extra_writes + 1; j++) { + *dw = (l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF; dw++; } - assert(dw == batch.next); + assert(dw == batch->next); i += extra_writes; } - genx_batch_emit_pipe_control(&batch, devinfo, _3D, + genx_batch_emit_pipe_control(batch, devinfo, _3D, ANV_PIPE_CS_STALL_BIT | ANV_PIPE_TLB_INVALIDATE_BIT); - - anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); - - assert(batch.next <= batch.end); - - VkResult result = anv_queue_submit_trtt_batch(submit->sparse, &batch); - STACK_ARRAY_FINISH(cmds); - - return result; - +#else + unreachable("Not implemented"); #endif - return VK_SUCCESS; +} + +void +genX(async_submit_end)(struct anv_async_submit *submit) +{ + struct anv_batch *batch = &submit->batch; + anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe); } void diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c index bdfc95fd0d8..6feb078830b 100644 --- a/src/intel/vulkan/genX_init_state.c +++ b/src/intel/vulkan/genX_init_state.c @@ -1396,31 +1396,25 @@ genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer) } VkResult -genX(init_trtt_context_state)(struct anv_queue *queue) +genX(init_trtt_context_state)(struct anv_device *device, + struct anv_async_submit *submit) { #if GFX_VER >= 12 - struct anv_device *device = queue->device; struct anv_trtt *trtt = &device->trtt; + struct anv_batch *batch = &submit->batch; - uint32_t cmds[128]; - struct anv_batch batch = { - .start = cmds, - .next = cmds, - .end = (void *)cmds + sizeof(cmds), - }; - - anv_batch_write_reg(&batch, GENX(GFX_TRTT_INVAL), trtt_inval) { + anv_batch_write_reg(batch, GENX(GFX_TRTT_INVAL), trtt_inval) { trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL; } - anv_batch_write_reg(&batch, GENX(GFX_TRTT_NULL), trtt_null) { + anv_batch_write_reg(batch, GENX(GFX_TRTT_NULL), trtt_null) { trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL; } #if GFX_VER >= 20 - anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) { + anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) { trtt_va_range.TRVABase = device->physical->va.trtt.addr >> 44; } #else - anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) { + anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) { trtt_va_range.TRVAMaskValue = 0xF; trtt_va_range.TRVADataValue = 0xF; } @@ -1428,28 +1422,24 @@ genX(init_trtt_context_state)(struct anv_queue *queue) uint64_t l3_addr = trtt->l3_addr; assert((l3_addr & 0xFFF) == 0); - anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) { + anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) { trtt_base_low.TRVAL3PointerLowerAddress = (l3_addr & 0xFFFFF000) >> 12; } - anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_HIGH), + anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_HIGH), trtt_base_high) { trtt_base_high.TRVAL3PointerUpperAddress = (l3_addr >> 32) & 0xFFFF; } /* Enabling TR-TT needs to be done after setting up the other registers. */ - anv_batch_write_reg(&batch, GENX(GFX_TRTT_CR), trtt_cr) { + anv_batch_write_reg(batch, GENX(GFX_TRTT_CR), trtt_cr) { trtt_cr.TRTTEnable = true; } - anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); - assert(batch.next <= batch.end); - - VkResult res = anv_queue_submit_simple_batch(queue, &batch, false); - if (res != VK_SUCCESS) - return res; - + genx_batch_emit_pipe_control(batch, device->info, _3D, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_TLB_INVALIDATE_BIT); #endif return VK_SUCCESS; } diff --git a/src/intel/vulkan/i915/anv_batch_chain.c b/src/intel/vulkan/i915/anv_batch_chain.c index bda46d9a117..545484dc2e6 100644 --- a/src/intel/vulkan/i915/anv_batch_chain.c +++ b/src/intel/vulkan/i915/anv_batch_chain.c @@ -1051,105 +1051,3 @@ fail: anv_execbuf_finish(&execbuf); return result; } - -VkResult -i915_execute_trtt_batch(struct anv_sparse_submission *submit, - struct anv_trtt_batch_bo *trtt_bbo) -{ - struct anv_queue *queue = submit->queue; - struct anv_device *device = queue->device; - struct anv_trtt *trtt = &device->trtt; - struct anv_execbuf execbuf = { - .alloc = &device->vk.alloc, - .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, - }; - VkResult result; - - for (uint32_t i = 0; i < submit->wait_count; i++) { - result = anv_execbuf_add_sync(device, &execbuf, submit->waits[i].sync, - false /* is_signal */, - submit->waits[i].wait_value); - if (result != VK_SUCCESS) - goto out; - } - - for (uint32_t i = 0; i < submit->signal_count; i++) { - result = anv_execbuf_add_sync(device, &execbuf, submit->signals[i].sync, - true /* is_signal */, - submit->signals[i].signal_value); - if (result != VK_SUCCESS) - goto out; - } - - result = anv_execbuf_add_syncobj(device, &execbuf, trtt->timeline_handle, - I915_EXEC_FENCE_SIGNAL, - trtt_bbo->timeline_val); - if (result != VK_SUCCESS) - goto out; - - - result = anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, - 0); - if (result != VK_SUCCESS) - goto out; - - for (int i = 0; i < trtt->num_page_table_bos; i++) { - result = anv_execbuf_add_bo(device, &execbuf, trtt->page_table_bos[i], - NULL, EXEC_OBJECT_WRITE); - if (result != VK_SUCCESS) - goto out; - } - - if (queue->sync) { - result = anv_execbuf_add_sync(device, &execbuf, queue->sync, - true /* is_signal */, - 0 /* signal_value */); - if (result != VK_SUCCESS) - goto out; - } - - result = anv_execbuf_add_bo(device, &execbuf, trtt_bbo->bo, NULL, 0); - if (result != VK_SUCCESS) - goto out; - - if (INTEL_DEBUG(DEBUG_SUBMIT)) - anv_i915_debug_submit(&execbuf); - - uint64_t exec_flags = 0; - uint32_t context_id; - get_context_and_exec_flags(queue, false, &exec_flags, &context_id); - - execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { - .buffers_ptr = (uintptr_t) execbuf.objects, - .buffer_count = execbuf.bo_count, - .batch_start_offset = 0, - .batch_len = trtt_bbo->size, - .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | exec_flags, - .rsvd1 = context_id, - .rsvd2 = 0, - }; - setup_execbuf_fence_params(&execbuf); - - ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count); - - int ret = queue->device->info->no_hw ? 0 : - anv_gem_execbuffer(device, &execbuf.execbuf); - if (ret) { - result = vk_device_set_lost(&device->vk, - "trtt anv_gem_execbuffer failed: %m"); - goto out; - } - - if (queue->sync) { - result = vk_sync_wait(&device->vk, queue->sync, 0, - VK_SYNC_WAIT_COMPLETE, UINT64_MAX); - if (result != VK_SUCCESS) { - result = vk_queue_set_lost(&queue->vk, "trtt sync wait failed"); - goto out; - } - } - -out: - anv_execbuf_finish(&execbuf); - return result; -} diff --git a/src/intel/vulkan/i915/anv_batch_chain.h b/src/intel/vulkan/i915/anv_batch_chain.h index 6a780b24bbb..ea907adfb4f 100644 --- a/src/intel/vulkan/i915/anv_batch_chain.h +++ b/src/intel/vulkan/i915/anv_batch_chain.h @@ -29,15 +29,12 @@ #include "vk_sync.h" -struct anv_device; struct anv_queue; struct anv_bo; struct anv_cmd_buffer; struct anv_query_pool; struct anv_async_submit; struct anv_utrace_submit; -struct anv_sparse_submission; -struct anv_trtt_batch_bo; VkResult i915_queue_exec_async(struct anv_async_submit *submit, @@ -50,10 +47,6 @@ VkResult i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, uint32_t batch_bo_size, bool is_companion_rcs_batch); -VkResult -i915_execute_trtt_batch(struct anv_sparse_submission *submit, - struct anv_trtt_batch_bo *trtt_bbo); - VkResult i915_queue_exec_locked(struct anv_queue *queue, uint32_t wait_count, diff --git a/src/intel/vulkan/i915/anv_kmd_backend.c b/src/intel/vulkan/i915/anv_kmd_backend.c index 539cdad6173..41788af0e95 100644 --- a/src/intel/vulkan/i915/anv_kmd_backend.c +++ b/src/intel/vulkan/i915/anv_kmd_backend.c @@ -297,7 +297,6 @@ anv_i915_kmd_backend_get(void) .vm_bind_bo = i915_vm_bind_bo, .vm_unbind_bo = i915_vm_bind_bo, .execute_simple_batch = i915_execute_simple_batch, - .execute_trtt_batch = i915_execute_trtt_batch, .queue_exec_locked = i915_queue_exec_locked, .queue_exec_async = i915_queue_exec_async, .bo_alloc_flags_to_bo_flags = i915_bo_alloc_flags_to_bo_flags, diff --git a/src/intel/vulkan/xe/anv_batch_chain.c b/src/intel/vulkan/xe/anv_batch_chain.c index 61f912a158d..73ea9c9b034 100644 --- a/src/intel/vulkan/xe/anv_batch_chain.c +++ b/src/intel/vulkan/xe/anv_batch_chain.c @@ -183,58 +183,6 @@ xe_exec_print_debug(struct anv_queue *queue, uint32_t cmd_buffer_count, perf_query_pool, perf_query_pass); } -VkResult -xe_execute_trtt_batch(struct anv_sparse_submission *submit, - struct anv_trtt_batch_bo *trtt_bbo) -{ - struct anv_queue *queue = submit->queue; - struct anv_device *device = queue->device; - struct anv_trtt *trtt = &device->trtt; - VkResult result = VK_SUCCESS; - - struct drm_xe_sync extra_sync = { - .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ, - .flags = DRM_XE_SYNC_FLAG_SIGNAL, - .handle = trtt->timeline_handle, - .timeline_value = trtt_bbo->timeline_val, - }; - - struct drm_xe_sync *xe_syncs = NULL; - uint32_t xe_syncs_count = 0; - result = xe_exec_process_syncs(queue, submit->wait_count, submit->waits, - submit->signal_count, submit->signals, - 1, &extra_sync, - NULL, /* utrace_submit */ - false, /* is_companion_rcs_queue */ - &xe_syncs, &xe_syncs_count); - if (result != VK_SUCCESS) - return result; - - struct drm_xe_exec exec = { - .exec_queue_id = queue->exec_queue_id, - .num_syncs = xe_syncs_count, - .syncs = (uintptr_t)xe_syncs, - .address = trtt_bbo->bo->offset, - .num_batch_buffer = 1, - }; - - if (!device->info->no_hw) { - if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) { - result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m"); - goto out; - } - } - - if (queue->sync) { - result = vk_sync_wait(&device->vk, queue->sync, 0, - VK_SYNC_WAIT_COMPLETE, UINT64_MAX); - } - -out: - vk_free(&device->vk.alloc, xe_syncs); - return result; -} - VkResult xe_queue_exec_async(struct anv_async_submit *submit, uint32_t wait_count, diff --git a/src/intel/vulkan/xe/anv_batch_chain.h b/src/intel/vulkan/xe/anv_batch_chain.h index f176d98d269..cc521b5d990 100644 --- a/src/intel/vulkan/xe/anv_batch_chain.h +++ b/src/intel/vulkan/xe/anv_batch_chain.h @@ -36,17 +36,11 @@ struct anv_cmd_buffer; struct anv_query_pool; struct anv_async_submit; struct anv_utrace_submit; -struct anv_sparse_submission; -struct anv_trtt_batch_bo; VkResult xe_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, uint32_t batch_bo_size, bool is_companion_rcs_batch); -VkResult -xe_execute_trtt_batch(struct anv_sparse_submission *submit, - struct anv_trtt_batch_bo *trtt_bbo); - VkResult xe_queue_exec_locked(struct anv_queue *queue, uint32_t wait_count, diff --git a/src/intel/vulkan/xe/anv_kmd_backend.c b/src/intel/vulkan/xe/anv_kmd_backend.c index 98a95f72bc0..f1d4d789d6c 100644 --- a/src/intel/vulkan/xe/anv_kmd_backend.c +++ b/src/intel/vulkan/xe/anv_kmd_backend.c @@ -346,7 +346,6 @@ anv_xe_kmd_backend_get(void) .vm_bind_bo = xe_vm_bind_bo, .vm_unbind_bo = xe_vm_unbind_bo, .execute_simple_batch = xe_execute_simple_batch, - .execute_trtt_batch = xe_execute_trtt_batch, .queue_exec_locked = xe_queue_exec_locked, .queue_exec_async = xe_queue_exec_async, .bo_alloc_flags_to_bo_flags = xe_bo_alloc_flags_to_bo_flags,