diff --git a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
index 9a9ddbd0364..85bc0f24bed 100644
--- a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
+++ b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
@@ -38,8 +38,10 @@ struct lowering_state {
    struct hash_table *queries;
    uint32_t n_queries;
 
-   struct brw_nir_rt_globals_defs globals;
    nir_def *rq_globals;
+
+   uint32_t num_dss_rt_stacks;
+   uint32_t sync_stacks_stride;
 };
 
 struct brw_ray_query {
@@ -50,12 +52,6 @@ struct brw_ray_query {
 
 #define SIZEOF_QUERY_STATE (sizeof(uint32_t))
 
-static bool
-need_spill_fill(struct lowering_state *state)
-{
-   return state->n_queries > 1;
-}
-
 /**
  * This pass converts opaque RayQuery structures from SPIRV into a vec3 where
  * the first 2 elements store a global address for the query and the third
@@ -98,10 +94,8 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
                                NULL);
 }
 
-
-
 static nir_def *
-get_ray_query_shadow_addr(nir_builder *b,
+get_ray_query_stack_index(nir_builder *b,
                           nir_deref_instr *deref,
                           struct lowering_state *state,
                           nir_deref_instr **out_state_deref)
@@ -116,35 +110,17 @@ get_ray_query_shadow_addr(nir_builder *b,
    struct brw_ray_query *rq = entry->data;
 
-   /* Base address in the shadow memory of the variable associated with this
-    * ray query variable.
-    */
-   nir_def *base_addr =
-      nir_iadd_imm(b, state->globals.resume_sbt_addr,
-                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
-
-   bool spill_fill = need_spill_fill(state);
+   nir_def *query_idx = nir_imm_int(b, rq->id);
 
    *out_state_deref = nir_build_deref_var(b, rq->internal_var);
 
-   if (!spill_fill)
-      return NULL;
-
    /* Just emit code and let constant-folding go to town */
    nir_deref_instr **p = &path.path[1];
    for (; *p; p++) {
       if ((*p)->deref_type == nir_deref_type_array) {
          nir_def *index = (*p)->arr.index.ssa;
-
-         /**/
          *out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
-
-         /**/
-         uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
-                         brw_rt_ray_queries_shadow_stack_size(state->devinfo);
-
-         nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
-
-         base_addr = nir_iadd(b, base_addr, mul);
+         index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
+         query_idx = nir_iadd(b, query_idx, index);
       } else {
          UNREACHABLE("Unsupported deref type");
       }
@@ -152,28 +128,7 @@ get_ray_query_shadow_addr(nir_builder *b,
 
    nir_deref_path_finish(&path);
 
-   /* Add the lane offset to the shadow memory address */
-   nir_def *lane_offset =
-      nir_imul_imm(
-         b,
-         nir_iadd(
-            b,
-            nir_imul(
-               b,
-               brw_load_btd_dss_id(b),
-               state->globals.num_dss_rt_stacks),
-            brw_nir_rt_sync_stack_id(b)),
-         BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
-
-   /* Top/bottom 16 lanes each get their own stack area */
-   lane_offset = nir_bcsel(
-      b,
-      nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
-      lane_offset,
-      nir_iadd_imm(b, lane_offset,
-                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
-
-   return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
+   return query_idx;
 }
 
 static void
@@ -209,26 +164,6 @@ update_trace_ctrl_level(nir_builder *b,
    }
 }
 
-static void
-fill_query(nir_builder *b,
-           nir_def *hw_stack_addr,
-           nir_def *shadow_stack_addr,
-           nir_def *ctrl)
-{
-   brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
-                         BRW_RT_SIZEOF_RAY_QUERY);
-}
-
-static void
-spill_query(nir_builder *b,
-            nir_def *hw_stack_addr,
-            nir_def *shadow_stack_addr)
-{
-   brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
-                         BRW_RT_SIZEOF_RAY_QUERY);
-}
-
-
 static void
 lower_ray_query_intrinsic(nir_builder *b,
                           nir_intrinsic_instr *intrin,
@@ -239,12 +174,20 @@ lower_ray_query_intrinsic(nir_builder *b,
    b->cursor = nir_instr_remove(&intrin->instr);
 
    nir_deref_instr *ctrl_level_deref;
-   nir_def *shadow_stack_addr =
-      get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
-   nir_def *hw_stack_addr =
-      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
-                                 state->globals.num_dss_rt_stacks);
-   nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
+   nir_def *stack_index =
+      get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
+   nir_def *rq_globals_addr =
+      nir_iadd(b, state->rq_globals,
+               nir_i2i64(b, nir_amul_imm(b, stack_index,
+                                         BRW_RT_DISPATCH_GLOBALS_ALIGN)));
+   nir_def *stack_base_addr =
+      nir_isub(b, state->rq_globals,
+               nir_i2i64(b, nir_amul_imm(b, stack_index,
+                                         state->sync_stacks_stride)));
+   nir_def *stack_addr =
+      brw_nir_rt_sync_stack_addr(b, stack_base_addr,
+                                 state->num_dss_rt_stacks,
+                                 state->devinfo);
 
    mesa_shader_stage stage = b->shader->info.stage;
    switch (intrin->intrinsic) {
@@ -313,22 +256,12 @@ lower_ray_query_intrinsic(nir_builder *b,
          */
         brw_nir_rt_query_mark_done(b, stack_addr);
 
-        if (shadow_stack_addr)
-           fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
-
-        /* Do not use state->rq_globals, we want a uniform value for the
-         * tracing call.
-         */
-        nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
-                            level, ctrl, .synchronous = true);
+        nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
 
         struct brw_nir_rt_mem_hit_defs hit_in = {};
-        brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
+        brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
                                           state->devinfo);
 
-        if (shadow_stack_addr)
-           spill_query(b, hw_stack_addr, shadow_stack_addr);
-
         update_trace_ctrl_level(b, ctrl_level_deref,
                                 NULL, NULL,
                                 nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
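
For reference, the per-query addressing the hunk above now emits as NIR can be written out in plain C as the sketch below. This is illustrative only: the helper names and the uint64_t view of the push constant are ours, not part of the patch; "globals_base" stands for the ray_query_globals push constant and "stack_index" for the flattened query index built by get_ray_query_stack_index().

static inline uint64_t
rq_globals_addr_example(uint64_t globals_base, uint32_t stack_index)
{
   /* RT_DISPATCH_GLOBALS pairs are laid out upwards from the base address. */
   return globals_base + (uint64_t)stack_index * BRW_RT_DISPATCH_GLOBALS_ALIGN;
}

static inline uint64_t
rq_stack_base_example(uint64_t globals_base, uint32_t stack_index,
                      uint32_t sync_stacks_stride)
{
   /* Per-query HW stack areas are laid out downwards from the same base;
    * brw_nir_rt_sync_stack_addr() then derives the per-lane slot below this.
    */
   return globals_base - (uint64_t)stack_index * sync_stacks_stride;
}
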
@@ -547,21 +480,17 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
    nir_builder _b, *b = &_b;
    _b = nir_builder_at(nir_before_impl(impl));
 
-   nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
+   state->rq_globals = nir_load_ray_query_global_intel(b);
 
-   /* Use a different global for each 16lanes groups (only in SIMD32). */
-   state->rq_globals = nir_bcsel(
-      b,
-      nir_iand(b,
-               nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
-               nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
-      nir_iadd_imm(
-         b, rq_globals_base,
-         align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
-      rq_globals_base);
+   /* ATSM PRMs Vol 9, "State Model for Ray Tracing - RTDispatchGlobals"
+    *
+    *    "For Sync Ray tracing (i.e. using RayQueries), SW must allocate
+    *     space assuming 2K StackIDs"
+    */
+   state->num_dss_rt_stacks = 2048; /* TODO */
 
-   brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
-                                state->devinfo);
+   state->sync_stacks_stride =
+      brw_rt_ray_queries_stacks_stride(state->devinfo);
 
    nir_foreach_block_safe(block, impl) {
       nir_foreach_instr_safe(instr, block) {
diff --git a/src/intel/compiler/brw/brw_nir_rt_builder.h b/src/intel/compiler/brw/brw_nir_rt_builder.h
index d66fa897e4c..2062b24fc7a 100644
--- a/src/intel/compiler/brw/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw/brw_nir_rt_builder.h
@@ -178,7 +178,8 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
 static inline nir_def *
 brw_nir_rt_sync_stack_addr(nir_builder *b,
                            nir_def *base_mem_addr,
-                           nir_def *num_dss_rt_stacks)
+                           uint32_t num_dss_rt_stacks,
+                           const struct intel_device_info *devinfo)
 {
    /* Bspec 47547 (Xe) and 56936 (Xe2+) say:
     * For Ray queries (Synchronous Ray Tracing), the formula is similar but
@@ -195,12 +196,29 @@ brw_nir_rt_sync_stack_addr(nir_builder *b,
     * NUM_SYNC_STACKID_PER_DSS instead.
     */
    nir_def *offset32 =
-      nir_imul(b,
-               nir_iadd(b,
-                        nir_imul(b, brw_load_btd_dss_id(b),
-                                 num_dss_rt_stacks),
-                        nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
-               nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
+      nir_imul_imm(b,
+                   nir_iadd(b,
+                            nir_imul_imm(b, brw_load_btd_dss_id(b),
+                                         num_dss_rt_stacks),
+                            nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
+                   BRW_RT_SIZEOF_RAY_QUERY);
+
+   /* StackID offset for the bottom 16 lanes in SIMD32, this must match the
+    * offset of the second base address provided by the driver through the
+    * pair of ray query RTDispatchGlobals
+    */
+   uint32_t simd32_stack_offset =
+      num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
+      intel_device_info_dual_subslice_id_bound(devinfo);
+
+   offset32 =
+      nir_bcsel(b,
+                nir_iand(b,
+                         nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
+                         nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
+                nir_iadd_imm(b, offset32, simd32_stack_offset),
+                offset32);
+
 
    return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
 }
@@ -300,7 +318,6 @@ struct brw_nir_rt_globals_defs {
    nir_def *launch_size;
    nir_def *call_sbt_addr;
    nir_def *call_sbt_stride;
-   nir_def *resume_sbt_addr;
 };
 
 static inline void
@@ -368,8 +385,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
       defs->call_sbt_stride =
         nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
                      0x1fff);
-      defs->resume_sbt_addr =
-         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
    } else {
       defs->call_sbt_addr =
         nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
@@ -377,9 +392,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
                                nir_imm_int(b, 0)));
       defs->call_sbt_stride =
         nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
-
-      defs->resume_sbt_addr =
-         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
    }
 }
 
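
The NIR built by brw_nir_rt_sync_stack_addr() above boils down to the following scalar computation for a single invocation. This is an illustrative restatement only: the helper name and parameters are ours, with the SIMD32 subgroup condition folded into a bool.

static inline uint64_t
sync_stack_addr_example(uint64_t base_mem_addr,
                        uint32_t dss_id, uint32_t sync_stack_id,
                        uint32_t num_dss_rt_stacks,
                        uint32_t dss_id_bound,  /* intel_device_info_dual_subslice_id_bound() */
                        bool upper_simd32_half) /* subgroup_invocation >= 16 && subgroup_size == 32 */
{
   uint32_t offset32 =
      (dss_id * num_dss_rt_stacks + sync_stack_id + 1) * BRW_RT_SIZEOF_RAY_QUERY;

   /* The upper half of a SIMD32 subgroup uses a second set of stacks, meant
    * to line up with the second RT_DISPATCH_GLOBALS base provided by the
    * driver (see the comment in the hunk above).
    */
   if (upper_simd32_half)
      offset32 += num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY * dss_id_bound;

   /* Stacks grow downwards from the base address. */
   return base_mem_addr - offset32;
}
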
diff --git a/src/intel/compiler/brw/brw_rt.h b/src/intel/compiler/brw/brw_rt.h
index 4d5791c88e1..09a0b86af77 100644
--- a/src/intel/compiler/brw/brw_rt.h
+++ b/src/intel/compiler/brw/brw_rt.h
@@ -36,7 +36,7 @@ extern "C" {
 #define BRW_RT_SBT_HANDLE_SIZE 32
 
 /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
-#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
+#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
 
 /** RT_DISPATCH_GLOBALS alignment
  *
@@ -191,10 +191,6 @@ struct brw_rt_raygen_trampoline_params {
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
 
-#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
-   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
-    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
-
 #define BRW_RT_SIZEOF_HW_STACK \
    (BRW_RT_SIZEOF_HIT_INFO * 2 + \
     BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@@ -270,25 +266,15 @@ brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
 }
 
 static inline uint32_t
-brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
+brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
 {
-   /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
-    * which includes all the threads.
-    */
-   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
-   uint32_t max_simd_size = 32;
-   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
+   return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
 }
 
 static inline uint32_t
-brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
-                                      uint32_t ray_queries)
+brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
 {
-   /* Don't bother a shadow stack if we only have a single query. We can
-    * directly write in the HW buffer.
-    */
-   return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
-          ray_queries * 4; /* Ctrl + Level data */
+   return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
 }
 
 #ifdef __cplusplus
diff --git a/src/intel/genxml/gen125_rt.xml b/src/intel/genxml/gen125_rt.xml
index b23134ae038..6c4298905ae 100644
--- a/src/intel/genxml/gen125_rt.xml
+++ b/src/intel/genxml/gen125_rt.xml
@@ -28,7 +28,6 @@
-
diff --git a/src/intel/genxml/gen300_rt.xml b/src/intel/genxml/gen300_rt.xml
index 7b2bcff39cb..5861f3eacab 100644
--- a/src/intel/genxml/gen300_rt.xml
+++ b/src/intel/genxml/gen300_rt.xml
@@ -36,6 +36,5 @@
-
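
Putting the two helpers just added to brw_rt.h together, the size of the per-bucket ray-query BO allocated in the anv_cmd_buffer.c change below works out as in this sketch (the function name is illustrative, not part of the patch; util_logbase2_ceil() comes from util/u_math.h):

static inline uint64_t
ray_query_bo_size_example(const struct intel_device_info *devinfo,
                          uint32_t ray_queries)
{
   /* Power-of-two bucket, as in anv_cmd_buffer_set_rt_query_buffer(). */
   const uint32_t bucket = util_logbase2_ceil(ray_queries);

   /* Size of the RT_DISPATCH_GLOBALS array placed at the top of the BO:
    * one BRW_RT_DISPATCH_GLOBALS_ALIGN slot per query, rounded up to the
    * bucket's power-of-two query count.
    */
   const uint64_t globals_size = brw_rt_ray_queries_stacks_offset(1u << bucket);

   /* One page-aligned HW stack area per query below the globals array. */
   const uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(devinfo);

   return globals_size + (stack_stride << bucket);
}
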
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 60789abcbcc..a1da91a8f4b 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -442,58 +442,84 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t ray_queries,
                                    VkShaderStageFlags stages)
 {
-   struct anv_device *device = cmd_buffer->device;
-   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
+   if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
+      struct anv_device *device = cmd_buffer->device;
+      uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
 
-   uint64_t ray_shadow_size =
-      align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
-              4096);
-   if (ray_shadow_size > 0 &&
-       (!cmd_buffer->state.ray_query_shadow_bo ||
-        cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
-      unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
-      unsigned bucket = shadow_size_log2 - 16;
-      assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
+      unsigned bucket = util_logbase2_ceil(ray_queries);
+      assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
 
-      struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
+      uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
+      uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
+
+      struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
       if (bo == NULL) {
          struct anv_bo *new_bo;
-         VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
-                                               1 << shadow_size_log2,
-                                               ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
-                                               0, /* explicit_address */
-                                               &new_bo);
+         VkResult result =
+            anv_device_alloc_bo(device, "RT queries scratch",
+                                offset + (stride << bucket), /* size */
+                                ANV_BO_ALLOC_INTERNAL |
+                                ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
+                                0, /* explicit_address */
+                                &new_bo);
+
         ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
         if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return;
         }
 
-         bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
+         /* Map extra space we added at end of the buffer, we will write the
+          * array of RT_DISPATCH_GLOBALS into it so we can use only a single
+          * memory address in our shaders for all stacks and globals
+          */
+         void *map;
+         result = anv_device_map_bo(device, new_bo, stride << bucket,
+                                    offset, NULL, &map);
+
+         if (result != VK_SUCCESS) {
+            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
+            anv_device_release_bo(device, new_bo);
+            anv_batch_set_error(&cmd_buffer->batch, result);
+            return;
+         }
+
+         anv_genX(device->info, setup_ray_query_globals)(device,
+                                                         new_bo,
+                                                         stride << bucket,
+                                                         map,
+                                                         1 << bucket);
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+         if (device->physical->memory.need_flush)
+            util_flush_inval_range(map, offset);
+#endif
+
+         anv_device_unmap_bo(device, new_bo, map, offset, false);
+
+         bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
         if (bo != NULL) {
-            ANV_DMR_BO_FREE(&device->vk.base, new_bo);
+            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
            anv_device_release_bo(device, new_bo);
         } else {
            bo = new_bo;
         }
      }
 
-      cmd_buffer->state.ray_query_shadow_bo = bo;
-
-      /* Add the ray query buffers to the batch list. */
-      anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                            cmd_buffer->state.ray_query_shadow_bo);
+      /* Add the HW buffer to the list of BO used. */
+      anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
+
+      cmd_buffer->state.ray_query_globals = (struct anv_address) {
+         .bo = bo,
+         .offset = (int64_t) (stride << bucket),
+      };
+
+      cmd_buffer->state.num_ray_query_globals = 1 << bucket;
    }
 
-   /* Add the HW buffer to the list of BO used. */
-   assert(device->ray_query_bo[idx]);
-   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                         device->ray_query_bo[idx]);
-
-   /* Fill the push constants & mark them dirty. */
-   struct anv_address ray_query_globals_addr =
-      anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
+   /* Update the push constants & mark them dirty. */
    pipeline_state->push_constants.ray_query_globals =
-      anv_address_physical(ray_query_globals_addr);
+      anv_address_physical(cmd_buffer->state.ray_query_globals);
    cmd_buffer->state.push_constants_dirty |= stages;
    pipeline_state->push_constants_data_dirty = true;
 }
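
The resulting layout of one of these per-bucket BOs, as the code above sets it up, is sketched below (N and stride are shorthands for this illustration, not identifiers from the patch):

/*
 *   N      = 1 << bucket                  (number of queries provisioned)
 *   stride = brw_rt_ray_queries_stacks_stride(devinfo)
 *   offset = brw_rt_ray_queries_stacks_offset(N)
 *
 *   [bo + 0,          bo + N * stride)           HW stack areas, one per
 *                                                query; query N-1 lowest,
 *                                                query 0 just below the
 *                                                globals array
 *   [bo + N * stride, bo + N * stride + offset)  RT_DISPATCH_GLOBALS pairs,
 *                                                one BRW_RT_DISPATCH_GLOBALS_ALIGN
 *                                                slot per query, query 0 first
 *
 * ray_query_globals (the push constant) points at bo + N * stride, so shaders
 * index globals upwards and stacks downwards from a single address.
 */
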
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 6cb87a9ced8..86bb309ec2f 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -341,22 +341,16 @@ VkResult anv_CreateDevice(
    ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
    VkResult result;
    struct anv_device *device;
-   bool device_has_compute_queue = false;
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
 
    /* Check requested queues and fail if we are requested to create any
    * queues with flags we don't support.
    */
-   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
+   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
       if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
          return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
 
-      const struct anv_queue_family *family =
-         &physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
-      device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
-   }
-
    device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
                        sizeof(*device), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -786,36 +780,9 @@ VkResult anv_CreateDevice(
                             device->workaround_bo->size,
                             INTEL_DEBUG_BLOCK_TYPE_FRAME);
 
-   if (device->vk.enabled_extensions.KHR_ray_query) {
-      uint32_t ray_queries_size =
-         align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
-
-      result = anv_device_alloc_bo(device, "ray queries",
-                                   ray_queries_size,
-                                   ANV_BO_ALLOC_INTERNAL,
-                                   0 /* explicit_address */,
-                                   &device->ray_query_bo[0]);
-      ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
-      if (result != VK_SUCCESS)
-         goto fail_alloc_device_bo;
-
-      /* We need a separate ray query bo for CCS engine with Wa_14022863161. */
-      if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
-          device_has_compute_queue) {
-         result = anv_device_alloc_bo(device, "ray queries",
-                                      ray_queries_size,
-                                      ANV_BO_ALLOC_INTERNAL,
-                                      0 /* explicit_address */,
-                                      &device->ray_query_bo[1]);
-         ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
-         if (result != VK_SUCCESS)
-            goto fail_ray_query_bo;
-      }
-   }
-
    result = anv_device_init_trivial_batch(device);
    if (result != VK_SUCCESS)
-      goto fail_ray_query_bo;
+      goto fail_alloc_device_bo;
 
    /* Emit the CPS states before running the initialization batch as those
    * structures are referenced.
@@ -1073,13 +1040,6 @@ VkResult anv_CreateDevice(
 fail_trivial_batch:
    ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
    anv_device_release_bo(device, device->trivial_batch_bo);
-fail_ray_query_bo:
-   for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
-      if (device->ray_query_bo[i]) {
-         ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
-         anv_device_release_bo(device, device->ray_query_bo[i]);
-      }
-   }
 fail_alloc_device_bo:
    if (device->mem_fence_bo) {
       ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@@ -1231,17 +1191,13 @@ void anv_DestroyDevice(
       anv_scratch_pool_finish(device, &device->protected_scratch_pool);
 
    if (device->vk.enabled_extensions.KHR_ray_query) {
-      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
-         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
-            if (device->ray_query_shadow_bos[i][j] != NULL) {
-               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
-               anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
+      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
+         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
+            if (device->ray_query_bos[i][j] != NULL) {
+               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
+               anv_device_release_bo(device, device->ray_query_bos[i][j]);
             }
          }
-         if (device->ray_query_bo[i]) {
-            ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
-            anv_device_release_bo(device, device->ray_query_bo[i]);
-         }
       }
    }
    ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 5234823b94a..1dd77b9c69e 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -226,7 +226,11 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
 
 void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
 
-struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
+void genX(setup_ray_query_globals)(struct anv_device *device,
+                                   struct anv_bo* bo,
+                                   uint64_t offset,
+                                   void* map,
+                                   uint32_t num_queries);
 
 void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t total_scratch);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index a33f9e19365..741c7d41b32 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2625,22 +2625,11 @@ struct anv_device {
 
    uint32_t protected_session_id;
 
-   /** Shadow ray query BO
-    *
-    * The ray_query_bo only holds the current ray being traced. When using
-    * more than 1 ray query per thread, we cannot fit all the queries in
-    * there, so we need a another buffer to hold query data that is not
-    * currently being used by the HW for tracing, similar to a scratch space.
-    *
-    * The size of the shadow buffer depends on the number of queries per
-    * shader.
+   /** Pool of ray query buffers used to communicate with HW unit.
     *
     * We might need a buffer per queue family due to Wa_14022863161.
     */
-   struct anv_bo *ray_query_shadow_bos[2][16];
-   /** Ray query buffer used to communicated with HW unit.
-    */
-   struct anv_bo *ray_query_bo[2];
+   struct anv_bo *ray_query_bos[2][16];
 
    struct anv_shader_internal *rt_trampoline;
    struct anv_shader_internal *rt_trivial_return;
@@ -4247,10 +4236,19 @@ struct anv_push_constants {
     */
    uint32_t surfaces_base_offset;
 
-   /** Ray query globals
+   /**
+    * Pointer to ray query stacks and their associated pairs of
+    * RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
     *
-    * Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
-    * genX(cmd_buffer_ray_query_globals))
+    * The pair of globals for each query object is stored counting up from
+    * this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
+    *
+    *    rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
+    *
+    * The raytracing scratch area for each ray query is stored counting down
+    * from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
+    *
+    *    rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
     */
    uint64_t ray_query_globals;
 
@@ -4753,9 +4751,14 @@ struct anv_cmd_state {
    unsigned current_hash_scale;
 
    /**
-    * A buffer used for spill/fill of ray queries.
+    * Number of ray query buffers allocated.
     */
-   struct anv_bo * ray_query_shadow_bo;
+   uint32_t num_ray_query_globals;
+
+   /**
+    * Current array of RT_DISPATCH_GLOBALS for ray queries.
+    */
+   struct anv_address ray_query_globals;
 
    /** Pointer to the last emitted COMPUTE_WALKER.
     *
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index b7c84498bd2..ad93a8c718d 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -37,6 +37,7 @@
 #include "ds/intel_tracepoints.h"
 
 #include "genX_mi_builder.h"
+#include "nir_builder.h"
 
 void
 genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@@ -811,49 +812,35 @@ void genX(CmdDispatchIndirect)(
    genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
 }
 
-struct anv_address
-genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
+void
+genX(setup_ray_query_globals)(struct anv_device *device,
+                              struct anv_bo* bo,
+                              uint64_t offset,
+                              void* map,
+                              uint32_t num_queries)
 {
 #if GFX_VERx10 >= 125
-   struct anv_device *device = cmd_buffer->device;
-
-   struct anv_state state =
-      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
-                                           2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
-                                           BRW_RT_DISPATCH_GLOBALS_ALIGN);
-   struct brw_rt_scratch_layout layout;
-   uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
-                                       * some cases?
-                                       */
-   brw_rt_compute_scratch_layout(&layout, device->info,
-                                 stack_ids_per_dss, 1 << 10);
-
-   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
-
-   for (uint32_t i = 0; i < 2; i++) {
-      const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
-         .MemBaseAddress = (struct anv_address) {
-            /* The ray query HW computes offsets from the top of the buffer, so
-             * let the address at the end of the buffer.
-             */
-            .bo = device->ray_query_bo[idx],
-            .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
-         },
-         .AsyncRTStackSize = layout.ray_stack_stride / 64,
-         .NumDSSRTStacks = layout.stack_ids_per_dss,
-         .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
-         .Flags = RT_DEPTH_TEST_LESS_EQUAL,
-         .ResumeShaderTable = (struct anv_address) {
-            .bo = cmd_buffer->state.ray_query_shadow_bo,
-         },
-      };
-      GENX(RT_DISPATCH_GLOBALS_pack)(
-         NULL,
-         state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
-         &rtdg);
-   }
-
-   return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+   assert(num_queries > 0);
+   uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
+   for (uint32_t i = 0; i < num_queries; ++i)
+      for (uint32_t j = 0; j < 2; j++)
+         GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
+                                        (char*) map +
+                                        i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
+                                        j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+                                        &(struct GENX(RT_DISPATCH_GLOBALS)) {
+            .MemBaseAddress = (struct anv_address) {
+               /* The ray query HW computes offsets from the top of the
+                * buffer, so set the address at the end of the buffer.
+                */
+               .bo = bo,
+               .offset = offset - i * stack_stride - j * stack_stride / 2,
+            },
+            .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
+            .NumDSSRTStacks = 2048, /* TODO */
+            .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+            .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+         });
 #else
    UNREACHABLE("Not supported");
 #endif
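
To make the pairing explicit, the MemBaseAddress that each packed RT_DISPATCH_GLOBALS ends up with can be restated as the sketch below. The helper is illustrative, not part of the patch; "globals_top" stands for the offset argument passed to genX(setup_ray_query_globals), i.e. the start of the globals array within the BO.

static inline uint64_t
rtdg_mem_base_example(uint64_t globals_top, uint64_t stack_stride,
                      uint32_t query, uint32_t pair_idx /* 0 or 1 */)
{
   /* Each query owns one stack_stride-sized area directly below the globals
    * array; the second RT_DISPATCH_GLOBALS of the pair points half a stride
    * lower, which is meant to line up with the SIMD32 offset added in
    * brw_nir_rt_sync_stack_addr().
    */
   return globals_top - query * stack_stride - pair_idx * (stack_stride / 2);
}
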