diff --git a/.pick_status.json b/.pick_status.json
index ff7b8093ff9..dccbe8fdfb0 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2914,7 +2914,7 @@
         "description": "Revert \"anv,brw: Allow multiple ray queries without spilling to a shadow stack\"",
         "nominated": true,
         "nomination_type": 2,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "1f1de7ebd63cbe55972e01a0e7f5509e9e232917",
         "notes": null
diff --git a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
index 8bbec94dd5b..9a9ddbd0364 100644
--- a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
+++ b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
@@ -38,10 +38,8 @@ struct lowering_state {
    struct hash_table *queries;
    uint32_t n_queries;
 
+   struct brw_nir_rt_globals_defs globals;
    nir_def *rq_globals;
-
-   uint32_t num_dss_rt_stacks;
-   uint32_t sync_stacks_stride;
 };
 
 struct brw_ray_query {
@@ -52,6 +50,12 @@ struct brw_ray_query {
 
 #define SIZEOF_QUERY_STATE (sizeof(uint32_t))
 
+static bool
+need_spill_fill(struct lowering_state *state)
+{
+   return state->n_queries > 1;
+}
+
 /**
  * This pass converts opaque RayQuery structures from SPIRV into a vec3 where
  * the first 2 elements store a global address for the query and the third
@@ -94,8 +98,10 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
                        NULL);
 }
 
+
+
 static nir_def *
-get_ray_query_stack_index(nir_builder *b,
+get_ray_query_shadow_addr(nir_builder *b,
                           nir_deref_instr *deref,
                           struct lowering_state *state,
                           nir_deref_instr **out_state_deref)
@@ -110,17 +116,35 @@ get_ray_query_shadow_addr(nir_builder *b,
 
    struct brw_ray_query *rq = entry->data;
 
-   nir_def *query_idx = nir_imm_int(b, rq->id);
+   /* Base address in the shadow memory for this ray query variable. */
+   nir_def *base_addr =
+      nir_iadd_imm(b, state->globals.resume_sbt_addr,
+                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
+
+   bool spill_fill = need_spill_fill(state);
 
    *out_state_deref = nir_build_deref_var(b, rq->internal_var);
 
+   if (!spill_fill)
+      return NULL;
+
    /* Just emit code and let constant-folding go to town */
    nir_deref_instr **p = &path.path[1];
    for (; *p; p++) {
      if ((*p)->deref_type == nir_deref_type_array) {
         nir_def *index = (*p)->arr.index.ssa;
+
+        /* Walk the same array deref on the internal state variable. */
         *out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
-        index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
-        query_idx = nir_iadd(b, query_idx, index);
+
+        /* Advance the shadow address by the array index, scaled by the
+         * amount of shadow memory one element of this array covers.
+         */
+        uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
+           brw_rt_ray_queries_shadow_stack_size(state->devinfo);
+
+        nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
+
+        base_addr = nir_iadd(b, base_addr, mul);
      } else {
         UNREACHABLE("Unsupported deref type");
      }
@@ -128,7 +152,28 @@
 
    nir_deref_path_finish(&path);
 
-   return query_idx;
+   /* Add the lane offset to the shadow memory address */
+   nir_def *lane_offset =
+      nir_imul_imm(
+         b,
+         nir_iadd(
+            b,
+            nir_imul(
+               b,
+               brw_load_btd_dss_id(b),
+               state->globals.num_dss_rt_stacks),
+            brw_nir_rt_sync_stack_id(b)),
+         BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
+
+   /* Top/bottom 16 lanes each get their own stack area */
+   lane_offset = nir_bcsel(
+      b,
+      nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
+      lane_offset,
+      nir_iadd_imm(b, lane_offset,
+                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
+
+   return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
 }
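The shadow addressing above is easier to follow flattened into scalar arithmetic. A minimal host-side sketch (illustrative values only; on the GPU every input comes from system values or the RT globals loaded by this pass):

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Host-side mirror of the NIR emitted by get_ray_query_shadow_addr().
 * Every value below is an illustrative stand-in. */
static uint64_t
shadow_query_addr(uint64_t resume_sbt_addr,   /* shadow BO base address */
                  uint64_t shadow_stack_size, /* per-query shadow area */
                  uint32_t rq_id,             /* flattened query index */
                  uint32_t dss_id,
                  uint32_t num_dss_rt_stacks,
                  uint32_t sync_stack_id,
                  uint32_t subgroup_invocation,
                  uint32_t sizeof_shadow_ray_query)
{
   /* Base of this query's shadow area. */
   uint64_t base_addr = resume_sbt_addr + shadow_stack_size * rq_id;

   /* One slot per (DSS, stack id) pair within the area. */
   uint64_t lane_offset =
      (uint64_t)(dss_id * num_dss_rt_stacks + sync_stack_id) *
      sizeof_shadow_ray_query;

   /* Top/bottom 16 lanes of a SIMD32 subgroup use separate halves. */
   if (subgroup_invocation >= 16)
      lane_offset += shadow_stack_size / 2;

   return base_addr + lane_offset;
}

int main(void)
{
   printf("0x%" PRIx64 "\n",
          shadow_query_addr(0x100000000ull, 1ull << 20, 2,
                            3, 64, 5, 17, 1024));
   return 0;
}
```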
@@ -164,6 +209,26 @@ update_trace_ctrl_level(nir_builder *b,
    }
 }
 
+static void
+fill_query(nir_builder *b,
+           nir_def *hw_stack_addr,
+           nir_def *shadow_stack_addr,
+           nir_def *ctrl)
+{
+   brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
+                         BRW_RT_SIZEOF_RAY_QUERY);
+}
+
+static void
+spill_query(nir_builder *b,
+            nir_def *hw_stack_addr,
+            nir_def *shadow_stack_addr)
+{
+   brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
+                         BRW_RT_SIZEOF_RAY_QUERY);
+}
+
+
 static void
 lower_ray_query_intrinsic(nir_builder *b,
                           nir_intrinsic_instr *intrin,
@@ -174,20 +239,12 @@ lower_ray_query_intrinsic(nir_builder *b,
 
    b->cursor = nir_instr_remove(&intrin->instr);
 
    nir_deref_instr *ctrl_level_deref;
-   nir_def *stack_index =
-      get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
-   nir_def *rq_globals_addr =
-      nir_iadd(b, state->rq_globals,
-               nir_i2i64(b, nir_amul_imm(b, stack_index,
-                                         BRW_RT_DISPATCH_GLOBALS_ALIGN)));
-   nir_def *stack_base_addr =
-      nir_isub(b, state->rq_globals,
-               nir_i2i64(b, nir_amul_imm(b, stack_index,
-                                         state->sync_stacks_stride)));
-   nir_def *stack_addr =
-      brw_nir_rt_sync_stack_addr(b, stack_base_addr,
-                                 state->num_dss_rt_stacks,
-                                 state->devinfo);
+   nir_def *shadow_stack_addr =
+      get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
+   nir_def *hw_stack_addr =
+      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
+                                 state->globals.num_dss_rt_stacks);
+   nir_def *stack_addr = shadow_stack_addr ?
+      shadow_stack_addr : hw_stack_addr;
 
    mesa_shader_stage stage = b->shader->info.stage;
    switch (intrin->intrinsic) {
@@ -256,12 +313,22 @@
        */
       brw_nir_rt_query_mark_done(b, stack_addr);
 
-      nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
+      if (shadow_stack_addr)
+         fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
+
+      /* Do not use state->rq_globals; we want a uniform value for the
+       * tracing call.
+       */
+      nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
+                          level, ctrl, .synchronous = true);
 
       struct brw_nir_rt_mem_hit_defs hit_in = {};
-      brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
+      brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
                                         state->devinfo);
 
+      if (shadow_stack_addr)
+         spill_query(b, hw_stack_addr, shadow_stack_addr);
+
       update_trace_ctrl_level(b, ctrl_level_deref,
                               NULL, NULL,
                               nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
@@ -480,12 +547,21 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
    nir_builder _b, *b = &_b;
    _b = nir_builder_at(nir_before_impl(impl));
 
-   state->rq_globals = nir_load_ray_query_global_intel(b);
+   nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
 
-   state->num_dss_rt_stacks =
-      brw_rt_ray_queries_stack_ids_per_dss(state->devinfo);
-   state->sync_stacks_stride =
-      brw_rt_ray_queries_stacks_stride(state->devinfo);
+   /* Use a different global for each group of 16 lanes (SIMD32 only). */
+   state->rq_globals = nir_bcsel(
+      b,
+      nir_iand(b,
+               nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
+               nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
+      nir_iadd_imm(
+         b, rq_globals_base,
+         align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
+      rq_globals_base);
+
+   brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
+                                state->devinfo);
 
    nir_foreach_block_safe(block, impl) {
       nir_foreach_instr_safe(instr, block) {
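When more than one query is live, each rayQueryProceedEXT() above brackets the synchronous trace with a fill (shadow to HW stack) and a spill (HW stack back to shadow). A compact host-side sketch of that protocol, with a made-up stack size standing in for BRW_RT_SIZEOF_RAY_QUERY; the real copies are brw_nir_memcpy_global() calls emitted in NIR:

```c
#include <stdint.h>
#include <string.h>

#define SIZEOF_RAY_QUERY 256 /* hypothetical stand-in for BRW_RT_SIZEOF_RAY_QUERY */

/* The single HW stack slot the ray-tracing unit operates on. */
static uint8_t hw_stack[SIZEOF_RAY_QUERY];

/* shadow -> HW, before the trace */
static void fill_query(uint8_t *hw, const uint8_t *shadow)
{
   memcpy(hw, shadow, SIZEOF_RAY_QUERY);
}

/* HW -> shadow, after the trace */
static void spill_query(uint8_t *shadow, const uint8_t *hw)
{
   memcpy(shadow, hw, SIZEOF_RAY_QUERY);
}

/* Shape of the lowered rayQueryProceedEXT(): only the HW slot is
 * visible to the RT unit, so the trace is bracketed by a fill and a
 * spill, letting several queries share one HW stack. */
static void proceed(uint8_t *shadow_slot)
{
   fill_query(hw_stack, shadow_slot);
   /* nir_trace_ray_intel(..., .synchronous = true) runs here. */
   spill_query(shadow_slot, hw_stack);
}

int main(void)
{
   uint8_t shadow[SIZEOF_RAY_QUERY] = { 0 };
   proceed(shadow);
   return 0;
}
```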
diff --git a/src/intel/compiler/brw/brw_nir_rt_builder.h b/src/intel/compiler/brw/brw_nir_rt_builder.h
index 2062b24fc7a..d66fa897e4c 100644
--- a/src/intel/compiler/brw/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw/brw_nir_rt_builder.h
@@ -178,8 +178,7 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
 static inline nir_def *
 brw_nir_rt_sync_stack_addr(nir_builder *b,
                            nir_def *base_mem_addr,
-                           uint32_t num_dss_rt_stacks,
-                           const struct intel_device_info *devinfo)
+                           nir_def *num_dss_rt_stacks)
 {
    /* Bspec 47547 (Xe) and 56936 (Xe2+) say:
     * For Ray queries (Synchronous Ray Tracing), the formula is similar but
@@ -196,29 +195,12 @@
     * NUM_SYNC_STACKID_PER_DSS instead.
     */
    nir_def *offset32 =
-      nir_imul_imm(b,
-                   nir_iadd(b,
-                            nir_imul_imm(b, brw_load_btd_dss_id(b),
-                                         num_dss_rt_stacks),
-                            nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
-                   BRW_RT_SIZEOF_RAY_QUERY);
-
-   /* StackID offset for the bottom 16 lanes in SIMD32, this must match the
-    * offset of the second base address provided by the driver through the
-    * pair of ray query RTDispatchGlobals
-    */
-   uint32_t simd32_stack_offset =
-      num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
-      intel_device_info_dual_subslice_id_bound(devinfo);
-
-   offset32 =
-      nir_bcsel(b,
-                nir_iand(b,
-                         nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
-                         nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
-                nir_iadd_imm(b, offset32, simd32_stack_offset),
-                offset32);
-
+      nir_imul(b,
+               nir_iadd(b,
+                        nir_imul(b, brw_load_btd_dss_id(b),
+                                 num_dss_rt_stacks),
+                        nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
+               nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
    return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
 }
 
@@ -318,6 +300,7 @@ struct brw_nir_rt_globals_defs {
    nir_def *launch_size;
    nir_def *call_sbt_addr;
    nir_def *call_sbt_stride;
+   nir_def *resume_sbt_addr;
 };
 
 static inline void
@@ -385,6 +368,8 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
       defs->call_sbt_stride =
          nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
                       0x1fff);
+      defs->resume_sbt_addr =
+         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
    } else {
       defs->call_sbt_addr =
          nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
                                 nir_extract_i16(b, nir_channel(b, data, 1),
                                                 nir_imm_int(b, 0)));
       defs->call_sbt_stride =
          nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
+
+      defs->resume_sbt_addr =
+         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
    }
 }
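With the SIMD32 bcsel gone from this helper, the per-lane address reduces to the Bspec formula: stacks grow downward from MemBaseAddress, one slot per (DSS, StackID) pair. A host-side sketch with illustrative numbers (the second 16-lane half is now handled by the second RT_DISPATCH_GLOBALS rather than in this function):

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Host-side rendering of brw_nir_rt_sync_stack_addr() above. Stacks
 * grow downward from MemBaseAddress; all numbers are illustrative. */
static uint64_t
sync_stack_addr(uint64_t base_mem_addr, uint32_t dss_id,
                uint32_t num_dss_rt_stacks, uint32_t sync_stack_id,
                uint32_t sizeof_ray_query)
{
   uint32_t offset32 =
      (dss_id * num_dss_rt_stacks + (sync_stack_id + 1)) * sizeof_ray_query;
   return base_mem_addr - offset32;
}

int main(void)
{
   /* e.g. DSS 3, 64 stacks per DSS, stack id 5, 4KiB per query */
   printf("0x%" PRIx64 "\n", sync_stack_addr(0x200000000ull, 3, 64, 5, 4096));
   return 0;
}
```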
diff --git a/src/intel/compiler/brw/brw_rt.h b/src/intel/compiler/brw/brw_rt.h
index 4255db0ff2b..3b4fdf4ec92 100644
--- a/src/intel/compiler/brw/brw_rt.h
+++ b/src/intel/compiler/brw/brw_rt.h
@@ -36,7 +36,7 @@ extern "C" {
 #define BRW_RT_SBT_HANDLE_SIZE 32
 
 /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
-#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
+#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
 
 /** RT_DISPATCH_GLOBALS alignment
  *
@@ -191,6 +191,10 @@ struct brw_rt_raygen_trampoline_params {
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
 
+#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
+   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
+    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
+
 #define BRW_RT_SIZEOF_HW_STACK \
    (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@@ -277,15 +281,25 @@ brw_rt_ray_queries_stack_ids_per_dss(const struct intel_device_info *devinfo)
 }
 
 static inline uint32_t
-brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
+brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
 {
-   return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
+   /* The maximum slice/subslice/EU ID can be computed from
+    * max_scratch_ids, which accounts for all the threads.
+    */
+   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
+   uint32_t max_simd_size = 32;
+   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
 }
 
 static inline uint32_t
-brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
+brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
+                                      uint32_t ray_queries)
 {
-   return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
+   /* Don't bother with a shadow stack if we only have a single query; we
+    * can write directly to the HW buffer.
+    */
+   return (ray_queries > 1 ? ray_queries : 0) *
+          brw_rt_ray_queries_shadow_stack_size(devinfo) +
+          ray_queries * 4; /* Ctrl + Level data */
 }
 
 #ifdef __cplusplus
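A worked example of the sizing above, with made-up device limits (real values come from devinfo). Note how a single query costs only the 4 bytes of Ctrl/Level state:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* Made-up stand-ins for the devinfo-derived values above. */
   const uint32_t sizeof_shadow_ray_query = 1536;
   const uint32_t max_eu_id = 1024; /* max_scratch_ids[MESA_SHADER_COMPUTE] */
   const uint32_t max_simd_size = 32;
   const uint64_t per_query =
      (uint64_t)max_eu_id * max_simd_size * sizeof_shadow_ray_query;

   for (uint32_t n = 1; n <= 4; n++) {
      /* A single query writes the HW stack directly: no shadow area,
       * just the 4 bytes of Ctrl + Level state per query. */
      uint64_t size = (n > 1 ? n : 0) * per_query + n * 4;
      printf("%" PRIu32 " queries -> %" PRIu64 " bytes\n", n, size);
   }
   return 0;
}
```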
diff --git a/src/intel/genxml/gen125_rt.xml b/src/intel/genxml/gen125_rt.xml
index 6c4298905ae..b23134ae038 100644
--- a/src/intel/genxml/gen125_rt.xml
+++ b/src/intel/genxml/gen125_rt.xml
@@ -28,6 +28,7 @@
+
diff --git a/src/intel/genxml/gen300_rt.xml b/src/intel/genxml/gen300_rt.xml
index 5861f3eacab..7b2bcff39cb 100644
--- a/src/intel/genxml/gen300_rt.xml
+++ b/src/intel/genxml/gen300_rt.xml
@@ -36,5 +36,6 @@
+
+
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 77a90224c4d..a7d817ee88b 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -442,84 +442,58 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t ray_queries,
                                    VkShaderStageFlags stages)
 {
-   if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
-      struct anv_device *device = cmd_buffer->device;
-      uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
+   struct anv_device *device = cmd_buffer->device;
+   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
 
-      unsigned bucket = util_logbase2_ceil(ray_queries);
-      assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
+   uint64_t ray_shadow_size =
+      align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
+              4096);
+   if (ray_shadow_size > 0 &&
+       (!cmd_buffer->state.ray_query_shadow_bo ||
+        cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
+      unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
+      unsigned bucket = shadow_size_log2 - 16;
+      assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
 
-      uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
-      uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
-
-      struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
+      struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
       if (bo == NULL) {
          struct anv_bo *new_bo;
-         VkResult result =
-            anv_device_alloc_bo(device, "RT queries scratch",
-                                offset + (stride << bucket), /* size */
-                                ANV_BO_ALLOC_INTERNAL |
-                                ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
-                                0, /* explicit_address */
-                                &new_bo);
-
+         VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
+                                               1 << shadow_size_log2,
+                                               ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
+                                               0, /* explicit_address */
+                                               &new_bo);
          ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
          if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return;
         }
 
-         /* Map extra space we added at end of the buffer, we will write the
-          * array of RT_DISPATCH_GLOBALS into it so we can use only a single
-          * memory address in our shaders for all stacks and globals
-          */
-         void *map;
-         result = anv_device_map_bo(device, new_bo, stride << bucket,
-                                    offset, NULL, &map);
-
-         if (result != VK_SUCCESS) {
-            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
-            anv_device_release_bo(device, new_bo);
-            anv_batch_set_error(&cmd_buffer->batch, result);
-            return;
-         }
-
-         anv_genX(device->info, setup_ray_query_globals)(device,
-                                                         new_bo,
-                                                         stride << bucket,
-                                                         map,
-                                                         1 << bucket);
-
-#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
-         if (device->physical->memory.need_flush)
-            util_flush_inval_range(map, offset);
-#endif
-
-         anv_device_unmap_bo(device, new_bo, map, offset, false);
-
-         bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
+         bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
          if (bo != NULL) {
-            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
+            ANV_DMR_BO_FREE(&device->vk.base, new_bo);
            anv_device_release_bo(device, new_bo);
         } else {
            bo = new_bo;
         }
      }
+      cmd_buffer->state.ray_query_shadow_bo = bo;
 
-      /* Add the HW buffer to the list of BO used. */
-      anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
-
-      cmd_buffer->state.ray_query_globals = (struct anv_address) {
-         .bo = bo,
-         .offset = (int64_t) (stride << bucket),
-      };
-
-      cmd_buffer->state.num_ray_query_globals = 1 << bucket;
+      /* Add the ray query buffers to the batch list. */
+      anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                            cmd_buffer->state.ray_query_shadow_bo);
    }
 
-   /* Update the push constants & mark them dirty. */
+   /* Add the HW buffer to the list of BOs used. */
+   assert(device->ray_query_bo[idx]);
+   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                         device->ray_query_bo[idx]);
+
+   /* Fill the push constants & mark them dirty. */
+   struct anv_address ray_query_globals_addr =
+      anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
    pipeline_state->push_constants.ray_query_globals =
-      anv_address_physical(cmd_buffer->state.ray_query_globals);
+      anv_address_physical(ray_query_globals_addr);
    cmd_buffer->state.push_constants_dirty |= stages;
    pipeline_state->push_constants_data_dirty = true;
 }
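The bucketing above keeps one shadow BO per power-of-two size class, starting at 64KiB. A small sketch of the same computation (util_logbase2_ceil() replaced by a local stand-in):

```c
#include <stdint.h>
#include <stdio.h>

/* Local stand-in for util_logbase2_ceil(). */
static unsigned log2_ceil(uint64_t v)
{
   unsigned l = 0;
   while ((1ull << l) < v)
      l++;
   return l;
}

/* Shadow BOs live in power-of-two buckets starting at 64KiB (1 << 16):
 * bucket 0 is 64KiB, bucket 1 is 128KiB, and so on. */
int main(void)
{
   const uint64_t sizes[] = { 4096, 100000, 5000000 };
   for (unsigned i = 0; i < 3; i++) {
      unsigned log2 = log2_ceil(sizes[i]);
      if (log2 < 16)
         log2 = 16; /* MAX2(util_logbase2_ceil(size), 16) */
      printf("size %llu -> bucket %u (BO of %llu bytes)\n",
             (unsigned long long)sizes[i], log2 - 16,
             (unsigned long long)(1ull << log2));
   }
   return 0;
}
```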
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 86bb309ec2f..6cb87a9ced8 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -341,16 +341,22 @@ VkResult anv_CreateDevice(
    ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
    VkResult result;
    struct anv_device *device;
+   bool device_has_compute_queue = false;
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
 
    /* Check requested queues and fail if we are requested to create any
    * queues with flags we don't support.
    */
-   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
+   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
      if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
         return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
 
+      const struct anv_queue_family *family =
+         &physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
+      device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
+   }
+
    device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
                        sizeof(*device), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -780,9 +786,36 @@ VkResult anv_CreateDevice(
                        device->workaround_bo->size,
                        INTEL_DEBUG_BLOCK_TYPE_FRAME);
 
+   if (device->vk.enabled_extensions.KHR_ray_query) {
+      uint32_t ray_queries_size =
+         align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
+
+      result = anv_device_alloc_bo(device, "ray queries",
+                                   ray_queries_size,
+                                   ANV_BO_ALLOC_INTERNAL,
+                                   0 /* explicit_address */,
+                                   &device->ray_query_bo[0]);
+      ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
+      if (result != VK_SUCCESS)
+         goto fail_alloc_device_bo;
+
+      /* We need a separate ray query BO for the CCS engine with
+       * Wa_14022863161.
+       */
+      if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
+          device_has_compute_queue) {
+         result = anv_device_alloc_bo(device, "ray queries",
+                                      ray_queries_size,
+                                      ANV_BO_ALLOC_INTERNAL,
+                                      0 /* explicit_address */,
+                                      &device->ray_query_bo[1]);
+         ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
+         if (result != VK_SUCCESS)
+            goto fail_ray_query_bo;
+      }
+   }
+
    result = anv_device_init_trivial_batch(device);
    if (result != VK_SUCCESS)
-      goto fail_alloc_device_bo;
+      goto fail_ray_query_bo;
 
    /* Emit the CPS states before running the initialization batch as those
    * structures are referenced.
    */
@@ -1040,6 +1073,13 @@ VkResult anv_CreateDevice(
 fail_trivial_batch:
    ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
    anv_device_release_bo(device, device->trivial_batch_bo);
+ fail_ray_query_bo:
+   for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
+      if (device->ray_query_bo[i]) {
+         ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
+         anv_device_release_bo(device, device->ray_query_bo[i]);
+      }
+   }
 fail_alloc_device_bo:
    if (device->mem_fence_bo) {
       ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@@ -1191,13 +1231,17 @@ void anv_DestroyDevice(
    anv_scratch_pool_finish(device, &device->protected_scratch_pool);
 
    if (device->vk.enabled_extensions.KHR_ray_query) {
-      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
-         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
-            if (device->ray_query_bos[i][j] != NULL) {
-               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
-               anv_device_release_bo(device, device->ray_query_bos[i][j]);
+      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
+         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
+            if (device->ray_query_shadow_bos[i][j] != NULL) {
+               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
+               anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
            }
         }
+         if (device->ray_query_bo[i]) {
+            ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
+            anv_device_release_bo(device, device->ray_query_bo[i]);
+         }
      }
    }
    ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);
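anv_get_ray_query_bo_index() is not part of this diff; presumably it picks between the two BOs along these lines (hypothetical sketch, not the driver's code):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical sketch of the selection anv_get_ray_query_bo_index()
 * presumably makes (the helper itself is not in this diff): with
 * Wa_14022863161, work submitted on a CCS (compute) engine uses the
 * second ray query BO so it never shares HW stacks with RCS work. */
static uint8_t
ray_query_bo_index(bool engine_is_compute, bool needs_wa_14022863161)
{
   return (needs_wa_14022863161 && engine_is_compute) ? 1 : 0;
}

int main(void)
{
   printf("CCS -> %u, RCS -> %u\n",
          ray_query_bo_index(true, true), ray_query_bo_index(false, true));
   return 0;
}
```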
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 1dd77b9c69e..5234823b94a 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -226,11 +226,7 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
 
 void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
 
-void genX(setup_ray_query_globals)(struct anv_device *device,
-                                   struct anv_bo* bo,
-                                   uint64_t offset,
-                                   void* map,
-                                   uint32_t num_queries);
+struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t total_scratch);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 3a8a79ea6d2..1f481ec0fa2 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2625,11 +2625,22 @@ struct anv_device {
 
    uint32_t protected_session_id;
 
-   /** Pool of ray query buffers used to communicated with HW unit.
+   /** Shadow ray query BO
+    *
+    * The ray_query_bo only holds the ray currently being traced. When using
+    * more than 1 ray query per thread, we cannot fit all the queries in
+    * there, so we need another buffer to hold query data that is not
+    * currently being used by the HW for tracing, similar to a scratch space.
+    *
+    * The size of the shadow buffer depends on the number of queries per
+    * shader.
     *
     * We might need a buffer per queue family due to Wa_14022863161.
     */
-   struct anv_bo *ray_query_bos[2][16];
+   struct anv_bo *ray_query_shadow_bos[2][16];
+
+   /** Ray query buffer used to communicate with the HW unit. */
+   struct anv_bo *ray_query_bo[2];
 
    struct anv_shader_internal *rt_trampoline;
    struct anv_shader_internal *rt_trivial_return;
@@ -4236,19 +4247,10 @@ struct anv_push_constants {
     */
    uint32_t surfaces_base_offset;
 
-   /**
-    * Pointer to ray query stacks and their associated pairs of
-    * RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
+   /** Ray query globals
-    *
-    * The pair of globals for each query object are stored counting up from
-    * this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
-    *
-    *    rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
-    *
-    * The raytracing scratch area for each ray query is stored counting down
-    * from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
-    *
-    *    rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
+    *
+    * Pointer to a pair of RT_DISPATCH_GLOBALS structures (see
+    * genX(cmd_buffer_ray_query_globals))
     */
    uint64_t ray_query_globals;
 
@@ -4751,14 +4753,9 @@ struct anv_cmd_state {
    unsigned current_hash_scale;
 
    /**
-    * Number of ray query buffers allocated.
+    * A buffer used for spill/fill of ray queries.
     */
-   uint32_t num_ray_query_globals;
-
-   /**
-    * Current array of RT_DISPATCH_GLOBALS for ray queries.
-    */
-   struct anv_address ray_query_globals;
+   struct anv_bo *ray_query_shadow_bo;
 
    /** Pointer to the last emitted COMPUTE_WALKER.
    *
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index b4f50ef51a1..64ada047dc4 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -37,7 +37,6 @@
 
 #include "ds/intel_tracepoints.h"
 #include "genX_mi_builder.h"
-#include "nir_builder.h"
 
 void
 genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@@ -812,36 +811,45 @@ void genX(CmdDispatchIndirect)(
    genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
 }
 
-void
-genX(setup_ray_query_globals)(struct anv_device *device,
-                              struct anv_bo* bo,
-                              uint64_t offset,
-                              void* map,
-                              uint32_t num_queries)
+struct anv_address
+genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
 {
 #if GFX_VERx10 >= 125
-   assert(num_queries > 0);
-   uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
-   uint32_t ids_per_dss = brw_rt_ray_queries_stack_ids_per_dss(device->info);
-   for (uint32_t i = 0; i < num_queries; ++i)
-      for (uint32_t j = 0; j < 2; j++)
-         GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
-                                        (char*) map +
-                                        i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
-                                        j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
-                                        &(struct GENX(RT_DISPATCH_GLOBALS)) {
-            .MemBaseAddress = (struct anv_address) {
-               /* The ray query HW computes offsets from the top of the
-                * buffer, so set the address at the end of the buffer.
-                */
-               .bo = bo,
-               .offset = offset - i * stack_stride - j * stack_stride / 2,
-            },
-            .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
-            .NumDSSRTStacks = ids_per_dss,
-            .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
-            .Flags = RT_DEPTH_TEST_LESS_EQUAL,
-         });
+   struct anv_device *device = cmd_buffer->device;
+
+   struct anv_state state =
+      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+                                           2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
+                                           BRW_RT_DISPATCH_GLOBALS_ALIGN);
+   uint32_t stack_ids_per_dss =
+      brw_rt_ray_queries_stack_ids_per_dss(device->info);
+
+   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
+
+   for (uint32_t i = 0; i < 2; i++) {
+      const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+         .MemBaseAddress = (struct anv_address) {
+            /* The ray query HW computes offsets from the top of the buffer,
+             * so set the address at the end of the buffer.
+             */
+            .bo = device->ray_query_bo[idx],
+            .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
+         },
+         .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
+         .NumDSSRTStacks = stack_ids_per_dss,
+         .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+         .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+         .ResumeShaderTable = (struct anv_address) {
+            .bo = cmd_buffer->state.ray_query_shadow_bo,
+         },
+      };
+      GENX(RT_DISPATCH_GLOBALS_pack)(
+         NULL,
+         state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+         &rtdg);
+   }
+
+   return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
 #else
    UNREACHABLE("Not supported");
 #endif
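Putting the pieces together, each RT_DISPATCH_GLOBALS of the pair sits 64-byte aligned in the temporary state, and its MemBaseAddress points at the middle or the end of the ray query BO (stacks grow down, one half per 16-lane group). A sketch of the offsets with a hypothetical BO size:

```c
#include <stdint.h>
#include <stdio.h>

#define RT_DISPATCH_GLOBALS_SIZE 80 /* BRW_RT_DISPATCH_GLOBALS_SIZE above */

int main(void)
{
   const uint64_t ray_query_bo_size = 64ull << 20; /* hypothetical 64MiB BO */

   for (uint32_t i = 0; i < 2; i++) {
      /* Each RT_DISPATCH_GLOBALS is packed at a 64-byte aligned offset
       * in the temporary state allocation. */
      uint64_t globals_offset =
         i * ((RT_DISPATCH_GLOBALS_SIZE + 63) & ~63u);
      /* MemBaseAddress points at the middle (i == 0, bottom 16 lanes)
       * or the end (i == 1, top 16 lanes); stacks grow down from it. */
      uint64_t mem_base_offset = (i + 1) * (ray_query_bo_size / 2);
      printf("globals[%u] at +%llu, MemBaseAddress = BO + %llu\n",
             (unsigned)i, (unsigned long long)globals_offset,
             (unsigned long long)mem_base_offset);
   }
   return 0;
}
```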