diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index fa9a02884d8..baf839a49d2 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -338,6 +338,9 @@ struct fd_dev_info {
        * just raytracing.
        */
       bool has_sw_fuse;
+
+      /* a750-specific HW bug workaround for ray tracing */
+      bool has_rt_workaround;
    } a7xx;
 };
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index aa981298f17..3885b36cf4f 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -911,6 +911,7 @@ a7xx_gen3 = A7XXProps(
     has_primitive_shading_rate = True,
     has_ray_intersection = True,
     has_sw_fuse = True,
+    has_rt_workaround = True,
 )
 
 a730_magic_regs = dict(
diff --git a/src/freedreno/common/freedreno_gpu_event.h b/src/freedreno/common/freedreno_gpu_event.h
index 96c43cb352d..3ca2a6cebd8 100644
--- a/src/freedreno/common/freedreno_gpu_event.h
+++ b/src/freedreno/common/freedreno_gpu_event.h
@@ -45,6 +45,7 @@ enum fd_gpu_event : uint32_t {
    FD_LRZ_FLUSH,
    FD_BLIT,
    FD_LABEL,
+   FD_DUMMY_EVENT,
 
    FD_GPU_EVENT_MAX,
 };
@@ -111,6 +112,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events[FD_GPU_EVENT_MAX]
    {LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */
    {BLIT, false}, /* FD_BLIT */
    {LABEL, false}, /* FD_LABEL */
+   {DUMMY_EVENT, false}, /* FD_DUMMY_EVENT */
 };
 
 #endif
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 24241fd4ea8..b962afe84c5 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -59,6 +59,7 @@ struct ir3_info {
    bool double_threadsize;
    bool multi_dword_ldp_stp;
    bool early_preamble;
+   bool uses_ray_intersection;
 
    /* number of sync bits: */
    uint16_t ss, sy;
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 5a223adb5d0..41ec1084cb4 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -2615,6 +2615,8 @@ emit_ray_intersection(struct ir3_context *ctx, nir_intrinsic_instr *intr,
 {
    struct ir3_builder *b = &ctx->build;
 
+   ctx->so->info.uses_ray_intersection = true;
+
    struct ir3_instruction *bvh_base =
       ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), 2);
    struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
diff --git a/src/freedreno/registers/adreno/a6xx.xml b/src/freedreno/registers/adreno/a6xx.xml
index 050d9ae8980..af0436a0214 100644
--- a/src/freedreno/registers/adreno/a6xx.xml
+++ b/src/freedreno/registers/adreno/a6xx.xml
@@ -5101,6 +5101,7 @@ to upconvert to 32b float internally?
+
diff --git a/src/freedreno/registers/adreno/adreno_pm4.xml b/src/freedreno/registers/adreno/adreno_pm4.xml
index 623a9a13900..7ec400770da 100644
--- a/src/freedreno/registers/adreno/adreno_pm4.xml
+++ b/src/freedreno/registers/adreno/adreno_pm4.xml
@@ -1821,6 +1821,30 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords)
+
+
+
+
+
diff --git a/src/freedreno/vulkan/tu_acceleration_structure.cc b/src/freedreno/vulkan/tu_acceleration_structure.cc
index f87337e899a..061aee59b48 100644
--- a/src/freedreno/vulkan/tu_acceleration_structure.cc
+++ b/src/freedreno/vulkan/tu_acceleration_structure.cc
@@ -30,6 +30,9 @@
 #include "tu_acceleration_structure.h"
 
 #include "radix_sort/radix_sort_u64.h"
+
+#include "common/freedreno_gpu_event.h"
+
 #include "util/u_hexdump.h"
 
 #include "bvh/tu_build_interface.h"
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index d606486adcc..d1421367144 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -165,6 +165,30 @@ tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    cmd->vsc_initialized = true;
 }
 
+/* This workaround, copied from the blob, seems to ensure that the BVH node
+ * cache is invalidated so that we don't read stale values when multiple BVHs
+ * share the same address.
+ */
+static void
+tu_emit_rt_workaround(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_RT_WA_START);
+
+   tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(.dword = 0x10000));
+   tu_cs_emit_regs(cs, A7XX_SP_FS_UNKNOWN_A9AB(.dword = 0x10000));
+   tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
+   tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(.dword = 0));
+   tu_cs_emit_regs(cs, A7XX_SP_FS_UNKNOWN_A9AB(.dword = 0));
+   tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
+   tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
+   tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
+   tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
+
+   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_RT_WA_END);
+}
+
 template <chip CHIP>
 static void
 tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
@@ -216,6 +240,9 @@
        /* Invalidating UCHE seems to also invalidate CCHE */
        !(flushes & TU_CMD_FLAG_CACHE_INVALIDATE))
       tu_cs_emit_pkt7(cs, CP_CCHE_INVALIDATE, 0);
+   if (CHIP >= A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
+       cmd_buffer->device->physical_device->info->a7xx.has_rt_workaround)
+      tu_emit_rt_workaround(cmd_buffer, cs);
    if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
    if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
@@ -3858,6 +3885,12 @@ tu_flush_for_access(struct tu_cache_state *cache,
       flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
    }
 
+   /* Nothing writes through the RTU cache so there's no point trying to
+    * optimize this. Just always invalidate.
+    */
+   if (dst_mask & TU_ACCESS_RTU_READ)
+      flush_bits |= TU_CMD_FLAG_RTU_INVALIDATE;
+
 #undef DST_INCOHERENT_FLUSH
 
    cache->flush_bits |= flush_bits;
@@ -3968,6 +4001,11 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
                        SHADER_STAGES))
       mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ;
 
+   if (gfx_read_access(flags, stages,
+                       VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
+                       SHADER_STAGES))
+      mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ | TU_ACCESS_RTU_READ;
+
    /* Reading the AS for copying involves doing CmdDispatchIndirect with the
     * copy size as a parameter, so it's read by the CP as well as a shader.
     */
@@ -3975,7 +4013,8 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
                        VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
                        VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
                        VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR))
-      mask |= TU_ACCESS_SYSMEM_READ;
+      mask |= TU_ACCESS_SYSMEM_READ | TU_ACCESS_UCHE_READ |
+              TU_ACCESS_CCHE_READ;
 
    if (gfx_read_access(flags, stages,
@@ -5826,6 +5865,12 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
                         tess_params->output_lower_left));
    }
 
+   if (cmd->device->physical_device->info->a7xx.has_rt_workaround &&
+       cmd->state.program.uses_ray_intersection) {
+      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_SHADER_USES_RT);
+   }
+
    /* Early exit if there is nothing to emit, saves CPU cycles */
    uint32_t dirty = cmd->state.dirty;
    if (!dynamic_draw_state_dirty && !(dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS))
@@ -6926,6 +6971,12 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
       }
    }
 
+   if (cmd->device->physical_device->info->a7xx.has_rt_workaround &&
+       shader->variant->info.uses_ray_intersection) {
+      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_SHADER_USES_RT);
+   }
+
    if (info->indirect) {
       trace_start_compute_indirect(&cmd->trace, cs, info->unaligned);
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index e9cd849f6ff..d7df362ddf1 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -148,6 +148,8 @@
     */
    TU_ACCESS_CCHE_READ = 1 << 16,
 
+   TU_ACCESS_RTU_READ = 1 << 17,
+
    TU_ACCESS_READ =
       TU_ACCESS_UCHE_READ |
       TU_ACCESS_CCU_COLOR_READ |
@@ -212,6 +214,7 @@
     * as it isn't necessary. Therefore, it's not included in ALL_FLUSH.
     */
    TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,
+   TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12,
 
    TU_CMD_FLAG_ALL_CLEAN =
       TU_CMD_FLAG_CCU_CLEAN_DEPTH |
@@ -234,7 +237,8 @@
     * in case there was another command before the current command buffer
     * that it needs to wait for.
     */
-      TU_CMD_FLAG_WAIT_FOR_ME,
+      TU_CMD_FLAG_WAIT_FOR_ME |
+      TU_CMD_FLAG_RTU_INVALIDATE,
 };
 
 /* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index 213024dc411..fa4d3979f79 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -2241,6 +2241,9 @@ tu_emit_program_state(struct tu_cs *sub_cs,
           push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
          prog->shared_consts = *push_consts;
       }
+
+      if (variants[i]->info.uses_ray_intersection)
+         prog->uses_ray_intersection = true;
    }
 
    unsigned dynamic_descriptor_offset = 0;
diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h
index 47a16229f67..5499b58bd38 100644
--- a/src/freedreno/vulkan/tu_pipeline.h
+++ b/src/freedreno/vulkan/tu_pipeline.h
@@ -105,6 +105,7 @@ struct tu_program_state
    bool writes_shading_rate;
    bool reads_shading_rate;
    bool accesses_smask;
+   bool uses_ray_intersection;
 };
 
 struct tu_pipeline_executable {
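
The new barrier plumbing is easiest to see from the application side. Below is
a minimal sketch, not part of the diff above: the helper name and the
`cmd_buffer` parameter are illustrative, and it assumes Vulkan 1.3 or
VK_KHR_synchronization2. The dstAccessMask here is what vk2tu_access() maps to
TU_ACCESS_RTU_READ; tu_flush_for_access() then converts that into
TU_CMD_FLAG_RTU_INVALIDATE, and tu6_emit_flushes() services it with the
RT_WA_START/RT_WA_END sequence on a750.

#include <vulkan/vulkan.h>

/* Barrier between an acceleration-structure build and a dispatch whose
 * shader performs ray queries against the freshly built BVH. */
static void
emit_as_build_to_ray_query_barrier(VkCommandBuffer cmd_buffer)
{
   const VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR,
      .srcAccessMask = VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR,
      .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
      /* This access bit is the one the patch routes to TU_ACCESS_RTU_READ. */
      .dstAccessMask = VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
   };
   const VkDependencyInfo dep_info = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   /* On a750, recording this barrier is what queues the RTU invalidation. */
   vkCmdPipelineBarrier2(cmd_buffer, &dep_info);
}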