mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 02:10:11 +01:00
tu, ir3: Implement a750 RT workaround
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28447>
This commit is contained in:
parent
967ea4bbbb
commit
f3f0c5048d
12 changed files with 98 additions and 2 deletions
|
|
@ -338,6 +338,9 @@ struct fd_dev_info {
|
|||
* just raytracing.
|
||||
*/
|
||||
bool has_sw_fuse;
|
||||
|
||||
/* a750-specific HW bug workaround for ray tracing */
|
||||
bool has_rt_workaround;
|
||||
} a7xx;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -911,6 +911,7 @@ a7xx_gen3 = A7XXProps(
|
|||
has_primitive_shading_rate = True,
|
||||
has_ray_intersection = True,
|
||||
has_sw_fuse = True,
|
||||
has_rt_workaround = True,
|
||||
)
|
||||
|
||||
a730_magic_regs = dict(
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ enum fd_gpu_event : uint32_t {
|
|||
FD_LRZ_FLUSH,
|
||||
FD_BLIT,
|
||||
FD_LABEL,
|
||||
FD_DUMMY_EVENT,
|
||||
|
||||
FD_GPU_EVENT_MAX,
|
||||
};
|
||||
|
|
@ -111,6 +112,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events<A7XX>[FD_GPU_EVENT_MAX]
|
|||
{LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */
|
||||
{BLIT, false}, /* FD_BLIT */
|
||||
{LABEL, false}, /* FD_LABEL */
|
||||
{DUMMY_EVENT, false}, /* FD_DUMMY_EVENT */
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -59,6 +59,7 @@ struct ir3_info {
|
|||
bool double_threadsize;
|
||||
bool multi_dword_ldp_stp;
|
||||
bool early_preamble;
|
||||
bool uses_ray_intersection;
|
||||
|
||||
/* number of sync bits: */
|
||||
uint16_t ss, sy;
|
||||
|
|
|
|||
|
|
@ -2615,6 +2615,8 @@ emit_ray_intersection(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
|||
{
|
||||
struct ir3_builder *b = &ctx->build;
|
||||
|
||||
ctx->so->info.uses_ray_intersection = true;
|
||||
|
||||
struct ir3_instruction *bvh_base =
|
||||
ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), 2);
|
||||
struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
|
||||
|
|
|
|||
|
|
@ -5101,6 +5101,7 @@ to upconvert to 32b float internally?
|
|||
<reg32 offset="0xa9a7" name="SP_FS_TEX_COUNT" low="0" high="7" type="uint" usage="rp_blit"/>
|
||||
<reg32 offset="0xa9a8" name="SP_UNKNOWN_A9A8" low="0" high="16" usage="cmd"/> <!-- always 0x0 ? -->
|
||||
<reg32 offset="0xa9a9" name="SP_FS_PVT_MEM_HW_STACK_OFFSET" type="a6xx_sp_xs_pvt_mem_hw_stack_offset" usage="rp_blit"/>
|
||||
<reg32 offset="0xa9ab" name="SP_FS_UNKNOWN_A9AB" variants="A7XX-" usage="cmd"/>
|
||||
|
||||
<!-- TODO: unknown bool register at 0xa9aa, likely same as 0xa8c0-0xa8c3 but for FS -->
|
||||
|
||||
|
|
|
|||
|
|
@ -1821,6 +1821,30 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords)
|
|||
<bitfield name="USES_GMEM" pos="4" type="boolean" varset="set_marker_mode" variants="SET_RENDER_MODE"/>
|
||||
|
||||
<bitfield name="IFPC_MODE" pos="0" type="a6xx_ifpc_mode" varset="set_marker_mode" variants="SET_IFPC_MODE"/>
|
||||
|
||||
<!--
|
||||
CP_SET_MARKER is used with these bits to create a
|
||||
critical section around a workaround for ray tracing.
|
||||
The workaround happens after BVH building, and appears
|
||||
to invalidate the RTU's BVH node cache. It makes sure
|
||||
that only one of BR/BV/LPAC is executing the
|
||||
workaround at a time, and no draws using RT on BV/LPAC
|
||||
are executing while the workaround is executed on BR (or
|
||||
vice versa, that no draws on BV/BR using RT are executed
|
||||
while the workaround executes on LPAC), by
|
||||
hooking subsequent CP_EVENT_WRITE/CP_DRAW_*/CP_EXEC_CS.
|
||||
The blob usage is:
|
||||
|
||||
CP_SET_MARKER(RT_WA_START)
|
||||
... workaround here ...
|
||||
CP_SET_MARKER(RT_WA_END)
|
||||
...
|
||||
CP_SET_MARKER(SHADER_USES_RT)
|
||||
CP_DRAW_INDX(...) or CP_EXEC_CS(...)
|
||||
-->
|
||||
<bitfield name="SHADER_USES_RT" pos="9" type="boolean" variants="A7XX-"/>
|
||||
<bitfield name="RT_WA_START" pos="10" type="boolean" variants="A7XX-"/>
|
||||
<bitfield name="RT_WA_END" pos="11" type="boolean" variants="A7XX-"/>
|
||||
</reg32>
|
||||
</domain>
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,9 @@
|
|||
#include "tu_acceleration_structure.h"
|
||||
#include "radix_sort/radix_sort_u64.h"
|
||||
|
||||
|
||||
#include "common/freedreno_gpu_event.h"
|
||||
|
||||
#include "util/u_hexdump.h"
|
||||
|
||||
#include "bvh/tu_build_interface.h"
|
||||
|
|
|
|||
|
|
@ -165,6 +165,30 @@ tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
|
|||
cmd->vsc_initialized = true;
|
||||
}
|
||||
|
||||
/* This workaround, copied from the blob, seems to ensure that the BVH node
|
||||
* cache is invalidated so that we don't read stale values when multiple BVHs
|
||||
* share the same address.
|
||||
*/
|
||||
static void
|
||||
tu_emit_rt_workaround(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
|
||||
{
|
||||
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
|
||||
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_RT_WA_START);
|
||||
|
||||
tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(.dword = 0x10000));
|
||||
tu_cs_emit_regs(cs, A7XX_SP_FS_UNKNOWN_A9AB(.dword = 0x10000));
|
||||
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
|
||||
tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(.dword = 0));
|
||||
tu_cs_emit_regs(cs, A7XX_SP_FS_UNKNOWN_A9AB(.dword = 0));
|
||||
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
|
||||
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
|
||||
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
|
||||
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
|
||||
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_RT_WA_END);
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
|
||||
|
|
@ -216,6 +240,9 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
|
|||
/* Invalidating UCHE seems to also invalidate CCHE */
|
||||
!(flushes & TU_CMD_FLAG_CACHE_INVALIDATE))
|
||||
tu_cs_emit_pkt7(cs, CP_CCHE_INVALIDATE, 0);
|
||||
if (CHIP >= A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
|
||||
cmd_buffer->device->physical_device->info->a7xx.has_rt_workaround)
|
||||
tu_emit_rt_workaround(cmd_buffer, cs);
|
||||
if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
|
||||
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
|
||||
if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
|
||||
|
|
@ -3858,6 +3885,12 @@ tu_flush_for_access(struct tu_cache_state *cache,
|
|||
flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
|
||||
}
|
||||
|
||||
/* Nothing writes through the RTU cache so there's no point trying to
|
||||
* optimize this. Just always invalidate.
|
||||
*/
|
||||
if (dst_mask & TU_ACCESS_RTU_READ)
|
||||
flush_bits |= TU_CMD_FLAG_RTU_INVALIDATE;
|
||||
|
||||
#undef DST_INCOHERENT_FLUSH
|
||||
|
||||
cache->flush_bits |= flush_bits;
|
||||
|
|
@ -3968,6 +4001,11 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
|
|||
SHADER_STAGES))
|
||||
mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ;
|
||||
|
||||
if (gfx_read_access(flags, stages,
|
||||
VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
|
||||
SHADER_STAGES))
|
||||
mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ | TU_ACCESS_RTU_READ;
|
||||
|
||||
/* Reading the AS for copying involves doing CmdDispatchIndirect with the
|
||||
* copy size as a parameter, so it's read by the CP as well as a shader.
|
||||
*/
|
||||
|
|
@ -3975,7 +4013,8 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
|
|||
VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
|
||||
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
|
||||
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR))
|
||||
mask |= TU_ACCESS_SYSMEM_READ;
|
||||
mask |= TU_ACCESS_SYSMEM_READ | TU_ACCESS_UCHE_READ |
|
||||
TU_ACCESS_CCHE_READ;
|
||||
|
||||
|
||||
if (gfx_read_access(flags, stages,
|
||||
|
|
@ -5826,6 +5865,12 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
|
|||
tess_params->output_lower_left));
|
||||
}
|
||||
|
||||
if (cmd->device->physical_device->info->a7xx.has_rt_workaround &&
|
||||
cmd->state.program.uses_ray_intersection) {
|
||||
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
|
||||
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_SHADER_USES_RT);
|
||||
}
|
||||
|
||||
/* Early exit if there is nothing to emit, saves CPU cycles */
|
||||
uint32_t dirty = cmd->state.dirty;
|
||||
if (!dynamic_draw_state_dirty && !(dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS))
|
||||
|
|
@ -6926,6 +6971,12 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
|
|||
}
|
||||
}
|
||||
|
||||
if (cmd->device->physical_device->info->a7xx.has_rt_workaround &&
|
||||
shader->variant->info.uses_ray_intersection) {
|
||||
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
|
||||
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_SHADER_USES_RT);
|
||||
}
|
||||
|
||||
if (info->indirect) {
|
||||
trace_start_compute_indirect(&cmd->trace, cs, info->unaligned);
|
||||
|
||||
|
|
|
|||
|
|
@ -148,6 +148,8 @@ enum tu_cmd_access_mask {
|
|||
*/
|
||||
TU_ACCESS_CCHE_READ = 1 << 16,
|
||||
|
||||
TU_ACCESS_RTU_READ = 1 << 17,
|
||||
|
||||
TU_ACCESS_READ =
|
||||
TU_ACCESS_UCHE_READ |
|
||||
TU_ACCESS_CCU_COLOR_READ |
|
||||
|
|
@ -212,6 +214,7 @@ enum tu_cmd_flush_bits {
|
|||
* as it isn't necessary. Therefore, it's not included in ALL_FLUSH.
|
||||
*/
|
||||
TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,
|
||||
TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12,
|
||||
|
||||
TU_CMD_FLAG_ALL_CLEAN =
|
||||
TU_CMD_FLAG_CCU_CLEAN_DEPTH |
|
||||
|
|
@ -234,7 +237,8 @@ enum tu_cmd_flush_bits {
|
|||
* in case there was another command before the current command buffer
|
||||
* that it needs to wait for.
|
||||
*/
|
||||
TU_CMD_FLAG_WAIT_FOR_ME,
|
||||
TU_CMD_FLAG_WAIT_FOR_ME |
|
||||
TU_CMD_FLAG_RTU_INVALIDATE,
|
||||
};
|
||||
|
||||
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
|
||||
|
|
|
|||
|
|
@ -2241,6 +2241,9 @@ tu_emit_program_state(struct tu_cs *sub_cs,
|
|||
push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
|
||||
prog->shared_consts = *push_consts;
|
||||
}
|
||||
|
||||
if (variants[i]->info.uses_ray_intersection)
|
||||
prog->uses_ray_intersection = true;
|
||||
}
|
||||
|
||||
unsigned dynamic_descriptor_offset = 0;
|
||||
|
|
|
|||
|
|
@ -105,6 +105,7 @@ struct tu_program_state
|
|||
bool writes_shading_rate;
|
||||
bool reads_shading_rate;
|
||||
bool accesses_smask;
|
||||
bool uses_ray_intersection;
|
||||
};
|
||||
|
||||
struct tu_pipeline_executable {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue