tu, ir3: Implement a750 RT workaround

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28447>
This commit is contained in:
Connor Abbott 2024-05-16 16:32:03 -04:00 committed by Marge Bot
parent 967ea4bbbb
commit f3f0c5048d
12 changed files with 98 additions and 2 deletions

View file

@ -338,6 +338,9 @@ struct fd_dev_info {
* just raytracing.
*/
bool has_sw_fuse;
/* a750-specific HW bug workaround for ray tracing */
bool has_rt_workaround;
} a7xx;
};

View file

@ -911,6 +911,7 @@ a7xx_gen3 = A7XXProps(
has_primitive_shading_rate = True,
has_ray_intersection = True,
has_sw_fuse = True,
has_rt_workaround = True,
)
a730_magic_regs = dict(

View file

@ -45,6 +45,7 @@ enum fd_gpu_event : uint32_t {
FD_LRZ_FLUSH,
FD_BLIT,
FD_LABEL,
FD_DUMMY_EVENT,
FD_GPU_EVENT_MAX,
};
@ -111,6 +112,7 @@ constexpr inline struct fd_gpu_event_info fd_gpu_events<A7XX>[FD_GPU_EVENT_MAX]
{LRZ_FLUSH, false}, /* FD_LRZ_FLUSH */
{BLIT, false}, /* FD_BLIT */
{LABEL, false}, /* FD_LABEL */
{DUMMY_EVENT, false}, /* FD_DUMMY_EVENT */
};
#endif

View file

@ -59,6 +59,7 @@ struct ir3_info {
bool double_threadsize;
bool multi_dword_ldp_stp;
bool early_preamble;
bool uses_ray_intersection;
/* number of sync bits: */
uint16_t ss, sy;

View file

@ -2615,6 +2615,8 @@ emit_ray_intersection(struct ir3_context *ctx, nir_intrinsic_instr *intr,
{
struct ir3_builder *b = &ctx->build;
ctx->so->info.uses_ray_intersection = true;
struct ir3_instruction *bvh_base =
ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), 2);
struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];

View file

@ -5101,6 +5101,7 @@ to upconvert to 32b float internally?
<reg32 offset="0xa9a7" name="SP_FS_TEX_COUNT" low="0" high="7" type="uint" usage="rp_blit"/>
<reg32 offset="0xa9a8" name="SP_UNKNOWN_A9A8" low="0" high="16" usage="cmd"/> <!-- always 0x0 ? -->
<reg32 offset="0xa9a9" name="SP_FS_PVT_MEM_HW_STACK_OFFSET" type="a6xx_sp_xs_pvt_mem_hw_stack_offset" usage="rp_blit"/>
<reg32 offset="0xa9ab" name="SP_FS_UNKNOWN_A9AB" variants="A7XX-" usage="cmd"/>
<!-- TODO: unknown bool register at 0xa9aa, likely same as 0xa8c0-0xa8c3 but for FS -->

View file

@ -1821,6 +1821,30 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords)
<bitfield name="USES_GMEM" pos="4" type="boolean" varset="set_marker_mode" variants="SET_RENDER_MODE"/>
<bitfield name="IFPC_MODE" pos="0" type="a6xx_ifpc_mode" varset="set_marker_mode" variants="SET_IFPC_MODE"/>
<!--
CP_SET_MARKER is used with these bits to create a
critical section around a workaround for ray tracing.
The workaround happens after BVH building, and appears
to invalidate the RTU's BVH node cache. It makes sure
that only one of BR/BV/LPAC is executing the
workaround at a time, and no draws using RT on BV/LPAC
are executing while the workaround is executed on BR (or
vice versa, that no draws on BV/BR using RT are executed
while the workaround executes on LPAC), by
hooking subsequent CP_EVENT_WRITE/CP_DRAW_*/CP_EXEC_CS.
The blob usage is:
CP_SET_MARKER(RT_WA_START)
... workaround here ...
CP_SET_MARKER(RT_WA_END)
...
CP_SET_MARKER(SHADER_USES_RT)
CP_DRAW_INDX(...) or CP_EXEC_CS(...)
-->
<bitfield name="SHADER_USES_RT" pos="9" type="boolean" variants="A7XX-"/>
<bitfield name="RT_WA_START" pos="10" type="boolean" variants="A7XX-"/>
<bitfield name="RT_WA_END" pos="11" type="boolean" variants="A7XX-"/>
</reg32>
</domain>

View file

@ -30,6 +30,9 @@
#include "tu_acceleration_structure.h"
#include "radix_sort/radix_sort_u64.h"
#include "common/freedreno_gpu_event.h"
#include "util/u_hexdump.h"
#include "bvh/tu_build_interface.h"

View file

@ -165,6 +165,30 @@ tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
cmd->vsc_initialized = true;
}
/* This workaround, copied from the blob, seems to ensure that the BVH node
 * cache is invalidated so that we don't read stale values when multiple BVHs
 * share the same address.
 *
 * The RT_WA_START/RT_WA_END markers create a critical section around the
 * workaround so the firmware can guarantee that only one of BR/BV/LPAC
 * executes it at a time, and that no RT draws/dispatches overlap with it
 * (see the CP_SET_MARKER documentation). The register values and the number
 * of dummy events are taken verbatim from the blob; the meaning of bit 16
 * (0x10000) in SP_CS_UNKNOWN_A9BE/SP_FS_UNKNOWN_A9AB is not known — it
 * presumably triggers the RTU's BVH node cache invalidation. Do not reorder
 * or trim this sequence.
 */
static void
tu_emit_rt_workaround(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
/* Enter the firmware-managed critical section. */
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_RT_WA_START);
/* Set the unknown magic bit in both the CS and FS copies of the register,
 * fire a dummy event, then clear them again.
 */
tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(.dword = 0x10000));
tu_cs_emit_regs(cs, A7XX_SP_FS_UNKNOWN_A9AB(.dword = 0x10000));
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(.dword = 0));
tu_cs_emit_regs(cs, A7XX_SP_FS_UNKNOWN_A9AB(.dword = 0));
/* Four further dummy events, matching the blob's sequence exactly. */
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
tu_emit_event_write<A7XX>(cmd, cs, FD_DUMMY_EVENT);
/* Leave the critical section. */
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_RT_WA_END);
}
template <chip CHIP>
static void
tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
@ -216,6 +240,9 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
/* Invalidating UCHE seems to also invalidate CCHE */
!(flushes & TU_CMD_FLAG_CACHE_INVALIDATE))
tu_cs_emit_pkt7(cs, CP_CCHE_INVALIDATE, 0);
if (CHIP >= A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
cmd_buffer->device->physical_device->info->a7xx.has_rt_workaround)
tu_emit_rt_workaround(cmd_buffer, cs);
if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
@ -3858,6 +3885,12 @@ tu_flush_for_access(struct tu_cache_state *cache,
flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
}
/* Nothing writes through the RTU cache so there's no point trying to
* optimize this. Just always invalidate.
*/
if (dst_mask & TU_ACCESS_RTU_READ)
flush_bits |= TU_CMD_FLAG_RTU_INVALIDATE;
#undef DST_INCOHERENT_FLUSH
cache->flush_bits |= flush_bits;
@ -3968,6 +4001,11 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
SHADER_STAGES))
mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ;
if (gfx_read_access(flags, stages,
VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
SHADER_STAGES))
mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ | TU_ACCESS_RTU_READ;
/* Reading the AS for copying involves doing CmdDispatchIndirect with the
* copy size as a parameter, so it's read by the CP as well as a shader.
*/
@ -3975,7 +4013,8 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only
VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR,
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR))
mask |= TU_ACCESS_SYSMEM_READ;
mask |= TU_ACCESS_SYSMEM_READ | TU_ACCESS_UCHE_READ |
TU_ACCESS_CCHE_READ;
if (gfx_read_access(flags, stages,
@ -5826,6 +5865,12 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tess_params->output_lower_left));
}
if (cmd->device->physical_device->info->a7xx.has_rt_workaround &&
cmd->state.program.uses_ray_intersection) {
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_SHADER_USES_RT);
}
/* Early exit if there is nothing to emit, saves CPU cycles */
uint32_t dirty = cmd->state.dirty;
if (!dynamic_draw_state_dirty && !(dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS))
@ -6926,6 +6971,12 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
}
}
if (cmd->device->physical_device->info->a7xx.has_rt_workaround &&
shader->variant->info.uses_ray_intersection) {
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_SHADER_USES_RT);
}
if (info->indirect) {
trace_start_compute_indirect(&cmd->trace, cs, info->unaligned);

View file

@ -148,6 +148,8 @@ enum tu_cmd_access_mask {
*/
TU_ACCESS_CCHE_READ = 1 << 16,
TU_ACCESS_RTU_READ = 1 << 17,
TU_ACCESS_READ =
TU_ACCESS_UCHE_READ |
TU_ACCESS_CCU_COLOR_READ |
@ -212,6 +214,7 @@ enum tu_cmd_flush_bits {
* as it isn't necessary. Therefore, it's not included in ALL_FLUSH.
*/
TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,
TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12,
TU_CMD_FLAG_ALL_CLEAN =
TU_CMD_FLAG_CCU_CLEAN_DEPTH |
@ -234,7 +237,8 @@ enum tu_cmd_flush_bits {
* in case there was another command before the current command buffer
* that it needs to wait for.
*/
TU_CMD_FLAG_WAIT_FOR_ME,
TU_CMD_FLAG_WAIT_FOR_ME |
TU_CMD_FLAG_RTU_INVALIDATE,
};
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty

View file

@ -2241,6 +2241,9 @@ tu_emit_program_state(struct tu_cs *sub_cs,
push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
prog->shared_consts = *push_consts;
}
if (variants[i]->info.uses_ray_intersection)
prog->uses_ray_intersection = true;
}
unsigned dynamic_descriptor_offset = 0;

View file

@ -105,6 +105,7 @@ struct tu_program_state
bool writes_shading_rate;
bool reads_shading_rate;
bool accesses_smask;
bool uses_ray_intersection;
};
struct tu_pipeline_executable {