intel/ds: rework RT tracepoints

That way we can identify individual dispatches within each step.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Michael Cheng <michael.cheng@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33684>
Author: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date: 2025-02-23 15:47:04 +02:00 (committed by Marge Bot)
commit e4f31b8744
parent 31c5c386d1
7 changed files with 70 additions and 111 deletions


@@ -73,12 +73,10 @@ enum intel_ds_tracepoint_flags {
     */
    INTEL_DS_TRACEPOINT_FLAG_END_CS = BITFIELD_BIT(1),
    /**
-    * Whether this tracepoint's timestamp is recorded on the compute pipeline
-    * or from top of pipe if there was no dispatch (useful for acceleration
-    * structure builds where the runtime might choose to not emit anything for
-    * a number of reasons).
+    * Whether this tracepoint doesn't generate a timestamp but instead repeats
+    * the last one.
     */
-   INTEL_DS_TRACEPOINT_FLAG_END_CS_OR_NOOP = BITFIELD_BIT(2),
+   INTEL_DS_TRACEPOINT_FLAG_REPEAST_LAST = BITFIELD_BIT(2),
 };
 
 /* Convert internal driver PIPE_CONTROL stall bits to intel_ds_stall_flag. */
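The semantics are worth spelling out: a repeat-last end tracepoint emits no GPU timestamp write at record time, and the reader substitutes the most recent real timestamp instead. A minimal reader-side sketch of that contract, with toy names rather than the driver's actual API (anv's real implementation is anv_utrace_read_ts further down):

#include <stdint.h>

#define DS_FLAG_REPEAT_LAST (1u << 2) /* stand-in for the real flag */

/* Toy reader: a repeat-last record yields the last decoded timestamp
 * instead of reading a fresh one from the timestamp buffer. */
static uint64_t
read_record_ts(uint32_t flags, const uint64_t *record,
               uint64_t *last_timestamp)
{
   if (flags & DS_FLAG_REPEAT_LAST)
      return *last_timestamp;   /* nothing was written for this record */

   *last_timestamp = *record;   /* remember it for repeat-last followers */
   return *record;
}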


@@ -47,7 +47,7 @@ def define_tracepoints(args):
     def begin_end_tp(name, tp_args=[], tp_struct=None, tp_print=None,
                      tp_default_enabled=True, end_pipelined=True,
-                     compute=False, maybe_compute=False,
+                     compute=False, repeat_last=False,
                      need_cs_param=False):
         global intel_default_tps
         if tp_default_enabled:
@@ -69,8 +69,8 @@ def define_tracepoints(args):
         if end_pipelined:
             if compute:
                 tp_flags.append('INTEL_DS_TRACEPOINT_FLAG_END_CS')
-            elif maybe_compute:
-                tp_flags.append('INTEL_DS_TRACEPOINT_FLAG_END_CS_OR_NOOP')
+            elif repeat_last:
+                tp_flags.append('INTEL_DS_TRACEPOINT_FLAG_REPEAST_LAST')
             else:
                 tp_flags.append('INTEL_DS_TRACEPOINT_FLAG_END_OF_PIPE')
         Tracepoint('intel_end_{0}'.format(name),
@@ -229,15 +229,14 @@ def define_tracepoints(args):
                  tp_args=[Arg(type='uint32_t', var='count', c_format='%u'),],
                  need_cs_param=True)
 
-    rt_args = [Arg(type='uint32_t', var='cs_hash', c_format='%u')]
-    begin_end_tp('as_build', tp_args=rt_args)
-    begin_end_tp('as_build_leaves', tp_args=rt_args, maybe_compute=True)
-    begin_end_tp('as_morton_generate', tp_args=rt_args, maybe_compute=True)
-    begin_end_tp('as_morton_sort', tp_args=rt_args, maybe_compute=True)
-    begin_end_tp('as_lbvh_build_internal', tp_args=rt_args, maybe_compute=True)
-    begin_end_tp('as_ploc_build_internal', tp_args=rt_args, maybe_compute=True)
-    begin_end_tp('as_encode', tp_args=rt_args, maybe_compute=True)
-    begin_end_tp('as_copy', tp_args=rt_args, maybe_compute=True)
+    begin_end_tp('as_build')
+    begin_end_tp('as_build_leaves', repeat_last=True)
+    begin_end_tp('as_morton_generate', repeat_last=True)
+    begin_end_tp('as_morton_sort', repeat_last=True)
+    begin_end_tp('as_lbvh_build_internal', repeat_last=True)
+    begin_end_tp('as_ploc_build_internal', repeat_last=True)
+    begin_end_tp('as_encode', repeat_last=True)
+    begin_end_tp('as_copy', repeat_last=True)
 
     begin_end_tp('rays',
                  tp_args=[Arg(type='uint32_t', var='group_x', c_format='%u'),
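With repeat_last replacing maybe_compute, each internal build step keeps its begin/end pair, but only a real dispatch produces a timestamp. The keyword-to-flag mapping the generator applies is small enough to model standalone; here is a toy C version (C to match the rest of the driver code; the FLAG_* names are illustrative stand-ins for the INTEL_DS_TRACEPOINT_FLAG_* values above):

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the INTEL_DS_TRACEPOINT_FLAG_* bits. */
#define FLAG_END_OF_PIPE  (1u << 0)
#define FLAG_END_CS       (1u << 1)
#define FLAG_REPEAT_LAST  (1u << 2)

/* Same selection the Python generator performs for an end tracepoint. */
static uint32_t end_tp_flag(int compute, int repeat_last)
{
   if (compute)
      return FLAG_END_CS;
   if (repeat_last)
      return FLAG_REPEAT_LAST;
   return FLAG_END_OF_PIPE;
}

int main(void)
{
   printf("as_build end flag: %u\n", end_tp_flag(0, 0));  /* end-of-pipe */
   printf("as_encode end flag: %u\n", end_tp_flag(0, 1)); /* repeat-last */
   return 0;
}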


@@ -1010,6 +1010,7 @@ enum anv_timestamp_capture_type {
    ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
    ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER,
    ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH,
+   ANV_TIMESTAMP_REPEAT_LAST,
 };
 
 struct anv_physical_device {
@@ -6532,10 +6533,13 @@ struct anv_utrace_submit {
    struct anv_state_stream general_state_stream;
 
    /* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit
-    * timestamps)
+    * timestamps); the value is not scaled to the CPU time domain.
     */
    uint64_t last_full_timestamp;
 
+   /* Last timestamp, not scaled to the CPU time domain */
+   uint64_t last_timestamp;
+
    /* Memcpy state tracking (only used for timestamp copies on render engine) */
    struct anv_memcpy_state memcpy_state;


@@ -351,11 +351,10 @@ anv_utrace_record_ts(struct u_trace *ut, void *cs,
    const bool is_end_compute =
       cs == NULL &&
       (flags & INTEL_DS_TRACEPOINT_FLAG_END_CS);
-   const bool is_end_compute_or_noop =
-      cs == NULL &&
-      (flags & INTEL_DS_TRACEPOINT_FLAG_END_CS_OR_NOOP);
 
    enum anv_timestamp_capture_type capture_type;
-   if (is_end_compute) {
+   if (flags & INTEL_DS_TRACEPOINT_FLAG_REPEAST_LAST) {
+      capture_type = ANV_TIMESTAMP_REPEAT_LAST;
+   } else if (is_end_compute) {
       assert(device->info->verx10 < 125 ||
              !is_end_compute ||
              cmd_buffer->state.last_indirect_dispatch != NULL ||
@@ -366,15 +365,6 @@
             ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH :
             ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER) :
          ANV_TIMESTAMP_CAPTURE_END_OF_PIPE;
-   } else if (is_end_compute_or_noop) {
-      capture_type =
-         device->info->verx10 >= 125 ?
-         (cmd_buffer->state.last_indirect_dispatch != NULL ?
-            ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH :
-            (cmd_buffer->state.last_compute_walker != NULL ?
-               ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER :
-               ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE)) :
-         ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
    } else {
       capture_type = (flags & INTEL_DS_TRACEPOINT_FLAG_END_CS) ?
          ANV_TIMESTAMP_CAPTURE_END_OF_PIPE :
@@ -418,6 +408,11 @@ anv_utrace_read_ts(struct u_trace_context *utctx,
       assert(result == VK_SUCCESS);
    }
 
+   if (flags & INTEL_DS_TRACEPOINT_FLAG_REPEAST_LAST) {
+      return intel_device_info_timebase_scale(device->info,
+                                              submit->last_timestamp);
+   }
+
    assert(offset_B % sizeof(union anv_utrace_timestamp) == 0);
    union anv_utrace_timestamp *ts =
       (union anv_utrace_timestamp *)(bo->map + offset_B);
@@ -426,31 +421,32 @@
    if (ts->timestamp == U_TRACE_NO_TIMESTAMP)
       return U_TRACE_NO_TIMESTAMP;
 
+   uint64_t timestamp;
+
    /* Detect a 16/32 bytes timestamp write */
    if (ts->gfx20_postsync_data[1] != 0 ||
        ts->gfx20_postsync_data[2] != 0 ||
        ts->gfx20_postsync_data[3] != 0) {
       if (device->info->ver >= 20) {
-         return intel_device_info_timebase_scale(device->info,
-                                                 ts->gfx20_postsync_data[3]);
+         timestamp = ts->gfx20_postsync_data[3];
+      } else {
+         /* The timestamp written by COMPUTE_WALKER::PostSync is only 32 bits.
+          * We need to rebuild the full 64 bits using the previous timestamp.
+          * We assume that utrace reads the timestamps in order. Anyway, a
+          * 32-bit timestamp rolls over in a few minutes, so in most cases
+          * that should be correct.
+          */
+         timestamp =
+            (submit->last_full_timestamp & 0xffffffff00000000) |
+            (uint64_t) ts->gfx125_postsync_data[3];
       }
-
-      /* The timestamp written by COMPUTE_WALKER::PostSync is only 32 bits. We
-       * need to rebuild the full 64 bits using the previous timestamp. We
-       * assume that utrace reads the timestamps in order. Anyway, a 32-bit
-       * timestamp rolls over in a few minutes, so in most cases that
-       * should be correct.
-       */
-      uint64_t timestamp =
-         (submit->last_full_timestamp & 0xffffffff00000000) |
-         (uint64_t) ts->gfx125_postsync_data[3];
-
-      return intel_device_info_timebase_scale(device->info, timestamp);
+   } else {
+      submit->last_full_timestamp = timestamp = ts->timestamp;
    }
 
-   submit->last_full_timestamp = ts->timestamp;
+   submit->last_timestamp = timestamp;
 
-   return intel_device_info_timebase_scale(device->info, ts->timestamp);
+   return intel_device_info_timebase_scale(device->info, timestamp);
 }
 static void
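The 32-bit rebuild in the gfx12.5 branch above is the subtle part, so here it is as a standalone worked example (simplified; intel_device_info_timebase_scale and the anv types are left out, and rebuild_ts is a hypothetical helper):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Splice a 32-bit COMPUTE_WALKER::PostSync value onto the upper half of
 * the last fully read 64-bit timestamp. Valid as long as timestamps are
 * read in order and no 32-bit rollover happened in between. */
static uint64_t
rebuild_ts(uint64_t last_full_timestamp, uint32_t postsync_low32)
{
   return (last_full_timestamp & 0xffffffff00000000ull) |
          (uint64_t) postsync_low32;
}

int main(void)
{
   uint64_t last = 0x0000000500001000ull; /* previous full timestamp */
   uint32_t low  = 0x00002000;            /* 32-bit walker timestamp */

   /* Prints 0x0000000500002000: upper 32 bits inherited from 'last'. */
   printf("0x%016" PRIx64 "\n", rebuild_ts(last, low));

   /* Caveat from the comment above: if the 32-bit counter wrapped since
    * 'last' was read, the result is off by 2^32 ticks. */
   return 0;
}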


@@ -72,41 +72,29 @@ static void
 end_debug_marker(VkCommandBuffer commandBuffer)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
-   struct anv_compute_pipeline *pipeline =
-      anv_pipeline_to_compute(comp_state->base.pipeline);
-   const struct brw_cs_prog_data *cs_prog_data =
-      brw_cs_prog_data_const(pipeline->cs->prog_data);
 
    cmd_buffer->state.rt.debug_marker_count--;
    switch (cmd_buffer->state.rt.debug_markers[cmd_buffer->state.rt.debug_marker_count]) {
    case VK_ACCELERATION_STRUCTURE_BUILD_STEP_TOP:
-      trace_intel_end_as_build(&cmd_buffer->trace,
-                               cs_prog_data->base.source_hash);
+      trace_intel_end_as_build(&cmd_buffer->trace);
       break;
    case VK_ACCELERATION_STRUCTURE_BUILD_STEP_BUILD_LEAVES:
-      trace_intel_end_as_build_leaves(&cmd_buffer->trace,
-                                      cs_prog_data->base.source_hash);
+      trace_intel_end_as_build_leaves(&cmd_buffer->trace);
       break;
    case VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_GENERATE:
-      trace_intel_end_as_morton_generate(&cmd_buffer->trace,
-                                         cs_prog_data->base.source_hash);
+      trace_intel_end_as_morton_generate(&cmd_buffer->trace);
       break;
    case VK_ACCELERATION_STRUCTURE_BUILD_STEP_MORTON_SORT:
-      trace_intel_end_as_morton_sort(&cmd_buffer->trace,
-                                     cs_prog_data->base.source_hash);
+      trace_intel_end_as_morton_sort(&cmd_buffer->trace);
       break;
    case VK_ACCELERATION_STRUCTURE_BUILD_STEP_LBVH_BUILD_INTERNAL:
-      trace_intel_end_as_lbvh_build_internal(&cmd_buffer->trace,
-                                             cs_prog_data->base.source_hash);
+      trace_intel_end_as_lbvh_build_internal(&cmd_buffer->trace);
       break;
    case VK_ACCELERATION_STRUCTURE_BUILD_STEP_PLOC_BUILD_INTERNAL:
-      trace_intel_end_as_ploc_build_internal(&cmd_buffer->trace,
-                                             cs_prog_data->base.source_hash);
+      trace_intel_end_as_ploc_build_internal(&cmd_buffer->trace);
       break;
    case VK_ACCELERATION_STRUCTURE_BUILD_STEP_ENCODE:
-      trace_intel_end_as_encode(&cmd_buffer->trace,
-                                cs_prog_data->base.source_hash);
+      trace_intel_end_as_encode(&cmd_buffer->trace);
       break;
    default:
       unreachable("Invalid build step");
@@ -725,12 +713,6 @@ genX(CmdCopyAccelerationStructureKHR)(
       return;
    }
 
-   ANV_FROM_HANDLE(anv_pipeline, anv_pipeline, pipeline);
-   struct anv_compute_pipeline *compute_pipeline =
-      anv_pipeline_to_compute(anv_pipeline);
-   const struct brw_cs_prog_data *cs_prog_data =
-      brw_cs_prog_data_const(compute_pipeline->cs->prog_data);
-
    struct anv_cmd_saved_state saved;
    anv_cmd_buffer_save_state(cmd_buffer,
                              ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
@@ -773,8 +755,7 @@
    anv_cmd_buffer_restore_state(cmd_buffer, &saved);
 
-   trace_intel_end_as_copy(&cmd_buffer->trace,
-                           cs_prog_data->base.source_hash);
+   trace_intel_end_as_copy(&cmd_buffer->trace);
 }
 
 void
@@ -799,12 +780,6 @@ genX(CmdCopyAccelerationStructureToMemoryKHR)(
       return;
    }
 
-   ANV_FROM_HANDLE(anv_pipeline, anv_pipeline, pipeline);
-   struct anv_compute_pipeline *compute_pipeline =
-      anv_pipeline_to_compute(anv_pipeline);
-   const struct brw_cs_prog_data *cs_prog_data =
-      brw_cs_prog_data_const(compute_pipeline->cs->prog_data);
-
    struct anv_cmd_saved_state saved;
    anv_cmd_buffer_save_state(cmd_buffer,
                              ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
@@ -851,8 +826,7 @@
    anv_cmd_buffer_restore_state(cmd_buffer, &saved);
 
-   trace_intel_end_as_copy(&cmd_buffer->trace,
-                           cs_prog_data->base.source_hash);
+   trace_intel_end_as_copy(&cmd_buffer->trace);
 }
 
 void
@@ -876,12 +850,6 @@ genX(CmdCopyMemoryToAccelerationStructureKHR)(
       return;
    }
 
-   ANV_FROM_HANDLE(anv_pipeline, anv_pipeline, pipeline);
-   struct anv_compute_pipeline *compute_pipeline =
-      anv_pipeline_to_compute(anv_pipeline);
-   const struct brw_cs_prog_data *cs_prog_data =
-      brw_cs_prog_data_const(compute_pipeline->cs->prog_data);
-
    struct anv_cmd_saved_state saved;
    anv_cmd_buffer_save_state(cmd_buffer,
                              ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
@@ -911,8 +879,7 @@
    vk_common_CmdDispatch(commandBuffer, 512, 1, 1);
 
    anv_cmd_buffer_restore_state(cmd_buffer, &saved);
 
-   trace_intel_end_as_copy(&cmd_buffer->trace,
-                           cs_prog_data->base.source_hash);
+   trace_intel_end_as_copy(&cmd_buffer->trace);
 }
 
 void


@@ -6269,6 +6269,10 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
    }
 #endif
 
+   case ANV_TIMESTAMP_REPEAT_LAST:
+      /* Noop */
+      break;
+
    default:
       unreachable("invalid");
    }


@@ -626,8 +626,7 @@ void genX(CmdDispatchBase)(
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);
 
-   if (cmd_buffer->state.rt.debug_marker_count == 0)
-      trace_intel_begin_compute(&cmd_buffer->trace);
+   trace_intel_begin_compute(&cmd_buffer->trace);
 
    cmd_buffer_flush_compute_state(cmd_buffer);
@@ -639,11 +638,9 @@
                   groupCountX, groupCountY, groupCountZ,
                   false);
 
-   if (cmd_buffer->state.rt.debug_marker_count == 0) {
-      trace_intel_end_compute(&cmd_buffer->trace,
-                              groupCountX, groupCountY, groupCountZ,
-                              prog_data->base.source_hash);
-   }
+   trace_intel_end_compute(&cmd_buffer->trace,
+                           groupCountX, groupCountY, groupCountZ,
+                           prog_data->base.source_hash);
 }
 
 static void
@@ -686,8 +683,7 @@ emit_unaligned_cs_walker(
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);
 
-   if (cmd_buffer->state.rt.debug_marker_count == 0)
-      trace_intel_begin_compute(&cmd_buffer->trace);
+   trace_intel_begin_compute(&cmd_buffer->trace);
 
    assert(!prog_data->uses_num_work_groups);
    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
@@ -700,11 +696,9 @@
                dispatch, groupCountX, groupCountY, groupCountZ);
 #endif
 
-   if (cmd_buffer->state.rt.debug_marker_count == 0) {
-      trace_intel_end_compute(&cmd_buffer->trace,
-                              groupCountX, groupCountY, groupCountZ,
-                              prog_data->base.source_hash);
-   }
+   trace_intel_end_compute(&cmd_buffer->trace,
+                           groupCountX, groupCountY, groupCountZ,
+                           prog_data->base.source_hash);
 }
 
 /*
@@ -795,8 +789,7 @@ genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
                         "compute indirect",
                         0);
 
-   if (cmd_buffer->state.rt.debug_marker_count == 0)
-      trace_intel_begin_compute_indirect(&cmd_buffer->trace);
+   trace_intel_begin_compute_indirect(&cmd_buffer->trace);
 
    cmd_buffer_flush_compute_state(cmd_buffer);
@@ -806,11 +799,9 @@ genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
    emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0,
                   0, 0, is_unaligned_size_x);
 
-   if (cmd_buffer->state.rt.debug_marker_count == 0) {
-      trace_intel_end_compute_indirect(&cmd_buffer->trace,
-                                       anv_address_utrace(indirect_addr),
-                                       prog_data->base.source_hash);
-   }
+   trace_intel_end_compute_indirect(&cmd_buffer->trace,
+                                    anv_address_utrace(indirect_addr),
+                                    prog_data->base.source_hash);
 }
 
 void genX(CmdDispatchIndirect)(