anv: use COMPUTE_WALKER post sync field to track compute work

This is more accurate than PIPE_CONTROL as it won't introduce stalls
between the compute dispatches.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Felix DeGrood <felix.j.degrood@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23131>

Author: Lionel Landwerlin
Date:   2023-05-19 17:01:23 +03:00
Commit: 521c216efc (parent ddc37cf430)

6 changed files with 154 additions and 51 deletions
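
For context, a minimal sketch of the capture-type selection this change introduces in anv_utrace_record_ts() (simplified, hypothetical names; only last_compute_walker, the capture types and the general flow come from the diff below, and the pointer is consumed here for brevity rather than after the timestamp is emitted):

#include <stdbool.h>
#include <stddef.h>

enum capture_type {
   CAPTURE_TOP_OF_PIPE,
   CAPTURE_END_OF_PIPE,
   REWRITE_COMPUTE_WALKER,   /* patch the walker's PostSync field instead */
};

struct cmd_buffer_sketch {
   void *last_compute_walker;   /* set by the compute dispatch path */
};

static enum capture_type
pick_capture_type(struct cmd_buffer_sketch *cb, bool end_of_pipe)
{
   /* Only an end-of-pipe timestamp that directly follows a compute
    * dispatch can reuse the walker's PostSync write; everything else
    * keeps the PIPE_CONTROL / MI_STORE_REGISTER_MEM paths. */
   if (end_of_pipe && cb->last_compute_walker != NULL) {
      cb->last_compute_walker = NULL;   /* consumed */
      return REWRITE_COMPUTE_WALKER;
   }
   return end_of_pipe ? CAPTURE_END_OF_PIPE : CAPTURE_TOP_OF_PIPE;
}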


@ -79,6 +79,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
{
anv_cmd_state_finish(cmd_buffer);
anv_cmd_state_init(cmd_buffer);
cmd_buffer->last_compute_walker = NULL;
}
static VkResult
@ -136,6 +138,8 @@ anv_create_cmd_buffer(struct vk_command_pool *pool,
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
cmd_buffer->generation_bt_state = ANV_STATE_NULL;
cmd_buffer->last_compute_walker = NULL;
anv_cmd_state_init(cmd_buffer);
anv_measure_init(cmd_buffer);


@ -172,7 +172,8 @@ void genX(blorp_exec)(struct blorp_batch *batch,
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
struct anv_device *device,
struct anv_address addr,
enum anv_timestamp_capture_type);
enum anv_timestamp_capture_type type,
void *data);
void genX(batch_emit_dummy_post_sync_op)(struct anv_batch *batch,
struct anv_device *device,


@ -139,7 +139,8 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
(struct anv_address) {
.bo = measure->bo,
.offset = index * sizeof(uint64_t) },
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL);
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));
@ -183,7 +184,8 @@ anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer,
(struct anv_address) {
.bo = measure->bo,
.offset = index * sizeof(uint64_t) },
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL);
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));


@ -843,6 +843,7 @@ enum anv_timestamp_capture_type {
ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE,
ANV_TIMESTAMP_CAPTURE_END_OF_PIPE,
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER,
};
struct anv_physical_device {
@ -962,7 +963,8 @@ struct anv_physical_device {
int64_t master_minor;
struct intel_query_engine_info * engine_info;
void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, enum anv_timestamp_capture_type);
void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address,
enum anv_timestamp_capture_type, void *);
struct intel_measure_device measure_device;
};
@ -2847,6 +2849,13 @@ struct anv_cmd_buffer {
*/
struct u_trace trace;
/** Pointer to the last emitted COMPUTE_WALKER.
*
* This is used to edit the instruction post emission to replace the "Post
* Sync" field for utrace timestamp emission.
*/
void *last_compute_walker;
struct {
struct anv_video_session *vid;
struct anv_video_session_params *params;
@ -4436,6 +4445,11 @@ struct anv_utrace_submit {
/* Buffer of 64bits timestamps (only used for timestamp copies) */
struct anv_bo *trace_bo;
/* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit
* timestamps)
*/
uint64_t last_full_timestamp;
/* Memcpy state tracking (only used for timestamp copies) */
struct anv_memcpy_state memcpy_state;
};


@ -29,6 +29,27 @@
#include "vulkan/runtime/vk_common_entrypoints.h"
/** Timestamp structure format */
union anv_utrace_timestamp {
/* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
* PIPE_CONTROL.
*/
uint64_t timestamp;
/* Timestamp written by COMPUTE_WALKER::PostSync
*
* Layout is described in PRMs.
* ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
*
* "The timestamp layout :
* [0] = 32b Context Timestamp Start
* [1] = 32b Global Timestamp Start
* [2] = 32b Context Timestamp End
* [3] = 32b Global Timestamp End"
*/
uint32_t compute_walker[4];
};
static uint32_t
command_buffers_count_utraces(struct anv_device *device,
uint32_t cmd_buffer_count,
@ -88,7 +109,8 @@ anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx,
.bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
to_addr, from_addr, count * sizeof(uint64_t));
to_addr, from_addr,
count * sizeof(union anv_utrace_timestamp));
}
VkResult
@ -162,6 +184,7 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
}
}
anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
u_trace_flush(&submit->ds.trace, submit, true);
@ -203,13 +226,19 @@ anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
struct anv_device *device =
container_of(utctx, struct anv_device, ds.trace_context);
uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
sizeof(union anv_utrace_timestamp);
struct anv_bo *bo = NULL;
UNUSED VkResult result =
anv_bo_pool_alloc(&device->utrace_bo_pool,
align(size_b, 4096),
align(anv_ts_size_b, 4096),
&bo);
assert(result == VK_SUCCESS);
memset(bo->map, 0, bo->size);
intel_clflush_range(bo->map, bo->size);
return bo;
}
@ -230,19 +259,30 @@ anv_utrace_record_ts(struct u_trace *ut, void *cs,
{
struct anv_device *device =
container_of(ut->utctx, struct anv_device, ds.trace_context);
struct anv_batch *batch =
cs != NULL ? cs :
&container_of(ut, struct anv_cmd_buffer, trace)->batch;
struct anv_cmd_buffer *cmd_buffer =
container_of(ut, struct anv_cmd_buffer, trace);
/* cmd_buffer is only valid if cs == NULL */
struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
struct anv_bo *bo = timestamps;
enum anv_timestamp_capture_type capture_type =
(end_of_pipe) ? ANV_TIMESTAMP_CAPTURE_END_OF_PIPE
: ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
device->physical->cmd_emit_timestamp(batch, device,
(struct anv_address) {
.bo = bo,
.offset = idx * sizeof(uint64_t) },
capture_type);
struct anv_address ts_address = (struct anv_address) {
.bo = bo,
.offset = idx * sizeof(union anv_utrace_timestamp)
};
/* Is this an end of compute trace point? */
const bool is_end_compute =
(cs == NULL && cmd_buffer->last_compute_walker != NULL && end_of_pipe);
enum anv_timestamp_capture_type capture_type = end_of_pipe ?
is_end_compute ? ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER :
ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
device->physical->cmd_emit_timestamp(batch, device, ts_address,
capture_type,
is_end_compute ?
cmd_buffer->last_compute_walker : NULL);
if (is_end_compute)
cmd_buffer->last_compute_walker = NULL;
}
static uint64_t
@ -265,13 +305,30 @@ anv_utrace_read_ts(struct u_trace_context *utctx,
assert(result == VK_SUCCESS);
}
uint64_t *ts = bo->map;
union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;
/* Don't translate the no-timestamp marker: */
if (ts[idx] == U_TRACE_NO_TIMESTAMP)
if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
return U_TRACE_NO_TIMESTAMP;
return intel_device_info_timebase_scale(device->info, ts[idx]);
/* Detect a 16-byte timestamp write */
if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
/* The timestamp written by COMPUTE_WALKER::PostSync is only 32 bits. We
 * need to rebuild the full 64 bits using the previous timestamp. We
 * assume that utrace reads the timestamps in order. The 32-bit timestamp
 * only rolls over every few minutes, so in most cases this should be
 * correct.
*/
uint64_t timestamp =
(submit->last_full_timestamp & 0xffffffff00000000) |
(uint64_t) ts[idx].compute_walker[3];
return intel_device_info_timebase_scale(device->info, timestamp);
}
submit->last_full_timestamp = ts[idx].timestamp;
return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
}
void

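A self-contained sketch of the read-back logic from the hunk above (the helper name is illustrative, not a driver function): a COMPUTE_WALKER PostSync write fills four 32-bit slots, so a non-zero dword 2 or 3 marks a walker write, and its 32-bit global end timestamp is widened with the upper half of the last full 64-bit timestamp:

#include <stdint.h>

/* Mirrors union anv_utrace_timestamp from the hunk above. */
union ts_slot {
   uint64_t timestamp;         /* full 64-bit SRM / PIPE_CONTROL write */
   uint32_t compute_walker[4]; /* ctx start, global start, ctx end, global end */
};

/* Illustrative helper: widen a 32-bit walker timestamp with the upper half
 * of the last full 64-bit value read. Assumes timestamps are read in order
 * and within one 32-bit rollover window. */
static uint64_t
read_ts_sketch(const union ts_slot *slot, uint64_t *last_full)
{
   if (slot->compute_walker[2] != 0 || slot->compute_walker[3] != 0) {
      /* 16-byte walker write: dword 3 is the 32-bit global end timestamp. */
      return (*last_full & 0xffffffff00000000ull) |
             (uint64_t)slot->compute_walker[3];
   }
   *last_full = slot->timestamp;
   return slot->timestamp;
}
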

@ -5659,37 +5659,42 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
const struct brw_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
cw.IndirectParameterEnable = indirect;
cw.PredicateEnable = predicate;
cw.SIMDSize = dispatch.simd_size / 16;
cw.IndirectDataStartAddress = comp_state->push_data.offset;
cw.IndirectDataLength = comp_state->push_data.alloc_size;
cw.LocalXMaximum = prog_data->local_size[0] - 1;
cw.LocalYMaximum = prog_data->local_size[1] - 1;
cw.LocalZMaximum = prog_data->local_size[2] - 1;
cw.ThreadGroupIDXDimension = groupCountX;
cw.ThreadGroupIDYDimension = groupCountY;
cw.ThreadGroupIDZDimension = groupCountZ;
cw.ExecutionMask = dispatch.right_mask;
cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
cmd_buffer->last_compute_walker =
anv_batch_emitn(
&cmd_buffer->batch,
GENX(COMPUTE_WALKER_length),
GENX(COMPUTE_WALKER),
.IndirectParameterEnable = indirect,
.PredicateEnable = predicate,
.SIMDSize = dispatch.simd_size / 16,
.IndirectDataStartAddress = comp_state->push_data.offset,
.IndirectDataLength = comp_state->push_data.alloc_size,
.LocalXMaximum = prog_data->local_size[0] - 1,
.LocalYMaximum = prog_data->local_size[1] - 1,
.LocalZMaximum = prog_data->local_size[2] - 1,
.ThreadGroupIDXDimension = groupCountX,
.ThreadGroupIDYDimension = groupCountY,
.ThreadGroupIDZDimension = groupCountZ,
.ExecutionMask = dispatch.right_mask,
.PostSync = {
.MOCS = anv_mocs(pipeline->base.device, NULL, 0),
},
cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
.KernelStartPointer = cs_bin->kernel.offset,
.SamplerStatePointer =
cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
.BindingTablePointer =
cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
/* Typically set to 0 to avoid prefetching on every thread dispatch. */
.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
.SharedLocalMemorySize = encode_slm_size(GFX_VER,
prog_data->base.total_shared),
.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
.NumberOfBarriers = prog_data->uses_barrier,
};
}
.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
.KernelStartPointer = cs_bin->kernel.offset,
.SamplerStatePointer = cmd_buffer->state.samplers[
MESA_SHADER_COMPUTE].offset,
.BindingTablePointer = cmd_buffer->state.binding_tables[
MESA_SHADER_COMPUTE].offset,
/* Typically set to 0 to avoid prefetching on every thread dispatch. */
.BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
.SharedLocalMemorySize = encode_slm_size(
GFX_VER, prog_data->base.total_shared),
.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
.NumberOfBarriers = prog_data->uses_barrier,
});
}
#else /* #if GFX_VERx10 >= 125 */
@ -8067,7 +8072,8 @@ VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
struct anv_device *device,
struct anv_address addr,
enum anv_timestamp_capture_type type) {
enum anv_timestamp_capture_type type,
void *data) {
switch (type) {
case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
struct mi_builder b;
@ -8077,6 +8083,7 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
}
case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
pc.PostSyncOperation = WriteTimestamp;
pc.Address = addr;
@ -8093,6 +8100,24 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
}
break;
#if GFX_VERx10 >= 125
case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
uint32_t dwords[GENX(COMPUTE_WALKER_length)];
GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
.PostSync = (struct GENX(POSTSYNC_DATA)) {
.Operation = WriteTimestamp,
.DestinationAddress = addr,
.MOCS = anv_mocs(device, NULL, 0),
},
});
for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
((uint32_t *)data)[i] |= dwords[i];
break;
}
#endif
default:
unreachable("invalid");
}
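
The rewrite case works because emit_compute_walker() above leaves the PostSync Operation and Destination Address at zero (only MOCS is programmed, identically in both packings), so OR-ing the freshly packed dwords into the already-emitted instruction adds the timestamp write without disturbing the other fields. A generic, self-contained illustration of that OR-patching pattern (arbitrary dwords, not the real COMPUTE_WALKER layout):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define INST_LEN 4 /* illustrative length in dwords, not COMPUTE_WALKER_length */

/* 'patch' was packed with every field zero except the ones to set, so a
 * bitwise OR merges them into the emitted instruction in place. */
static void
patch_instruction(uint32_t *emitted, const uint32_t *patch)
{
   for (uint32_t i = 0; i < INST_LEN; i++)
      emitted[i] |= patch[i];
}

int main(void)
{
   uint32_t emitted[INST_LEN] = { 0x7a000002, 0x00000000, 0x00000000, 0x00000000 };
   uint32_t patch[INST_LEN]   = { 0x00000000, 0x00000040, 0xdeadbeef, 0x00000000 };

   patch_instruction(emitted, patch);
   for (uint32_t i = 0; i < INST_LEN; i++)
      printf("%08" PRIx32 " ", emitted[i]);
   printf("\n");
   return 0;
}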