anv: use COMPUTE_WALKER post sync field to track compute work

This is more accurate than PIPE_CONTROL as it won't introduce stalls
between the compute dispatches.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Felix DeGrood <felix.j.degrood@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23131>
This commit is contained in:
Lionel Landwerlin 2023-05-19 17:01:23 +03:00
parent ddc37cf430
commit 521c216efc
6 changed files with 154 additions and 51 deletions

View file

@ -79,6 +79,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
{ {
anv_cmd_state_finish(cmd_buffer); anv_cmd_state_finish(cmd_buffer);
anv_cmd_state_init(cmd_buffer); anv_cmd_state_init(cmd_buffer);
cmd_buffer->last_compute_walker = NULL;
} }
static VkResult static VkResult
@ -136,6 +138,8 @@ anv_create_cmd_buffer(struct vk_command_pool *pool,
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS; cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
cmd_buffer->generation_bt_state = ANV_STATE_NULL; cmd_buffer->generation_bt_state = ANV_STATE_NULL;
cmd_buffer->last_compute_walker = NULL;
anv_cmd_state_init(cmd_buffer); anv_cmd_state_init(cmd_buffer);
anv_measure_init(cmd_buffer); anv_measure_init(cmd_buffer);

View file

@ -172,7 +172,8 @@ void genX(blorp_exec)(struct blorp_batch *batch,
void genX(cmd_emit_timestamp)(struct anv_batch *batch, void genX(cmd_emit_timestamp)(struct anv_batch *batch,
struct anv_device *device, struct anv_device *device,
struct anv_address addr, struct anv_address addr,
enum anv_timestamp_capture_type); enum anv_timestamp_capture_type type,
void *data);
void genX(batch_emit_dummy_post_sync_op)(struct anv_batch *batch, void genX(batch_emit_dummy_post_sync_op)(struct anv_batch *batch,
struct anv_device *device, struct anv_device *device,

View file

@ -139,7 +139,8 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
(struct anv_address) { (struct anv_address) {
.bo = measure->bo, .bo = measure->bo,
.offset = index * sizeof(uint64_t) }, .offset = index * sizeof(uint64_t) },
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL); ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]); struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot)); memset(snapshot, 0, sizeof(*snapshot));
@ -183,7 +184,8 @@ anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer,
(struct anv_address) { (struct anv_address) {
.bo = measure->bo, .bo = measure->bo,
.offset = index * sizeof(uint64_t) }, .offset = index * sizeof(uint64_t) },
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL); ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]); struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot)); memset(snapshot, 0, sizeof(*snapshot));

View file

@ -843,6 +843,7 @@ enum anv_timestamp_capture_type {
ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE, ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE,
ANV_TIMESTAMP_CAPTURE_END_OF_PIPE, ANV_TIMESTAMP_CAPTURE_END_OF_PIPE,
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL, ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER,
}; };
struct anv_physical_device { struct anv_physical_device {
@ -962,7 +963,8 @@ struct anv_physical_device {
int64_t master_minor; int64_t master_minor;
struct intel_query_engine_info * engine_info; struct intel_query_engine_info * engine_info;
void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, enum anv_timestamp_capture_type); void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address,
enum anv_timestamp_capture_type, void *);
struct intel_measure_device measure_device; struct intel_measure_device measure_device;
}; };
@ -2847,6 +2849,13 @@ struct anv_cmd_buffer {
*/ */
struct u_trace trace; struct u_trace trace;
/** Pointer to the last emitted COMPUTE_WALKER.
*
* This is used to edit the instruction post emission to replace the "Post
* Sync" field for utrace timestamp emission.
*/
void *last_compute_walker;
struct { struct {
struct anv_video_session *vid; struct anv_video_session *vid;
struct anv_video_session_params *params; struct anv_video_session_params *params;
@ -4436,6 +4445,11 @@ struct anv_utrace_submit {
/* Buffer of 64bits timestamps (only used for timestamp copies) */ /* Buffer of 64bits timestamps (only used for timestamp copies) */
struct anv_bo *trace_bo; struct anv_bo *trace_bo;
/* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit
* timestamps)
*/
uint64_t last_full_timestamp;
/* Memcpy state tracking (only used for timestamp copies) */ /* Memcpy state tracking (only used for timestamp copies) */
struct anv_memcpy_state memcpy_state; struct anv_memcpy_state memcpy_state;
}; };

View file

@ -29,6 +29,27 @@
#include "vulkan/runtime/vk_common_entrypoints.h" #include "vulkan/runtime/vk_common_entrypoints.h"
/** Timestamp structure format
 *
 * One slot of the utrace timestamp buffer. Both members overlay the same
 * storage, so every slot is 16 bytes (the size of the compute_walker array)
 * regardless of which mechanism wrote it. Slot offsets and copy sizes are
 * therefore computed with sizeof(union anv_utrace_timestamp).
 */
union anv_utrace_timestamp {
/* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
* PIPE_CONTROL.
*
* Only the first 8 bytes of the slot are written in this case.
*/
uint64_t timestamp;
/* Timestamp written by COMPUTE_WALKER::PostSync
*
* Layout is described in PRMs.
* ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
*
* "The timestamp layout :
* [0] = 32b Context Timestamp Start
* [1] = 32b Global Timestamp Start
* [2] = 32b Context Timestamp End
* [3] = 32b Global Timestamp End"
*
* The reader treats a non-zero [2] or [3] as the sign of a 16-byte write
* and uses [3] as the low 32 bits of the timestamp.
*/
uint32_t compute_walker[4];
};
static uint32_t static uint32_t
command_buffers_count_utraces(struct anv_device *device, command_buffers_count_utraces(struct anv_device *device,
uint32_t cmd_buffer_count, uint32_t cmd_buffer_count,
@ -88,7 +109,8 @@ anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx,
.bo = ts_to, .offset = to_offset * sizeof(uint64_t) }; .bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state, anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
to_addr, from_addr, count * sizeof(uint64_t)); to_addr, from_addr,
count * sizeof(union anv_utrace_timestamp));
} }
VkResult VkResult
@ -162,6 +184,7 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
} }
} }
anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state); anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state); anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
u_trace_flush(&submit->ds.trace, submit, true); u_trace_flush(&submit->ds.trace, submit, true);
@ -203,13 +226,19 @@ anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
struct anv_device *device = struct anv_device *device =
container_of(utctx, struct anv_device, ds.trace_context); container_of(utctx, struct anv_device, ds.trace_context);
uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
sizeof(union anv_utrace_timestamp);
struct anv_bo *bo = NULL; struct anv_bo *bo = NULL;
UNUSED VkResult result = UNUSED VkResult result =
anv_bo_pool_alloc(&device->utrace_bo_pool, anv_bo_pool_alloc(&device->utrace_bo_pool,
align(size_b, 4096), align(anv_ts_size_b, 4096),
&bo); &bo);
assert(result == VK_SUCCESS); assert(result == VK_SUCCESS);
memset(bo->map, 0, bo->size);
intel_clflush_range(bo->map, bo->size);
return bo; return bo;
} }
@ -230,19 +259,30 @@ anv_utrace_record_ts(struct u_trace *ut, void *cs,
{ {
struct anv_device *device = struct anv_device *device =
container_of(ut->utctx, struct anv_device, ds.trace_context); container_of(ut->utctx, struct anv_device, ds.trace_context);
struct anv_batch *batch = struct anv_cmd_buffer *cmd_buffer =
cs != NULL ? cs : container_of(ut, struct anv_cmd_buffer, trace);
&container_of(ut, struct anv_cmd_buffer, trace)->batch; /* cmd_buffer is only valid if cs == NULL */
struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
struct anv_bo *bo = timestamps; struct anv_bo *bo = timestamps;
enum anv_timestamp_capture_type capture_type = struct anv_address ts_address = (struct anv_address) {
(end_of_pipe) ? ANV_TIMESTAMP_CAPTURE_END_OF_PIPE .bo = bo,
: ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE; .offset = idx * sizeof(union anv_utrace_timestamp)
device->physical->cmd_emit_timestamp(batch, device, };
(struct anv_address) {
.bo = bo, /* Is this a end of compute trace point? */
.offset = idx * sizeof(uint64_t) }, const bool is_end_compute =
capture_type); (cs == NULL && cmd_buffer->last_compute_walker != NULL && end_of_pipe);
enum anv_timestamp_capture_type capture_type = end_of_pipe ?
is_end_compute ? ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER :
ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
device->physical->cmd_emit_timestamp(batch, device, ts_address,
capture_type,
is_end_compute ?
cmd_buffer->last_compute_walker : NULL);
if (is_end_compute)
cmd_buffer->last_compute_walker = NULL;
} }
static uint64_t static uint64_t
@ -265,13 +305,30 @@ anv_utrace_read_ts(struct u_trace_context *utctx,
assert(result == VK_SUCCESS); assert(result == VK_SUCCESS);
} }
uint64_t *ts = bo->map; union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;
/* Don't translate the no-timestamp marker: */ /* Don't translate the no-timestamp marker: */
if (ts[idx] == U_TRACE_NO_TIMESTAMP) if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
return U_TRACE_NO_TIMESTAMP; return U_TRACE_NO_TIMESTAMP;
return intel_device_info_timebase_scale(device->info, ts[idx]); /* Detect a 16bytes timestamp write */
if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
/* The timestamp written by COMPUTE_WALKER::PostSync only has 32 bits. We
* need to rebuild the full 64 bits using the previous timestamp. We
* assume that utrace is reading the timestamps in order. In any case, a
* 32-bit timestamp only rolls over every few minutes, so in most cases
* that should be correct.
*/
uint64_t timestamp =
(submit->last_full_timestamp & 0xffffffff00000000) |
(uint64_t) ts[idx].compute_walker[3];
return intel_device_info_timebase_scale(device->info, timestamp);
}
submit->last_full_timestamp = ts[idx].timestamp;
return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
} }
void void

View file

@ -5659,37 +5659,42 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
const struct brw_cs_dispatch_info dispatch = const struct brw_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, prog_data, NULL); brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { cmd_buffer->last_compute_walker =
cw.IndirectParameterEnable = indirect; anv_batch_emitn(
cw.PredicateEnable = predicate; &cmd_buffer->batch,
cw.SIMDSize = dispatch.simd_size / 16; GENX(COMPUTE_WALKER_length),
cw.IndirectDataStartAddress = comp_state->push_data.offset; GENX(COMPUTE_WALKER),
cw.IndirectDataLength = comp_state->push_data.alloc_size; .IndirectParameterEnable = indirect,
cw.LocalXMaximum = prog_data->local_size[0] - 1; .PredicateEnable = predicate,
cw.LocalYMaximum = prog_data->local_size[1] - 1; .SIMDSize = dispatch.simd_size / 16,
cw.LocalZMaximum = prog_data->local_size[2] - 1; .IndirectDataStartAddress = comp_state->push_data.offset,
cw.ThreadGroupIDXDimension = groupCountX; .IndirectDataLength = comp_state->push_data.alloc_size,
cw.ThreadGroupIDYDimension = groupCountY; .LocalXMaximum = prog_data->local_size[0] - 1,
cw.ThreadGroupIDZDimension = groupCountZ; .LocalYMaximum = prog_data->local_size[1] - 1,
cw.ExecutionMask = dispatch.right_mask; .LocalZMaximum = prog_data->local_size[2] - 1,
cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0); .ThreadGroupIDXDimension = groupCountX,
.ThreadGroupIDYDimension = groupCountY,
.ThreadGroupIDZDimension = groupCountZ,
.ExecutionMask = dispatch.right_mask,
.PostSync = {
.MOCS = anv_mocs(pipeline->base.device, NULL, 0),
},
cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
.KernelStartPointer = cs_bin->kernel.offset, .KernelStartPointer = cs_bin->kernel.offset,
.SamplerStatePointer = .SamplerStatePointer = cmd_buffer->state.samplers[
cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, MESA_SHADER_COMPUTE].offset,
.BindingTablePointer = .BindingTablePointer = cmd_buffer->state.binding_tables[
cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, MESA_SHADER_COMPUTE].offset,
/* Typically set to 0 to avoid prefetching on every thread dispatch. */ /* Typically set to 0 to avoid prefetching on every thread dispatch. */
.BindingTableEntryCount = devinfo->verx10 == 125 ? .BindingTableEntryCount = devinfo->verx10 == 125 ?
0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads, .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
.SharedLocalMemorySize = encode_slm_size(GFX_VER, .SharedLocalMemorySize = encode_slm_size(
prog_data->base.total_shared), GFX_VER, prog_data->base.total_shared),
.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo), .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
.NumberOfBarriers = prog_data->uses_barrier, .NumberOfBarriers = prog_data->uses_barrier,
}; });
}
} }
#else /* #if GFX_VERx10 >= 125 */ #else /* #if GFX_VERx10 >= 125 */
@ -8067,7 +8072,8 @@ VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
void genX(cmd_emit_timestamp)(struct anv_batch *batch, void genX(cmd_emit_timestamp)(struct anv_batch *batch,
struct anv_device *device, struct anv_device *device,
struct anv_address addr, struct anv_address addr,
enum anv_timestamp_capture_type type) { enum anv_timestamp_capture_type type,
void *data) {
switch (type) { switch (type) {
case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: { case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
struct mi_builder b; struct mi_builder b;
@ -8077,6 +8083,7 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
} }
case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
pc.PostSyncOperation = WriteTimestamp; pc.PostSyncOperation = WriteTimestamp;
pc.Address = addr; pc.Address = addr;
@ -8093,6 +8100,24 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
} }
break; break;
#if GFX_VERx10 >= 125
case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
uint32_t dwords[GENX(COMPUTE_WALKER_length)];
GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
.PostSync = (struct GENX(POSTSYNC_DATA)) {
.Operation = WriteTimestamp,
.DestinationAddress = addr,
.MOCS = anv_mocs(device, NULL, 0),
},
});
for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
((uint32_t *)data)[i] |= dwords[i];
break;
}
#endif
default: default:
unreachable("invalid"); unreachable("invalid");
} }