mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 09:20:12 +01:00
anv: use COMPUTE_WALKER post sync field to track compute work
This is more accurate than PIPE_CONTROL as it won't introduce stalls between the compute dispatches. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Felix DeGrood <felix.j.degrood@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23131>
This commit is contained in:
parent
ddc37cf430
commit
521c216efc
6 changed files with 154 additions and 51 deletions
|
|
@ -79,6 +79,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
|
|||
{
|
||||
anv_cmd_state_finish(cmd_buffer);
|
||||
anv_cmd_state_init(cmd_buffer);
|
||||
|
||||
cmd_buffer->last_compute_walker = NULL;
|
||||
}
|
||||
|
||||
static VkResult
|
||||
|
|
@ -136,6 +138,8 @@ anv_create_cmd_buffer(struct vk_command_pool *pool,
|
|||
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
|
||||
cmd_buffer->generation_bt_state = ANV_STATE_NULL;
|
||||
|
||||
cmd_buffer->last_compute_walker = NULL;
|
||||
|
||||
anv_cmd_state_init(cmd_buffer);
|
||||
|
||||
anv_measure_init(cmd_buffer);
|
||||
|
|
|
|||
|
|
@ -172,7 +172,8 @@ void genX(blorp_exec)(struct blorp_batch *batch,
|
|||
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
|
||||
struct anv_device *device,
|
||||
struct anv_address addr,
|
||||
enum anv_timestamp_capture_type);
|
||||
enum anv_timestamp_capture_type type,
|
||||
void *data);
|
||||
|
||||
void genX(batch_emit_dummy_post_sync_op)(struct anv_batch *batch,
|
||||
struct anv_device *device,
|
||||
|
|
|
|||
|
|
@ -139,7 +139,8 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
|
|||
(struct anv_address) {
|
||||
.bo = measure->bo,
|
||||
.offset = index * sizeof(uint64_t) },
|
||||
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL);
|
||||
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
|
||||
NULL);
|
||||
|
||||
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
|
||||
memset(snapshot, 0, sizeof(*snapshot));
|
||||
|
|
@ -183,7 +184,8 @@ anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer,
|
|||
(struct anv_address) {
|
||||
.bo = measure->bo,
|
||||
.offset = index * sizeof(uint64_t) },
|
||||
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL);
|
||||
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
|
||||
NULL);
|
||||
|
||||
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
|
||||
memset(snapshot, 0, sizeof(*snapshot));
|
||||
|
|
|
|||
|
|
@ -843,6 +843,7 @@ enum anv_timestamp_capture_type {
|
|||
ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE,
|
||||
ANV_TIMESTAMP_CAPTURE_END_OF_PIPE,
|
||||
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
|
||||
ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER,
|
||||
};
|
||||
|
||||
struct anv_physical_device {
|
||||
|
|
@ -962,7 +963,8 @@ struct anv_physical_device {
|
|||
int64_t master_minor;
|
||||
struct intel_query_engine_info * engine_info;
|
||||
|
||||
void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, enum anv_timestamp_capture_type);
|
||||
void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address,
|
||||
enum anv_timestamp_capture_type, void *);
|
||||
struct intel_measure_device measure_device;
|
||||
};
|
||||
|
||||
|
|
@ -2847,6 +2849,13 @@ struct anv_cmd_buffer {
|
|||
*/
|
||||
struct u_trace trace;
|
||||
|
||||
/** Pointer to the last emitted COMPUTE_WALKER.
|
||||
*
|
||||
* This is used to edit the instruction post emission to replace the "Post
|
||||
* Sync" field for utrace timestamp emission.
|
||||
*/
|
||||
void *last_compute_walker;
|
||||
|
||||
struct {
|
||||
struct anv_video_session *vid;
|
||||
struct anv_video_session_params *params;
|
||||
|
|
@ -4436,6 +4445,11 @@ struct anv_utrace_submit {
|
|||
/* Buffer of 64bits timestamps (only used for timestamp copies) */
|
||||
struct anv_bo *trace_bo;
|
||||
|
||||
/* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit
|
||||
* timestamps)
|
||||
*/
|
||||
uint64_t last_full_timestamp;
|
||||
|
||||
/* Memcpy state tracking (only used for timestamp copies) */
|
||||
struct anv_memcpy_state memcpy_state;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -29,6 +29,27 @@
|
|||
|
||||
#include "vulkan/runtime/vk_common_entrypoints.h"
|
||||
|
||||
/** Timestamp structure format */
|
||||
union anv_utrace_timestamp {
|
||||
/* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
|
||||
* PIPE_CONTROL.
|
||||
*/
|
||||
uint64_t timestamp;
|
||||
|
||||
/* Timestamp written by COMPUTE_WALKER::PostSync
|
||||
*
|
||||
* Layout is described in PRMs.
|
||||
* ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
|
||||
*
|
||||
* "The timestamp layout :
|
||||
* [0] = 32b Context Timestamp Start
|
||||
* [1] = 32b Global Timestamp Start
|
||||
* [2] = 32b Context Timestamp End
|
||||
* [3] = 32b Global Timestamp End"
|
||||
*/
|
||||
uint32_t compute_walker[4];
|
||||
};
|
||||
|
||||
static uint32_t
|
||||
command_buffers_count_utraces(struct anv_device *device,
|
||||
uint32_t cmd_buffer_count,
|
||||
|
|
@ -88,7 +109,8 @@ anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx,
|
|||
.bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
|
||||
|
||||
anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
|
||||
to_addr, from_addr, count * sizeof(uint64_t));
|
||||
to_addr, from_addr,
|
||||
count * sizeof(union anv_utrace_timestamp));
|
||||
}
|
||||
|
||||
VkResult
|
||||
|
|
@ -162,6 +184,7 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
|
|||
}
|
||||
}
|
||||
anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
|
||||
|
||||
anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
|
||||
|
||||
u_trace_flush(&submit->ds.trace, submit, true);
|
||||
|
|
@ -203,13 +226,19 @@ anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
|
|||
struct anv_device *device =
|
||||
container_of(utctx, struct anv_device, ds.trace_context);
|
||||
|
||||
uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
|
||||
sizeof(union anv_utrace_timestamp);
|
||||
|
||||
struct anv_bo *bo = NULL;
|
||||
UNUSED VkResult result =
|
||||
anv_bo_pool_alloc(&device->utrace_bo_pool,
|
||||
align(size_b, 4096),
|
||||
align(anv_ts_size_b, 4096),
|
||||
&bo);
|
||||
assert(result == VK_SUCCESS);
|
||||
|
||||
memset(bo->map, 0, bo->size);
|
||||
intel_clflush_range(bo->map, bo->size);
|
||||
|
||||
return bo;
|
||||
}
|
||||
|
||||
|
|
@ -230,19 +259,30 @@ anv_utrace_record_ts(struct u_trace *ut, void *cs,
|
|||
{
|
||||
struct anv_device *device =
|
||||
container_of(ut->utctx, struct anv_device, ds.trace_context);
|
||||
struct anv_batch *batch =
|
||||
cs != NULL ? cs :
|
||||
&container_of(ut, struct anv_cmd_buffer, trace)->batch;
|
||||
struct anv_cmd_buffer *cmd_buffer =
|
||||
container_of(ut, struct anv_cmd_buffer, trace);
|
||||
/* cmd_buffer is only valid if cs == NULL */
|
||||
struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
|
||||
struct anv_bo *bo = timestamps;
|
||||
|
||||
enum anv_timestamp_capture_type capture_type =
|
||||
(end_of_pipe) ? ANV_TIMESTAMP_CAPTURE_END_OF_PIPE
|
||||
: ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
|
||||
device->physical->cmd_emit_timestamp(batch, device,
|
||||
(struct anv_address) {
|
||||
.bo = bo,
|
||||
.offset = idx * sizeof(uint64_t) },
|
||||
capture_type);
|
||||
struct anv_address ts_address = (struct anv_address) {
|
||||
.bo = bo,
|
||||
.offset = idx * sizeof(union anv_utrace_timestamp)
|
||||
};
|
||||
|
||||
/* Is this an end of compute trace point? */
|
||||
const bool is_end_compute =
|
||||
(cs == NULL && cmd_buffer->last_compute_walker != NULL && end_of_pipe);
|
||||
|
||||
enum anv_timestamp_capture_type capture_type = end_of_pipe ?
|
||||
is_end_compute ? ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER :
|
||||
ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
|
||||
device->physical->cmd_emit_timestamp(batch, device, ts_address,
|
||||
capture_type,
|
||||
is_end_compute ?
|
||||
cmd_buffer->last_compute_walker : NULL);
|
||||
if (is_end_compute)
|
||||
cmd_buffer->last_compute_walker = NULL;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
|
|
@ -265,13 +305,30 @@ anv_utrace_read_ts(struct u_trace_context *utctx,
|
|||
assert(result == VK_SUCCESS);
|
||||
}
|
||||
|
||||
uint64_t *ts = bo->map;
|
||||
union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;
|
||||
|
||||
/* Don't translate the no-timestamp marker: */
|
||||
if (ts[idx] == U_TRACE_NO_TIMESTAMP)
|
||||
if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
|
||||
return U_TRACE_NO_TIMESTAMP;
|
||||
|
||||
return intel_device_info_timebase_scale(device->info, ts[idx]);
|
||||
/* Detect a 16bytes timestamp write */
|
||||
if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
|
||||
/* The timestamp written by COMPUTE_WALKER::PostSync only has 32bits. We
|
||||
* need to rebuild the full 64bits using the previous timestamp. We
|
||||
* assume that utrace is reading the timestamp in order. Anyway
|
||||
* timestamps roll over on 32bits in a few minutes so in most cases that
|
||||
* should be correct.
|
||||
*/
|
||||
uint64_t timestamp =
|
||||
(submit->last_full_timestamp & 0xffffffff00000000) |
|
||||
(uint64_t) ts[idx].compute_walker[3];
|
||||
|
||||
return intel_device_info_timebase_scale(device->info, timestamp);
|
||||
}
|
||||
|
||||
submit->last_full_timestamp = ts[idx].timestamp;
|
||||
|
||||
return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
|||
|
|
@ -5659,37 +5659,42 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
|
|||
const struct brw_cs_dispatch_info dispatch =
|
||||
brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
|
||||
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
|
||||
cw.IndirectParameterEnable = indirect;
|
||||
cw.PredicateEnable = predicate;
|
||||
cw.SIMDSize = dispatch.simd_size / 16;
|
||||
cw.IndirectDataStartAddress = comp_state->push_data.offset;
|
||||
cw.IndirectDataLength = comp_state->push_data.alloc_size;
|
||||
cw.LocalXMaximum = prog_data->local_size[0] - 1;
|
||||
cw.LocalYMaximum = prog_data->local_size[1] - 1;
|
||||
cw.LocalZMaximum = prog_data->local_size[2] - 1;
|
||||
cw.ThreadGroupIDXDimension = groupCountX;
|
||||
cw.ThreadGroupIDYDimension = groupCountY;
|
||||
cw.ThreadGroupIDZDimension = groupCountZ;
|
||||
cw.ExecutionMask = dispatch.right_mask;
|
||||
cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
|
||||
cmd_buffer->last_compute_walker =
|
||||
anv_batch_emitn(
|
||||
&cmd_buffer->batch,
|
||||
GENX(COMPUTE_WALKER_length),
|
||||
GENX(COMPUTE_WALKER),
|
||||
.IndirectParameterEnable = indirect,
|
||||
.PredicateEnable = predicate,
|
||||
.SIMDSize = dispatch.simd_size / 16,
|
||||
.IndirectDataStartAddress = comp_state->push_data.offset,
|
||||
.IndirectDataLength = comp_state->push_data.alloc_size,
|
||||
.LocalXMaximum = prog_data->local_size[0] - 1,
|
||||
.LocalYMaximum = prog_data->local_size[1] - 1,
|
||||
.LocalZMaximum = prog_data->local_size[2] - 1,
|
||||
.ThreadGroupIDXDimension = groupCountX,
|
||||
.ThreadGroupIDYDimension = groupCountY,
|
||||
.ThreadGroupIDZDimension = groupCountZ,
|
||||
.ExecutionMask = dispatch.right_mask,
|
||||
.PostSync = {
|
||||
.MOCS = anv_mocs(pipeline->base.device, NULL, 0),
|
||||
},
|
||||
|
||||
cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
|
||||
.KernelStartPointer = cs_bin->kernel.offset,
|
||||
.SamplerStatePointer =
|
||||
cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
|
||||
.BindingTablePointer =
|
||||
cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
|
||||
/* Typically set to 0 to avoid prefetching on every thread dispatch. */
|
||||
.BindingTableEntryCount = devinfo->verx10 == 125 ?
|
||||
0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
|
||||
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
|
||||
.SharedLocalMemorySize = encode_slm_size(GFX_VER,
|
||||
prog_data->base.total_shared),
|
||||
.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
|
||||
.NumberOfBarriers = prog_data->uses_barrier,
|
||||
};
|
||||
}
|
||||
.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
|
||||
.KernelStartPointer = cs_bin->kernel.offset,
|
||||
.SamplerStatePointer = cmd_buffer->state.samplers[
|
||||
MESA_SHADER_COMPUTE].offset,
|
||||
.BindingTablePointer = cmd_buffer->state.binding_tables[
|
||||
MESA_SHADER_COMPUTE].offset,
|
||||
/* Typically set to 0 to avoid prefetching on every thread dispatch. */
|
||||
.BindingTableEntryCount = devinfo->verx10 == 125 ?
|
||||
0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
|
||||
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
|
||||
.SharedLocalMemorySize = encode_slm_size(
|
||||
GFX_VER, prog_data->base.total_shared),
|
||||
.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
|
||||
.NumberOfBarriers = prog_data->uses_barrier,
|
||||
});
|
||||
}
|
||||
|
||||
#else /* #if GFX_VERx10 >= 125 */
|
||||
|
|
@ -8067,7 +8072,8 @@ VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
|
|||
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
|
||||
struct anv_device *device,
|
||||
struct anv_address addr,
|
||||
enum anv_timestamp_capture_type type) {
|
||||
enum anv_timestamp_capture_type type,
|
||||
void *data) {
|
||||
switch (type) {
|
||||
case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
|
||||
struct mi_builder b;
|
||||
|
|
@ -8077,6 +8083,7 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
|
|||
}
|
||||
|
||||
case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
|
||||
|
||||
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
|
||||
pc.PostSyncOperation = WriteTimestamp;
|
||||
pc.Address = addr;
|
||||
|
|
@ -8093,6 +8100,24 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
|
|||
}
|
||||
break;
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
|
||||
uint32_t dwords[GENX(COMPUTE_WALKER_length)];
|
||||
|
||||
GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
|
||||
.PostSync = (struct GENX(POSTSYNC_DATA)) {
|
||||
.Operation = WriteTimestamp,
|
||||
.DestinationAddress = addr,
|
||||
.MOCS = anv_mocs(device, NULL, 0),
|
||||
},
|
||||
});
|
||||
|
||||
for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
|
||||
((uint32_t *)data)[i] |= dwords[i];
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
default:
|
||||
unreachable("invalid");
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue