anv: actually use the COMPUTE_WALKER_BODY prepacked field

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36711>
Author: Lionel Landwerlin
Date:   2025-03-28 11:24:32 +02:00 (committed by Marge Bot)
Parent: 9c8571794a
Commit: 5a2fb0da32
4 changed files with 104 additions and 127 deletions

File 1 of 4

@@ -54,6 +54,8 @@ genX_bits_included_symbols = [
     '3DSTATE_SO_BUFFER::Stream Offset',
     '3DSTATE_CPSIZE_CONTROL_BUFFER::Surface Base Address',
     '3DSTATE_CPSIZE_CONTROL_BUFFER::Surface Pitch',
+    'COMPUTE_WALKER::body',
+    'EXECUTE_INDIRECT_DISPATCH::body',
     # structures
     'RENDER_SURFACE_STATE::Surface Base Address',
     'RENDER_SURFACE_STATE::Surface Pitch',
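
Listing a field in genX_bits_included_symbols makes the generated headers expose its per-generation offset; the hunks below consume these as GENX(COMPUTE_WALKER_body_start) and GENX(EXECUTE_INDIRECT_DISPATCH_body_start). The generated values are not shown in this diff, so the sketch below assumes one for illustration; a field's "start" is a bit offset, and dividing by 32 gives the DWord index at which the prepacked body is merged.

    #include <stdint.h>

    /* Assumed value, for illustration only; the real constant comes from the
     * generated headers. The struct change below (40-DWord COMPUTE_WALKER,
     * 39-DWord body) suggests the body starts one DWord in, i.e. at bit 32. */
    #define GFX125_COMPUTE_WALKER_body_start 32

    /* Bit offset -> DWord index at which to merge the prepacked body. */
    static const uint32_t merge_offset = GFX125_COMPUTE_WALKER_body_start / 32;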

File 2 of 4

@@ -2725,6 +2725,31 @@ _anv_combine_address(struct anv_batch *batch, void *location,
       __dst; \
    })
 
+/* Emit an instruction with fields set in the arguments of this macro and
+ * combine it with a prepacked instruction.
+ */
+#define anv_batch_emitn_merge_at(batch, n, offset, to_merge, cmd, ...) ({ \
+   void *__dst = anv_batch_emit_dwords(batch, n); \
+   if (__dst) { \
+      struct cmd __template = { \
+         __anv_cmd_header(cmd), \
+         .DWordLength = n - __anv_cmd_length_bias(cmd), \
+         __VA_ARGS__ \
+      }; \
+      uint32_t __partial[__anv_cmd_length(cmd)]; \
+      __anv_cmd_pack(cmd)(batch, __partial, &__template); \
+      for (uint32_t i = 0; i < (offset); i++) \
+         ((uint32_t *)__dst)[i] = __partial[i]; \
+      for (uint32_t i = (offset); i < n; i++) { \
+         ((uint32_t *)__dst)[i] = \
+            (to_merge)[i - (offset)] | __partial[i]; \
+      } \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(__dst, \
+                                       __anv_cmd_length(cmd) * 4)); \
+   } \
+   __dst; \
+})
+
 #define anv_batch_emit_merge(batch, cmd, pipeline, state, name) \
    for (struct cmd name = { 0 }, \
         *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd)); \
@@ -5260,7 +5285,7 @@ struct anv_compute_pipeline {
          uint32_t gpgpu_walker[15];
       } gfx9;
       struct {
-         uint32_t compute_walker[40];
+         uint32_t compute_walker_body[39];
       } gfx125;
    };
 };
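
Taken together, the macro and the struct change implement a pack-once, emit-many split: the 39 shader-invariant COMPUTE_WALKER_BODY DWords are packed when the pipeline is created, and each dispatch packs only its per-call fields and ORs the prepacked array on top, as the next two files show. A self-contained toy model of that flow, with made-up sizes, offsets, and field positions rather than the driver's real types:

    #include <stdint.h>
    #include <stdio.h>

    #define CMD_DWORDS  5
    #define BODY_START  1                    /* toy: body begins at DWord 1 */
    #define BODY_DWORDS (CMD_DWORDS - BODY_START)

    static uint32_t prepacked_body[BODY_DWORDS];

    /* "Pipeline creation": pack the shader-invariant half once. */
    static void pack_body_once(void)
    {
       prepacked_body[0] = 0x00f0;           /* toy SIMD size / walk order */
       prepacked_body[2] = 0x0a00;           /* toy SLM / barrier config   */
    }

    /* "vkCmdDispatch": pack only per-dispatch fields, then merge. */
    static void emit_dispatch(uint32_t group_count_x)
    {
       uint32_t partial[CMD_DWORDS] = { 0x7001 /* toy command header */ };
       partial[BODY_START + 1] = group_count_x;

       uint32_t cmd[CMD_DWORDS];
       for (uint32_t i = 0; i < BODY_START; i++)
          cmd[i] = partial[i];               /* header DWords: template only */
       for (uint32_t i = BODY_START; i < CMD_DWORDS; i++)
          cmd[i] = prepacked_body[i - BODY_START] | partial[i];

       for (uint32_t i = 0; i < CMD_DWORDS; i++)
          printf("DW%u = 0x%08x\n", i, cmd[i]);
    }

    int main(void)
    {
       pack_body_once();                     /* once per pipeline         */
       emit_dispatch(8);                     /* many times per cmd buffer */
       emit_dispatch(64);
       return 0;
    }

The two loops mirror anv_batch_emitn_merge_at above: DWords below the offset come purely from the freshly packed template, the rest are ORed with the prepacked body.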

File 3 of 4

@@ -354,33 +354,11 @@ compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
 #if GFX_VERx10 >= 125
 static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
-get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
-                              const struct anv_shader_bin *shader,
-                              const struct brw_cs_prog_data *prog_data,
-                              const struct intel_cs_dispatch_info *dispatch)
+get_interface_descriptor_data_tables(struct anv_cmd_buffer *cmd_buffer)
 {
-   const struct intel_device_info *devinfo = cmd_buffer->device->info;
-
    return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
-      .SamplerCount = DIV_ROUND_UP(CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
-      .KernelStartPointer = shader->kernel.offset,
       .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
       .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
-      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-      .BindingTableEntryCount = devinfo->verx10 == 125 ?
-         0 : MIN2(shader->bind_map.surface_count, 30),
-      .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
-      .ThreadGroupDispatchSize = intel_compute_threads_group_dispatch_size(dispatch->threads),
-      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
-      .PreferredSLMAllocationSize =
-         intel_compute_preferred_slm_calc_encode_size(devinfo,
-                                                      prog_data->base.total_shared,
-                                                      dispatch->group_size,
-                                                      dispatch->simd_size),
-      .NumberOfBarriers = prog_data->uses_barrier,
-#if GFX_VER >= 30
-      .RegistersPerThread = ptl_register_blocks(prog_data->base.grf_used),
-#endif
    };
 }
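
Only the binding-table and sampler-table pointers remain record-time state here; every shader-derived field moves into the prepacked body. The OR-merge is safe because genxml-style pack functions write zero bits for fields left unset, so the pipeline-packed and record-packed halves occupy disjoint bits. A toy one-DWord illustration of that property (not the real descriptor layout):

    #include <assert.h>
    #include <stdint.h>

    /* Toy "descriptor": two 16-bit fields sharing one DWord. */
    struct toy_desc { uint16_t kernel_ptr; uint16_t table_ptr; };

    static uint32_t toy_pack(struct toy_desc d)
    {
       /* Like a pack function: unset fields contribute zero bits. */
       return (uint32_t)d.kernel_ptr | ((uint32_t)d.table_ptr << 16);
    }

    int main(void)
    {
       uint32_t prepacked = toy_pack((struct toy_desc) { .kernel_ptr = 0x1234 });
       uint32_t partial   = toy_pack((struct toy_desc) { .table_ptr  = 0x5678 });

       assert((prepacked | partial) == 0x56781234);  /* halves don't collide */
       return 0;
    }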
@@ -447,7 +425,6 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
    const struct intel_cs_dispatch_info dispatch =
       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
-   const int dispatch_size = dispatch.simd_size / 16;
 
    uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
@@ -457,51 +434,29 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
    compute_update_async_threads_limit(cmd_buffer, prog_data, &dispatch);
 
-   struct GENX(COMPUTE_WALKER_BODY) body = {
-      .SIMDSize = dispatch_size,
-      /* HSD 14016252163: Use of Morton walk order (and batching using a batch
-       * size of 4) is expected to increase sampler cache hit rates by
-       * increasing sample address locality within a subslice.
-       */
-#if GFX_VER >= 30
-      .DispatchWalkOrder = prog_data->uses_sampler ?
-                           MortonWalk :
-                           LinearWalk,
-      .ThreadGroupBatchSize = prog_data->uses_sampler ? TG_BATCH_4 :
-                                                        TG_BATCH_1,
-#endif
-      .MessageSIMD = dispatch_size,
-      .GenerateLocalID = prog_data->generate_local_id != 0,
-      .EmitLocal = prog_data->generate_local_id,
-      .WalkOrder = prog_data->walk_order,
-      .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
-                    TileY32bpe : Linear,
-      .LocalXMaximum = prog_data->local_size[0] - 1,
-      .LocalYMaximum = prog_data->local_size[1] - 1,
-      .LocalZMaximum = prog_data->local_size[2] - 1,
-      .ExecutionMask = dispatch.right_mask,
-      .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
-      .InterfaceDescriptor =
-         get_interface_descriptor_data(cmd_buffer, comp_state->shader,
-                                       prog_data, &dispatch),
-      .EmitInlineParameter = prog_data->uses_inline_data,
-      .InlineData = {
-         [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff,
-         [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32,
-         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = UINT32_MAX,
-         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = indirect_addr64 & 0xffffffff,
-         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = indirect_addr64 >> 32,
-      },
-   };
-
    cmd_buffer->state.last_indirect_dispatch =
-      anv_batch_emitn(
+      anv_batch_emitn_merge_at(
          &cmd_buffer->batch,
          GENX(EXECUTE_INDIRECT_DISPATCH_length),
+         GENX(EXECUTE_INDIRECT_DISPATCH_body_start) / 32,
+         anv_pipeline_to_compute(comp_state->base.pipeline)->gfx125.compute_walker_body,
          GENX(EXECUTE_INDIRECT_DISPATCH),
          .PredicateEnable = predicate,
          .MaxCount = 1,
-         .body = body,
+         .body = {
+            .InterfaceDescriptor = get_interface_descriptor_data_tables(cmd_buffer),
+            .ExecutionMask = dispatch.right_mask,
+            .InlineData = {
+               [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff,
+               [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32,
+               [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = UINT32_MAX,
+               [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = indirect_addr64 & 0xffffffff,
+               [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = indirect_addr64 >> 32,
+            },
+            .PostSync = {
+               .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+            },
+         },
          .ArgumentBufferStartAddress = indirect_addr,
          .MOCS = anv_mocs(cmd_buffer->device,
                           indirect_addr.bo, 0),
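
Note that this call site and the COMPUTE_WALKER one below merge the same gfx125.compute_walker_body array: EXECUTE_INDIRECT_DISPATCH embeds a COMPUTE_WALKER_BODY of its own, so a single prepacked copy serves both the direct and the indirect dispatch paths, each merged at its own command's body_start offset.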
@@ -538,27 +493,11 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                               comp_state->base.push_constants_state));
 
    struct GENX(COMPUTE_WALKER_BODY) body = {
-      .SIMDSize = dispatch.simd_size / 16,
-      .MessageSIMD = dispatch.simd_size / 16,
-      .GenerateLocalID = prog_data->generate_local_id != 0,
-      .EmitLocal = prog_data->generate_local_id,
-      .WalkOrder = prog_data->walk_order,
-      .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
-                    TileY32bpe : Linear,
-      .LocalXMaximum = prog_data->local_size[0] - 1,
-      .LocalYMaximum = prog_data->local_size[1] - 1,
-      .LocalZMaximum = prog_data->local_size[2] - 1,
+      .InterfaceDescriptor = get_interface_descriptor_data_tables(cmd_buffer),
       .ThreadGroupIDXDimension = groupCountX,
       .ThreadGroupIDYDimension = groupCountY,
       .ThreadGroupIDZDimension = groupCountZ,
       .ExecutionMask = dispatch.right_mask,
-      .PostSync = {
-         .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
-      },
-      .InterfaceDescriptor =
-         get_interface_descriptor_data(cmd_buffer, comp_state->shader,
-                                       prog_data, &dispatch),
-      .EmitInlineParameter = prog_data->uses_inline_data,
       .InlineData = {
          [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff,
          [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32,
@@ -566,17 +505,17 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1],
          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2],
       },
-#if GFX_VER >= 30
-      /* HSD 14016252163 */
-      .DispatchWalkOrder = prog_data->uses_sampler ? MortonWalk : LinearWalk,
-      .ThreadGroupBatchSize = prog_data->uses_sampler ? TG_BATCH_4 : TG_BATCH_1,
-#endif
+      .PostSync = {
+         .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+      },
    };
 
    cmd_buffer->state.last_compute_walker =
-      anv_batch_emitn(
+      anv_batch_emitn_merge_at(
          &cmd_buffer->batch,
          GENX(COMPUTE_WALKER_length),
+         GENX(COMPUTE_WALKER_body_start) / 32,
+         anv_pipeline_to_compute(comp_state->base.pipeline)->gfx125.compute_walker_body,
          GENX(COMPUTE_WALKER),
          .IndirectParameterEnable = !anv_address_is_null(indirect_addr),
          .PredicateEnable = predicate,

File 4 of 4

@@ -1542,49 +1542,57 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
    const struct anv_shader_bin *shader = pipeline->cs;
 
-   struct GENX(COMPUTE_WALKER) walker = {
-      GENX(COMPUTE_WALKER_header),
-#if GFX_VERx10 == 125
-      .SystolicModeEnable = prog_data->uses_systolic,
+   struct GENX(COMPUTE_WALKER_BODY) walker = {
+      /* HSD 14016252163: Use of Morton walk order (and batching using a batch
+       * size of 4) is expected to increase sampler cache hit rates by
+       * increasing sample address locality within a subslice.
+       */
+#if GFX_VER >= 30
+      .DispatchWalkOrder = prog_data->uses_sampler ?
+                           MortonWalk :
+                           LinearWalk,
+      .ThreadGroupBatchSize = prog_data->uses_sampler ? TG_BATCH_4 :
+                                                        TG_BATCH_1,
 #endif
-      .body = {
-         .SIMDSize = dispatch.simd_size / 16,
-         .MessageSIMD = dispatch.simd_size / 16,
-         .GenerateLocalID = prog_data->generate_local_id != 0,
-         .EmitLocal = prog_data->generate_local_id,
-         .WalkOrder = prog_data->walk_order,
-         .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
-                       TileY32bpe : Linear,
-         .LocalXMaximum = prog_data->local_size[0] - 1,
-         .LocalYMaximum = prog_data->local_size[1] - 1,
-         .LocalZMaximum = prog_data->local_size[2] - 1,
-         .ExecutionMask = dispatch.right_mask,
-         .PostSync = {
-            .MOCS = anv_mocs(pipeline->base.device, NULL, 0),
-         },
-         .InterfaceDescriptor = {
-            .KernelStartPointer = shader->kernel.offset,
-            /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-            .BindingTableEntryCount = devinfo->verx10 == 125 ?
-               0 : 1 + MIN2(shader->bind_map.surface_count, 30),
-            .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
-            .ThreadGroupDispatchSize =
-               intel_compute_threads_group_dispatch_size(dispatch.threads),
-            .SharedLocalMemorySize =
-               intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
-            .PreferredSLMAllocationSize =
-               intel_compute_preferred_slm_calc_encode_size(devinfo,
-                                                            prog_data->base.total_shared,
-                                                            dispatch.group_size,
-                                                            dispatch.simd_size),
-            .NumberOfBarriers = prog_data->uses_barrier,
-         },
-         .EmitInlineParameter = prog_data->uses_inline_push_addr,
-      },
+      .SIMDSize = dispatch.simd_size / 16,
+      .MessageSIMD = dispatch.simd_size / 16,
+      .GenerateLocalID = prog_data->generate_local_id != 0,
+      .EmitLocal = prog_data->generate_local_id,
+      .WalkOrder = prog_data->walk_order,
+      .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
+                    TileY32bpe : Linear,
+      .LocalXMaximum = prog_data->local_size[0] - 1,
+      .LocalYMaximum = prog_data->local_size[1] - 1,
+      .LocalZMaximum = prog_data->local_size[2] - 1,
+      .PostSync = {
+         .MOCS = anv_mocs(pipeline->base.device, NULL, 0),
+      },
+      .InterfaceDescriptor = {
+         .KernelStartPointer = shader->kernel.offset,
+         .SamplerCount = DIV_ROUND_UP(CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
+         /* Typically set to 0 to avoid prefetching on every thread dispatch. */
+         .BindingTableEntryCount = devinfo->verx10 == 125 ?
+            0 : 1 + MIN2(shader->bind_map.surface_count, 30),
+         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+         .SharedLocalMemorySize = intel_compute_slm_encode_size(
+            GFX_VER, prog_data->base.total_shared),
+         .PreferredSLMAllocationSize = intel_compute_preferred_slm_calc_encode_size(
+            devinfo, prog_data->base.total_shared,
+            dispatch.group_size, dispatch.simd_size),
+         .NumberOfBarriers = prog_data->uses_barrier,
+#if GFX_VER >= 30
+         .RegistersPerThread = ptl_register_blocks(prog_data->base.grf_used),
+#endif
+      },
+      .EmitInlineParameter = prog_data->uses_inline_push_addr,
    };
 
-   assert(ARRAY_SIZE(pipeline->gfx125.compute_walker) >= GENX(COMPUTE_WALKER_length));
-   GENX(COMPUTE_WALKER_pack)(NULL, pipeline->gfx125.compute_walker, &walker);
+   assert(ARRAY_SIZE(pipeline->gfx125.compute_walker_body) >=
+          GENX(COMPUTE_WALKER_BODY_length));
+   GENX(COMPUTE_WALKER_BODY_pack)(NULL,
+                                  pipeline->gfx125.compute_walker_body,
+                                  &walker);
 }
 
 #else /* #if GFX_VERx10 >= 125 */
@@ -1660,7 +1668,10 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
        */
       .ThreadPreemptionDisable = true,
 #endif
+#if GFX_VERx10 >= 125
+      .ThreadGroupDispatchSize =
+         intel_compute_threads_group_dispatch_size(dispatch.threads),
+#endif
       .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
    };
 
    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,