mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 13:40:16 +01:00
anv: only emit CFE_STATE when scratch space increases
On Gen12.5+, we only need to emit CFE_STATE when the required scratch space has changed, not on every pipeline binding. Also, only grow the scratch space, never shrink it. The cached CFE_STATE must be marked invalid after executing a secondary command buffer, so that the primary re-emits it. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22179>
This commit is contained in:
parent
c88de6c18c
commit
ecb709c853
5 changed files with 68 additions and 33 deletions
|
|
@ -132,6 +132,9 @@ void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buff
|
||||||
|
|
||||||
struct anv_state genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
|
struct anv_state genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
|
||||||
|
|
||||||
|
void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
|
||||||
|
uint32_t total_scratch);
|
||||||
|
|
||||||
void
|
void
|
||||||
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
|
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
|
||||||
const struct intel_l3_config *l3_config,
|
const struct intel_l3_config *l3_config,
|
||||||
|
|
|
||||||
|
|
@ -2593,6 +2593,9 @@ struct anv_cmd_compute_state {
|
||||||
struct anv_state push_data;
|
struct anv_state push_data;
|
||||||
|
|
||||||
struct anv_address num_workgroups;
|
struct anv_address num_workgroups;
|
||||||
|
|
||||||
|
uint32_t scratch_size;
|
||||||
|
bool cfe_state_valid;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct anv_cmd_ray_tracing_state {
|
struct anv_cmd_ray_tracing_state {
|
||||||
|
|
|
||||||
|
|
@ -817,6 +817,17 @@ cmd_build_acceleration_structures(
|
||||||
&data, sizeof(data));
|
&data, sizeof(data));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (anv_cmd_buffer_is_render_queue(cmd_buffer))
|
||||||
|
genX(flush_pipeline_select_gpgpu)(cmd_buffer);
|
||||||
|
|
||||||
|
/* Due to the nature of GRL and its heavy use of jumps/predication, we
|
||||||
|
* cannot tell exactly in what order the CFE_STATE we insert are going to
|
||||||
|
* be executed. So always use the largest possible size.
|
||||||
|
*/
|
||||||
|
genX(cmd_buffer_ensure_cfe_state)(
|
||||||
|
cmd_buffer,
|
||||||
|
cmd_buffer->device->physical->max_grl_scratch_size);
|
||||||
|
|
||||||
/* Round 1 : init_globals kernel */
|
/* Round 1 : init_globals kernel */
|
||||||
genX(grl_misc_batched_init_globals)(
|
genX(grl_misc_batched_init_globals)(
|
||||||
cmd_buffer,
|
cmd_buffer,
|
||||||
|
|
|
||||||
|
|
@ -3955,6 +3955,7 @@ genX(CmdExecuteCommands)(
|
||||||
primary->state.current_l3_config = NULL;
|
primary->state.current_l3_config = NULL;
|
||||||
primary->state.current_hash_scale = 0;
|
primary->state.current_hash_scale = 0;
|
||||||
primary->state.gfx.push_constant_stages = 0;
|
primary->state.gfx.push_constant_stages = 0;
|
||||||
|
primary->state.compute.cfe_state_valid = false;
|
||||||
vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
|
vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
|
||||||
|
|
||||||
/* Each of the secondary command buffers will use its own state base
|
/* Each of the secondary command buffers will use its own state base
|
||||||
|
|
@ -5492,11 +5493,56 @@ genX(CmdDrawMeshTasksIndirectCountEXT)(
|
||||||
|
|
||||||
#endif /* GFX_VERx10 >= 125 */
|
#endif /* GFX_VERx10 >= 125 */
|
||||||
|
|
||||||
|
void
|
||||||
|
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
|
||||||
|
uint32_t total_scratch)
|
||||||
|
{
|
||||||
|
#if GFX_VERx10 >= 125
|
||||||
|
assert(cmd_buffer->state.current_pipeline == GPGPU);
|
||||||
|
|
||||||
|
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
|
||||||
|
|
||||||
|
if (comp_state->cfe_state_valid &&
|
||||||
|
total_scratch <= comp_state->scratch_size)
|
||||||
|
return;
|
||||||
|
|
||||||
|
const struct intel_device_info *devinfo = cmd_buffer->device->info;
|
||||||
|
anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
|
||||||
|
const uint32_t subslices = MAX2(devinfo->subslice_total, 1);
|
||||||
|
cfe.MaximumNumberofThreads =
|
||||||
|
devinfo->max_cs_threads * subslices - 1;
|
||||||
|
|
||||||
|
uint32_t scratch_surf = 0xffffffff;
|
||||||
|
if (total_scratch > 0) {
|
||||||
|
struct anv_bo *scratch_bo =
|
||||||
|
anv_scratch_pool_alloc(cmd_buffer->device,
|
||||||
|
&cmd_buffer->device->scratch_pool,
|
||||||
|
MESA_SHADER_COMPUTE,
|
||||||
|
total_scratch);
|
||||||
|
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
|
||||||
|
cmd_buffer->batch.alloc,
|
||||||
|
scratch_bo);
|
||||||
|
scratch_surf =
|
||||||
|
anv_scratch_pool_get_surf(cmd_buffer->device,
|
||||||
|
&cmd_buffer->device->scratch_pool,
|
||||||
|
total_scratch);
|
||||||
|
cfe.ScratchSpaceBuffer = scratch_surf >> 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
comp_state->scratch_size = total_scratch;
|
||||||
|
comp_state->cfe_state_valid = true;
|
||||||
|
#else
|
||||||
|
unreachable("Invalid call");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
|
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
|
||||||
{
|
{
|
||||||
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
|
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
|
||||||
struct anv_compute_pipeline *pipeline = comp_state->pipeline;
|
struct anv_compute_pipeline *pipeline = comp_state->pipeline;
|
||||||
|
const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
|
||||||
|
|
||||||
assert(pipeline->cs);
|
assert(pipeline->cs);
|
||||||
|
|
||||||
|
|
@ -5528,6 +5574,11 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
|
||||||
|
|
||||||
anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
|
anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
|
||||||
|
|
||||||
|
#if GFX_VERx10 >= 125
|
||||||
|
const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
|
||||||
|
genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
|
||||||
|
#endif
|
||||||
|
|
||||||
/* The workgroup size of the pipeline affects our push constant layout
|
/* The workgroup size of the pipeline affects our push constant layout
|
||||||
* so flag push constants as dirty if we change the pipeline.
|
* so flag push constants as dirty if we change the pipeline.
|
||||||
*/
|
*/
|
||||||
|
|
@ -5939,28 +5990,6 @@ genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
|
||||||
struct brw_cs_dispatch_info dispatch =
|
struct brw_cs_dispatch_info dispatch =
|
||||||
brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
|
brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
|
||||||
|
|
||||||
anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
|
|
||||||
const uint32_t subslices = MAX2(devinfo->subslice_total, 1);
|
|
||||||
cfe.MaximumNumberofThreads =
|
|
||||||
devinfo->max_cs_threads * subslices - 1;
|
|
||||||
|
|
||||||
if (cs_prog_data->base.total_scratch > 0) {
|
|
||||||
struct anv_bo *scratch_bo =
|
|
||||||
anv_scratch_pool_alloc(cmd_buffer->device,
|
|
||||||
&cmd_buffer->device->scratch_pool,
|
|
||||||
MESA_SHADER_COMPUTE,
|
|
||||||
cs_prog_data->base.total_scratch);
|
|
||||||
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
|
|
||||||
cmd_buffer->batch.alloc,
|
|
||||||
scratch_bo);
|
|
||||||
uint32_t scratch_surf =
|
|
||||||
anv_scratch_pool_get_surf(cmd_buffer->device,
|
|
||||||
&cmd_buffer->device->scratch_pool,
|
|
||||||
cs_prog_data->base.total_scratch);
|
|
||||||
cfe.ScratchSpaceBuffer = scratch_surf >> 4;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
|
anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
|
||||||
cw.PredicateEnable = false;
|
cw.PredicateEnable = false;
|
||||||
cw.SIMDSize = dispatch.simd_size / 16;
|
cw.SIMDSize = dispatch.simd_size / 16;
|
||||||
|
|
|
||||||
|
|
@ -1879,19 +1879,8 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
|
||||||
void
|
void
|
||||||
genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
|
genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
|
||||||
{
|
{
|
||||||
struct anv_device *device = pipeline->base.device;
|
|
||||||
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
|
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
|
||||||
anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
|
anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
|
||||||
|
|
||||||
const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
|
|
||||||
const struct intel_device_info *devinfo = device->info;
|
|
||||||
|
|
||||||
anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
|
|
||||||
cfe.MaximumNumberofThreads =
|
|
||||||
devinfo->max_cs_threads * devinfo->subslice_total;
|
|
||||||
cfe.ScratchSpaceBuffer =
|
|
||||||
get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* #if GFX_VERx10 >= 125 */
|
#else /* #if GFX_VERx10 >= 125 */
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue