mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-04-21 01:10:44 +02:00
anv: Mask off excessive invocations
For unaligned invocations, don't launch two COMPUTE_WALKERs; instead, mask off the excess invocations in the shader itself at the NIR level and launch one additional workgroup. Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36245>
This commit is contained in:
parent
7b634ebb63
commit
cac3b4f404
7 changed files with 99 additions and 89 deletions
|
|
@ -414,6 +414,14 @@ brw_wm_prog_key_is_dynamic(const struct brw_wm_prog_key *key)
|
|||
|
||||
/* Program key for compute shaders: the shared base key followed by
 * compute-only compile options packed into bitfields.
 */
struct brw_cs_prog_key {
|
||||
struct brw_base_prog_key base;
|
||||
|
||||
/**
 * When set, an unaligned dispatch is lowered to an aligned one: one extra
 * workgroup is dispatched and the excess invocations are masked off in the
 * shader.
 */
|
||||
bool lower_unaligned_dispatch:1;
|
||||
|
||||
/* Remaining 31 bits; together with the 1-bit flag above this totals 32. */
uint32_t padding:31;
|
||||
};
|
||||
|
||||
struct brw_bs_prog_key {
|
||||
|
|
|
|||
|
|
@ -130,6 +130,8 @@ void anv_nir_validate_push_layout(const struct anv_physical_device *pdevice,
|
|||
|
||||
bool anv_nir_update_resource_intel_block(nir_shader *shader);
|
||||
|
||||
/* Inserts, at the start of the shader entry point, an early return for every
 * invocation whose global invocation ID X component is >= the value loaded
 * from inline data at ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET.
 * Always reports progress.
 */
bool anv_nir_lower_unaligned_dispatch(nir_shader *shader);
|
||||
|
||||
bool anv_nir_lower_resource_intel(nir_shader *shader,
|
||||
const struct anv_physical_device *device,
|
||||
enum anv_descriptor_set_layout_type desc_type);
|
||||
|
|
|
|||
28
src/intel/vulkan/anv_nir_lower_unaligned_dispatch.c
Normal file
28
src/intel/vulkan/anv_nir_lower_unaligned_dispatch.c
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
/*
|
||||
* Copyright 2025 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "anv_nir.h"
|
||||
#include "nir_builder.h"
|
||||
#include "compiler/brw_nir.h"
|
||||
|
||||
bool
|
||||
anv_nir_lower_unaligned_dispatch(nir_shader *shader)
|
||||
{
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
||||
nir_builder b = nir_builder_at(nir_before_impl(impl));
|
||||
|
||||
nir_def *global_idx = nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0);
|
||||
nir_def *max_unaligned_invocations_x =
|
||||
nir_load_inline_data_intel(&b, 1, 32,
|
||||
.base = ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET);
|
||||
|
||||
nir_push_if(&b, nir_uge(&b, global_idx, max_unaligned_invocations_x));
|
||||
{
|
||||
nir_jump(&b, nir_jump_return);
|
||||
}
|
||||
nir_pop_if(&b, NULL);
|
||||
|
||||
return nir_progress(true, impl, nir_metadata_none);
|
||||
}
|
||||
|
|
@ -44,6 +44,7 @@
|
|||
#include "vk_pipeline.h"
|
||||
#include "vk_render_pass.h"
|
||||
#include "vk_util.h"
|
||||
#include "vk_shader.h"
|
||||
|
||||
/* Eventually, this will become part of anv_CreateShader. Unfortunately,
|
||||
* we can't do that yet because we don't have the ability to copy nir.
|
||||
|
|
@ -578,13 +579,15 @@ populate_wm_prog_key(struct anv_pipeline_stage *stage,
|
|||
|
||||
/* Fill in the compute-stage program key of `stage`.
 *
 * Zeroes the whole stage->key, populates the base key with
 * INTEL_VUE_LAYOUT_FIXED, sets uses_inline_push_addr when
 * device->info->verx10 >= 125, and stores lower_unaligned_dispatch in the
 * compute key.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers; the
 * first `const struct anv_device *device)` line below appears to be the
 * pre-change end of the signature, superseded by the two lines that follow
 * it — confirm against the actual file.
 */
static void
|
||||
populate_cs_prog_key(struct anv_pipeline_stage *stage,
|
||||
const struct anv_device *device)
|
||||
const struct anv_device *device,
|
||||
bool lower_unaligned_dispatch)
|
||||
{
|
||||
memset(&stage->key, 0, sizeof(stage->key));
|
||||
|
||||
populate_base_prog_key(stage, device, INTEL_VUE_LAYOUT_FIXED);
|
||||
|
||||
stage->key.base.uses_inline_push_addr = device->info->verx10 >= 125;
|
||||
stage->key.cs.lower_unaligned_dispatch = lower_unaligned_dispatch;
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -2115,6 +2118,15 @@ anv_pipeline_nir_preprocess(struct anv_pipeline *pipeline,
|
|||
};
|
||||
NIR_PASS(_, stage->nir, nir_opt_access, &opt_access_options);
|
||||
|
||||
if (stage->nir->info.stage == MESA_SHADER_COMPUTE &&
|
||||
stage->key.cs.lower_unaligned_dispatch) {
|
||||
NIR_PASS(_, stage->nir, anv_nir_lower_unaligned_dispatch);
|
||||
/* anv_nir_lower_unaligned_dispatch emits a nir_jump_return, which
|
||||
* has to be lowered afterwards.
|
||||
*/
|
||||
NIR_PASS(_, stage->nir, nir_lower_returns);
|
||||
}
|
||||
|
||||
/* Use a separate-shader linking model for pipeline libraries, we do cross
|
||||
* stage linking otherwise.
|
||||
*/
|
||||
|
|
@ -2633,6 +2645,9 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
|
|||
};
|
||||
int64_t pipeline_start = os_time_get_nano();
|
||||
|
||||
const bool lower_unaligned_dispatch =
|
||||
(sinfo->flags & VK_SHADER_CREATE_UNALIGNED_DISPATCH_BIT_MESA) != 0;
|
||||
|
||||
struct anv_device *device = pipeline->base.device;
|
||||
const struct brw_compiler *compiler = device->physical->compiler;
|
||||
|
||||
|
|
@ -2650,7 +2665,7 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
|
|||
};
|
||||
anv_stage_write_shader_hash(&stage, device);
|
||||
|
||||
populate_cs_prog_key(&stage, device);
|
||||
populate_cs_prog_key(&stage, device, lower_unaligned_dispatch);
|
||||
|
||||
const bool skip_cache_lookup =
|
||||
(pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
|
||||
|
|
|
|||
|
|
@ -251,6 +251,7 @@ get_max_vbs(const struct intel_device_info *devinfo) {
|
|||
#define ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET (0)
|
||||
#define ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET (8)
|
||||
#define ANV_INLINE_PARAM_MESH_PROVOKING_VERTEX (8)
|
||||
/* Offset in the inline parameter data of the X invocation count for unaligned
 * dispatches. emit_compute_walker divides it by 4 to index the inline data
 * array, and anv_nir_lower_unaligned_dispatch passes it as the base of
 * nir_load_inline_data_intel.
 */
#define ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET (20)
|
||||
|
||||
/* RENDER_SURFACE_STATE is a bit smaller (48b) but since it is aligned to 64
|
||||
* and we can't put anything else there we use 64b.
|
||||
|
|
|
|||
|
|
@ -469,7 +469,8 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
|
|||
const struct brw_cs_prog_data *prog_data,
|
||||
struct intel_cs_dispatch_info dispatch,
|
||||
uint32_t groupCountX, uint32_t groupCountY,
|
||||
uint32_t groupCountZ)
|
||||
uint32_t groupCountZ,
|
||||
uint32_t unaligned_invocations_x)
|
||||
{
|
||||
const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
|
||||
const bool predicate = cmd_buffer->state.conditional_render_enabled;
|
||||
|
|
@ -504,6 +505,8 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
|
|||
[ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0],
|
||||
[ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1],
|
||||
[ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2],
|
||||
[ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET / 4 + 0] =
|
||||
unaligned_invocations_x,
|
||||
},
|
||||
.PostSync = {
|
||||
.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
|
||||
|
|
@ -566,7 +569,7 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
|
|||
struct intel_cs_dispatch_info dispatch,
|
||||
struct anv_address indirect_addr,
|
||||
uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
|
||||
bool is_unaligned_size_x)
|
||||
bool is_unaligned_size_x, uint32_t unaligned_invocations_x)
|
||||
{
|
||||
struct anv_device *device = cmd_buffer->device;
|
||||
struct anv_instance *instance = device->physical->instance;
|
||||
|
|
@ -598,7 +601,8 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
|
|||
|
||||
#if GFX_VERx10 >= 125
|
||||
emit_compute_walker(cmd_buffer, indirect_addr, prog_data,
|
||||
dispatch, groupCountX, groupCountY, groupCountZ);
|
||||
dispatch, groupCountX, groupCountY, groupCountZ,
|
||||
unaligned_invocations_x);
|
||||
#else
|
||||
emit_gpgpu_walker(cmd_buffer, is_indirect, prog_data,
|
||||
groupCountX, groupCountY, groupCountZ);
|
||||
|
|
@ -647,7 +651,7 @@ void genX(CmdDispatchBase)(
|
|||
emit_cs_walker(cmd_buffer, prog_data, dispatch,
|
||||
ANV_NULL_ADDRESS /* no indirect data */,
|
||||
groupCountX, groupCountY, groupCountZ,
|
||||
false);
|
||||
false, 0);
|
||||
|
||||
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
|
||||
|
||||
|
|
@ -656,63 +660,6 @@ void genX(CmdDispatchBase)(
|
|||
prog_data->base.source_hash);
|
||||
}
|
||||
|
||||
/* Emit one compute walker for part of an unaligned dispatch.
 *
 * Steps, in order:
 *  - return early if the batch already has an error;
 *  - push the base group and group counts with
 *    anv_cmd_buffer_push_workgroups (no indirect address);
 *  - assert that the local size and the group count are 1 in Y and Z;
 *  - take a measure snapshot and begin the compute trace;
 *  - assert the shader does not use num_work_groups, then flush compute
 *    state;
 *  - emit the conditional render predicate if conditional rendering is
 *    enabled;
 *  - on GFX_VERx10 >= 125 only, emit the compute walker;
 *  - end the compute trace.
 *
 * NOTE(review): this listing is a rendered diff; the function sits in a hunk
 * whose line count shrinks (63 -> 6), so it appears to be code removed by the
 * change — confirm against the actual file.
 */
static void
|
||||
emit_unaligned_cs_walker(
|
||||
VkCommandBuffer commandBuffer,
|
||||
uint32_t baseGroupX,
|
||||
uint32_t baseGroupY,
|
||||
uint32_t baseGroupZ,
|
||||
uint32_t groupCountX,
|
||||
uint32_t groupCountY,
|
||||
uint32_t groupCountZ,
|
||||
struct intel_cs_dispatch_info dispatch)
|
||||
{
|
||||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
|
||||
const struct brw_cs_prog_data *prog_data = get_cs_prog_data(comp_state);
|
||||
|
||||
if (anv_batch_has_error(&cmd_buffer->batch))
|
||||
return;
|
||||
|
||||
anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
|
||||
baseGroupX, baseGroupY, baseGroupZ,
|
||||
groupCountX, groupCountY, groupCountZ,
|
||||
ANV_NULL_ADDRESS);
|
||||
|
||||
/* RT shaders have Y and Z local size set to 1 always. */
|
||||
assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
|
||||
|
||||
/* RT shaders dispatched with group Y and Z set to 1 always. */
|
||||
assert(groupCountY == 1 && groupCountZ == 1);
|
||||
|
||||
if (anv_batch_has_error(&cmd_buffer->batch))
|
||||
return;
|
||||
|
||||
anv_measure_snapshot(cmd_buffer,
|
||||
INTEL_SNAPSHOT_COMPUTE,
|
||||
"compute-unaligned-cs-walker",
|
||||
groupCountX * groupCountY * groupCountZ *
|
||||
prog_data->local_size[0] * prog_data->local_size[1] *
|
||||
prog_data->local_size[2]);
|
||||
|
||||
trace_intel_begin_compute(&cmd_buffer->trace);
|
||||
|
||||
assert(!prog_data->uses_num_work_groups);
|
||||
genX(cmd_buffer_flush_compute_state)(cmd_buffer);
|
||||
|
||||
if (cmd_buffer->state.conditional_render_enabled)
|
||||
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
emit_compute_walker(cmd_buffer, ANV_NULL_ADDRESS, prog_data,
|
||||
dispatch, groupCountX, groupCountY, groupCountZ);
|
||||
#endif
|
||||
|
||||
trace_intel_end_compute(&cmd_buffer->trace,
|
||||
groupCountX, groupCountY, groupCountZ,
|
||||
prog_data->base.source_hash);
|
||||
}
|
||||
|
||||
/*
|
||||
* Dispatch compute work item with unaligned thread invocations.
|
||||
*
|
||||
|
|
@ -734,43 +681,51 @@ genX(cmd_dispatch_unaligned)(
|
|||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
|
||||
const struct brw_cs_prog_data *prog_data = get_cs_prog_data(comp_state);
|
||||
if (anv_batch_has_error(&cmd_buffer->batch))
|
||||
return;
|
||||
|
||||
/* Group X can be unaligned for RT dispatches. */
|
||||
uint32_t groupCountX = invocations_x / prog_data->local_size[0];
|
||||
uint32_t groupCountX = DIV_ROUND_UP(invocations_x, prog_data->local_size[0]);
|
||||
uint32_t groupCountY = invocations_y;
|
||||
uint32_t groupCountZ = invocations_z;
|
||||
|
||||
struct intel_cs_dispatch_info dispatch =
|
||||
brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
|
||||
|
||||
/* Launch first CS walker with aligned group count X. */
|
||||
if (groupCountX) {
|
||||
emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
|
||||
groupCountY, groupCountZ, dispatch);
|
||||
}
|
||||
anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data, 0, 0, 0, groupCountX,
|
||||
groupCountY, groupCountZ, ANV_NULL_ADDRESS);
|
||||
|
||||
uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
|
||||
if (unaligned_invocations_x) {
|
||||
dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
|
||||
dispatch.simd_size);
|
||||
/* RT shaders have Y and Z local size set to 1 always. */
|
||||
assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
|
||||
/* RT shaders dispatched with group Y and Z set to 1 always. */
|
||||
assert(groupCountY == 1 && groupCountZ == 1);
|
||||
|
||||
/* Make sure the 2nd walker has the same amount of invocations per
|
||||
* workgroup as the 1st walker, so that gl_GlobalInvocationsID can be
|
||||
* calculated correctly with baseGroup.
|
||||
*/
|
||||
assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);
|
||||
anv_measure_snapshot(cmd_buffer,
|
||||
INTEL_SNAPSHOT_COMPUTE,
|
||||
"compute-unaligned-cs-walker",
|
||||
groupCountX * groupCountY * groupCountZ *
|
||||
prog_data->local_size[0] * prog_data->local_size[1] *
|
||||
prog_data->local_size[2]);
|
||||
|
||||
const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
|
||||
if (remainder > 0) {
|
||||
dispatch.right_mask = ~0u >> (32 - remainder);
|
||||
} else {
|
||||
dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
|
||||
}
|
||||
trace_intel_begin_compute(&cmd_buffer->trace);
|
||||
|
||||
/* Launch second CS walker for unaligned part. */
|
||||
emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
|
||||
dispatch);
|
||||
}
|
||||
assert(!prog_data->uses_num_work_groups);
|
||||
genX(cmd_buffer_flush_compute_state)(cmd_buffer);
|
||||
if (cmd_buffer->state.conditional_render_enabled)
|
||||
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
|
||||
|
||||
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
|
||||
|
||||
emit_cs_walker(cmd_buffer, prog_data, dispatch,
|
||||
ANV_NULL_ADDRESS /* no indirect data */,
|
||||
groupCountX, groupCountY, groupCountZ,
|
||||
false, invocations_x);
|
||||
|
||||
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
|
||||
|
||||
trace_intel_end_compute(&cmd_buffer->trace,
|
||||
groupCountX, groupCountY, groupCountZ,
|
||||
prog_data->base.source_hash);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -809,7 +764,7 @@ genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
|
|||
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
|
||||
|
||||
emit_cs_walker(cmd_buffer, prog_data, dispatch, indirect_addr,
|
||||
0, 0, 0, is_unaligned_size_x);
|
||||
0, 0, 0, is_unaligned_size_x, 0);
|
||||
|
||||
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
|
||||
|
||||
|
|
|
|||
|
|
@ -177,6 +177,7 @@ libanv_files = files(
|
|||
'anv_nir_lower_multiview.c',
|
||||
'anv_nir_lower_ubo_loads.c',
|
||||
'anv_nir_lower_resource_intel.c',
|
||||
'anv_nir_lower_unaligned_dispatch.c',
|
||||
'anv_nir_push_descriptor_analysis.c',
|
||||
'anv_perf.c',
|
||||
'anv_physical_device.c',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue