anv: Mask off excessive invocations
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

For unaligned invocations, don't launch two COMPUTE_WALKERs; instead, we
can mask off the excessive invocations in the shader itself at the NIR
level and launch one additional workgroup.

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36245>
This commit is contained in:
Sagar Ghuge 2025-07-24 15:55:02 -07:00 committed by Marge Bot
parent 7b634ebb63
commit cac3b4f404
7 changed files with 99 additions and 89 deletions

View file

@ -414,6 +414,14 @@ brw_wm_prog_key_is_dynamic(const struct brw_wm_prog_key *key)
struct brw_cs_prog_key {
struct brw_base_prog_key base;
/**
* Lowers unaligned dispatches into aligned ones by dispatching one
* extra workgroup and masking off excessive invocations in the shader.
*/
bool lower_unaligned_dispatch:1;
/** Explicit padding so the key has no uninitialized bits (hashed as a blob). */
uint32_t padding:31;
};
struct brw_bs_prog_key {

View file

@ -130,6 +130,8 @@ void anv_nir_validate_push_layout(const struct anv_physical_device *pdevice,
bool anv_nir_update_resource_intel_block(nir_shader *shader);
bool anv_nir_lower_unaligned_dispatch(nir_shader *shader);
bool anv_nir_lower_resource_intel(nir_shader *shader,
const struct anv_physical_device *device,
enum anv_descriptor_set_layout_type desc_type);

View file

@ -0,0 +1,28 @@
/*
* Copyright 2025 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "anv_nir.h"
#include "nir_builder.h"
#include "compiler/brw_nir.h"
/*
 * Lower an unaligned compute dispatch by masking off the excess
 * invocations along X.
 *
 * Inserts an early return at the top of the entrypoint for every
 * invocation whose global X index is at or beyond the real invocation
 * count, which the driver passes through inline data at
 * ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET.  The emitted
 * nir_jump_return must be cleaned up afterwards with nir_lower_returns.
 */
bool
anv_nir_lower_unaligned_dispatch(nir_shader *shader)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
   nir_builder build = nir_builder_at(nir_before_impl(entrypoint));

   /* Actual invocation count along X, provided by the driver. */
   nir_def *invocation_count_x =
      nir_load_inline_data_intel(&build, 1, 32,
                                 .base = ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET);

   nir_def *invocation_idx_x =
      nir_channel(&build, nir_load_global_invocation_id(&build, 32), 0);

   /* Bail out of the shader for the padding invocations. */
   nir_push_if(&build, nir_uge(&build, invocation_idx_x, invocation_count_x));
   {
      nir_jump(&build, nir_jump_return);
   }
   nir_pop_if(&build, NULL);

   return nir_progress(true, entrypoint, nir_metadata_none);
}

View file

@ -44,6 +44,7 @@
#include "vk_pipeline.h"
#include "vk_render_pass.h"
#include "vk_util.h"
#include "vk_shader.h"
/* Eventually, this will become part of anv_CreateShader. Unfortunately,
* we can't do that yet because we don't have the ability to copy nir.
@ -578,13 +579,15 @@ populate_wm_prog_key(struct anv_pipeline_stage *stage,
static void
populate_cs_prog_key(struct anv_pipeline_stage *stage,
const struct anv_device *device)
const struct anv_device *device,
bool lower_unaligned_dispatch)
{
memset(&stage->key, 0, sizeof(stage->key));
populate_base_prog_key(stage, device, INTEL_VUE_LAYOUT_FIXED);
stage->key.base.uses_inline_push_addr = device->info->verx10 >= 125;
stage->key.cs.lower_unaligned_dispatch = lower_unaligned_dispatch;
}
static void
@ -2115,6 +2118,15 @@ anv_pipeline_nir_preprocess(struct anv_pipeline *pipeline,
};
NIR_PASS(_, stage->nir, nir_opt_access, &opt_access_options);
if (stage->nir->info.stage == MESA_SHADER_COMPUTE &&
stage->key.cs.lower_unaligned_dispatch) {
NIR_PASS(_, stage->nir, anv_nir_lower_unaligned_dispatch);
/* The anv_nir_lower_unaligned_dispatch pass emits nir_jump_return,
* which we need to lower afterwards.
*/
NIR_PASS(_, stage->nir, nir_lower_returns);
}
/* Use a separate-shader linking model for pipeline libraries, we do cross
* stage linking otherwise.
*/
@ -2633,6 +2645,9 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
};
int64_t pipeline_start = os_time_get_nano();
const bool lower_unaligned_dispatch =
(sinfo->flags & VK_SHADER_CREATE_UNALIGNED_DISPATCH_BIT_MESA) != 0;
struct anv_device *device = pipeline->base.device;
const struct brw_compiler *compiler = device->physical->compiler;
@ -2650,7 +2665,7 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
};
anv_stage_write_shader_hash(&stage, device);
populate_cs_prog_key(&stage, device);
populate_cs_prog_key(&stage, device, lower_unaligned_dispatch);
const bool skip_cache_lookup =
(pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);

View file

@ -251,6 +251,7 @@ get_max_vbs(const struct intel_device_info *devinfo) {
#define ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET (0)
#define ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET (8)
#define ANV_INLINE_PARAM_MESH_PROVOKING_VERTEX (8)
#define ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET (20)
/* RENDER_SURFACE_STATE is a bit smaller (48b) but since it is aligned to 64
* and we can't put anything else there we use 64b.

View file

@ -469,7 +469,8 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
const struct brw_cs_prog_data *prog_data,
struct intel_cs_dispatch_info dispatch,
uint32_t groupCountX, uint32_t groupCountY,
uint32_t groupCountZ)
uint32_t groupCountZ,
uint32_t unaligned_invocations_x)
{
const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
const bool predicate = cmd_buffer->state.conditional_render_enabled;
@ -504,6 +505,8 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
[ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0],
[ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1],
[ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2],
[ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET / 4 + 0] =
unaligned_invocations_x,
},
.PostSync = {
.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
@ -566,7 +569,7 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
struct intel_cs_dispatch_info dispatch,
struct anv_address indirect_addr,
uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
bool is_unaligned_size_x)
bool is_unaligned_size_x, uint32_t unaligned_invocations_x)
{
struct anv_device *device = cmd_buffer->device;
struct anv_instance *instance = device->physical->instance;
@ -598,7 +601,8 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
#if GFX_VERx10 >= 125
emit_compute_walker(cmd_buffer, indirect_addr, prog_data,
dispatch, groupCountX, groupCountY, groupCountZ);
dispatch, groupCountX, groupCountY, groupCountZ,
unaligned_invocations_x);
#else
emit_gpgpu_walker(cmd_buffer, is_indirect, prog_data,
groupCountX, groupCountY, groupCountZ);
@ -647,7 +651,7 @@ void genX(CmdDispatchBase)(
emit_cs_walker(cmd_buffer, prog_data, dispatch,
ANV_NULL_ADDRESS /* no indirect data */,
groupCountX, groupCountY, groupCountZ,
false);
false, 0);
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
@ -656,63 +660,6 @@ void genX(CmdDispatchBase)(
prog_data->base.source_hash);
}
/* Emit a single COMPUTE_WALKER for one piece of an unaligned dispatch.
 *
 * Only used on GFX_VERx10 >= 125 (the #if below); on older gens this
 * emits nothing after the state flush.  The caller supplies a
 * pre-computed intel_cs_dispatch_info so the second (remainder) walker
 * can use a trimmed thread count / execution mask.  Used by RT
 * dispatches, hence the Y/Z == 1 assertions below.
 */
static void
emit_unaligned_cs_walker(
VkCommandBuffer commandBuffer,
uint32_t baseGroupX,
uint32_t baseGroupY,
uint32_t baseGroupZ,
uint32_t groupCountX,
uint32_t groupCountY,
uint32_t groupCountZ,
struct intel_cs_dispatch_info dispatch)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
const struct brw_cs_prog_data *prog_data = get_cs_prog_data(comp_state);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
/* Make the base/count visible to the shader's push constants. */
anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
baseGroupX, baseGroupY, baseGroupZ,
groupCountX, groupCountY, groupCountZ,
ANV_NULL_ADDRESS);
/* RT shaders have Y and Z local size set to 1 always. */
assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
/* RT shaders dispatched with group Y and Z set to 1 always. */
assert(groupCountY == 1 && groupCountZ == 1);
/* Re-check: the push above may have appended to the batch and failed. */
if (anv_batch_has_error(&cmd_buffer->batch))
return;
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_COMPUTE,
"compute-unaligned-cs-walker",
groupCountX * groupCountY * groupCountZ *
prog_data->local_size[0] * prog_data->local_size[1] *
prog_data->local_size[2]);
trace_intel_begin_compute(&cmd_buffer->trace);
/* No indirect data is passed, so the shader must not read it. */
assert(!prog_data->uses_num_work_groups);
genX(cmd_buffer_flush_compute_state)(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
#if GFX_VERx10 >= 125
emit_compute_walker(cmd_buffer, ANV_NULL_ADDRESS, prog_data,
dispatch, groupCountX, groupCountY, groupCountZ);
#endif
trace_intel_end_compute(&cmd_buffer->trace,
groupCountX, groupCountY, groupCountZ,
prog_data->base.source_hash);
}
/*
* Dispatch compute work item with unaligned thread invocations.
*
@ -734,43 +681,51 @@ genX(cmd_dispatch_unaligned)(
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
const struct brw_cs_prog_data *prog_data = get_cs_prog_data(comp_state);
if (anv_batch_has_error(&cmd_buffer->batch))
return;
/* Group X can be unaligned for RT dispatches. */
uint32_t groupCountX = invocations_x / prog_data->local_size[0];
uint32_t groupCountX = DIV_ROUND_UP(invocations_x, prog_data->local_size[0]);
uint32_t groupCountY = invocations_y;
uint32_t groupCountZ = invocations_z;
struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
/* Launch first CS walker with aligned group count X. */
if (groupCountX) {
emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
groupCountY, groupCountZ, dispatch);
}
anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data, 0, 0, 0, groupCountX,
groupCountY, groupCountZ, ANV_NULL_ADDRESS);
uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
if (unaligned_invocations_x) {
dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
dispatch.simd_size);
/* RT shaders have Y and Z local size set to 1 always. */
assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
/* RT shaders dispatched with group Y and Z set to 1 always. */
assert(groupCountY == 1 && groupCountZ == 1);
/* Make sure the 2nd walker has the same amount of invocations per
* workgroup as the 1st walker, so that gl_GlobalInvocationsID can be
* calculated correctly with baseGroup.
*/
assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);
anv_measure_snapshot(cmd_buffer,
INTEL_SNAPSHOT_COMPUTE,
"compute-unaligned-cs-walker",
groupCountX * groupCountY * groupCountZ *
prog_data->local_size[0] * prog_data->local_size[1] *
prog_data->local_size[2]);
const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
if (remainder > 0) {
dispatch.right_mask = ~0u >> (32 - remainder);
} else {
dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
}
trace_intel_begin_compute(&cmd_buffer->trace);
/* Launch second CS walker for unaligned part. */
emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
dispatch);
}
assert(!prog_data->uses_num_work_groups);
genX(cmd_buffer_flush_compute_state)(cmd_buffer);
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
emit_cs_walker(cmd_buffer, prog_data, dispatch,
ANV_NULL_ADDRESS /* no indirect data */,
groupCountX, groupCountY, groupCountZ,
false, invocations_x);
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
trace_intel_end_compute(&cmd_buffer->trace,
groupCountX, groupCountY, groupCountZ,
prog_data->base.source_hash);
}
/*
@ -809,7 +764,7 @@ genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
emit_cs_walker(cmd_buffer, prog_data, dispatch, indirect_addr,
0, 0, 0, is_unaligned_size_x);
0, 0, 0, is_unaligned_size_x, 0);
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);

View file

@ -177,6 +177,7 @@ libanv_files = files(
'anv_nir_lower_multiview.c',
'anv_nir_lower_ubo_loads.c',
'anv_nir_lower_resource_intel.c',
'anv_nir_lower_unaligned_dispatch.c',
'anv_nir_push_descriptor_analysis.c',
'anv_perf.c',
'anv_physical_device.c',