nvk: Multiply by local_size for CS invocations in DGC codepath
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

We were entirely missing the local size in the accounting for CS
invocations.

Cc: mesa-stable
Signed-off-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41553>
This commit is contained in:
Mary Guillemard 2026-05-13 17:35:34 +02:00 committed by Marge Bot
parent dfa779c559
commit 662a346ce3
3 changed files with 47 additions and 1 deletions

View file

@ -284,6 +284,10 @@ struct nak_qmd_dispatch_size_layout {
uint16_t x_start, x_end;
uint16_t y_start, y_end;
uint16_t z_start, z_end;
uint16_t local_x_start, local_x_end;
uint16_t local_y_start, local_y_end;
uint16_t local_z_start, local_z_end;
};
struct nak_qmd_dispatch_size_layout

View file

@ -80,6 +80,11 @@ macro_rules! qmd_impl_common {
let w = paste! {$c::[<$s _CTA_RASTER_WIDTH>]};
let h = paste! {$c::[<$s _CTA_RASTER_HEIGHT>]};
let d = paste! {$c::[<$s _CTA_RASTER_DEPTH>]};
let local_w = paste! {$c::[<$s _CTA_THREAD_DIMENSION0>]};
let local_h = paste! {$c::[<$s _CTA_THREAD_DIMENSION1>]};
let local_d = paste! {$c::[<$s _CTA_THREAD_DIMENSION2>]};
nak_qmd_dispatch_size_layout {
x_start: w.start as u16,
x_end: w.end as u16,
@ -87,6 +92,12 @@ macro_rules! qmd_impl_common {
y_end: h.end as u16,
z_start: d.start as u16,
z_end: d.end as u16,
local_x_start: local_w.start as u16,
local_x_end: local_w.end as u16,
local_y_start: local_h.start as u16,
local_y_end: local_h.end as u16,
local_z_start: local_d.start as u16,
local_z_end: local_d.end as u16,
}
};

View file

@ -394,12 +394,43 @@ build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
nir_ior(b, load_global_dw(b, shader_qmd_addr, cb0_addr_hi_dw),
nir_unpack_64_2x32_split_y(b, root_addr_shifted));
/* Ensure each local size parameter is fully contained within a single
 * 32-bit dword */
assert(qmd_layout.local_x_start / 32 ==
(qmd_layout.local_x_end - 1) / 32);
assert(qmd_layout.local_y_start / 32 ==
(qmd_layout.local_y_end - 1) / 32);
assert(qmd_layout.local_z_start / 32 ==
(qmd_layout.local_z_end - 1) / 32);
/* Preload local size parameters as we are going to use them */
qmd_repl[qmd_layout.local_x_start / 32] = load_global_dw(
b, shader_qmd_addr, qmd_layout.local_x_start / 32);
qmd_repl[qmd_layout.local_y_start / 32] = load_global_dw(
b, shader_qmd_addr, qmd_layout.local_y_start / 32);
qmd_repl[qmd_layout.local_z_start / 32] = load_global_dw(
b, shader_qmd_addr, qmd_layout.local_z_start / 32);
copy_repl_global_dw(b, qmd_addr, shader_qmd_addr,
qmd_repl, qmd_repl_count);
/* Compute the total CS invocation count: dispatch size times local size */
/* Extract each local size dimension from its QMD dword.
 *
 * nir_ubitfield_extract_imm() takes (offset, bit count), so the last
 * argument must be the field width. The *_end positions are exclusive
 * (the asserts check (*_end - 1) / 32), so the width is end - start.
 * Using end % 32 would yield 0 bits for a field ending on a dword
 * boundary. */
nir_def *local_x = nir_ubitfield_extract_imm(
   b, qmd_repl[qmd_layout.local_x_start / 32],
   qmd_layout.local_x_start % 32,
   qmd_layout.local_x_end - qmd_layout.local_x_start);
nir_def *local_y = nir_ubitfield_extract_imm(
   b, qmd_repl[qmd_layout.local_y_start / 32],
   qmd_layout.local_y_start % 32,
   qmd_layout.local_y_end - qmd_layout.local_y_start);
nir_def *local_z = nir_ubitfield_extract_imm(
   b, qmd_repl[qmd_layout.local_z_start / 32],
   qmd_layout.local_z_start % 32,
   qmd_layout.local_z_end - qmd_layout.local_z_start);
nir_def *local_size =
nir_imul(b, nir_imul(b, local_x, local_y), local_z);
nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y);
invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z));
invoc = nir_imul(b, invoc, nir_u2u64(b, local_size));
/* Now emit commands */
if (pdev->info.cls_compute >= AMPERE_COMPUTE_B)
nvk_nir_P_1INC(b, p, NVC7C0, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
else