anv: Set execution mask based on SIMD size

Execution mask gets applied to last thread in the threadgroup to mask
off simd lanes, But with BTD enabled, we are seeing only last 4
components has valid stack ID's and upper 4 components of the register
are zero.

Changing execution mask somehow populates the stack IDs properly.

This is on simulator, before changing the execution mask:
00000000 00000000 00000000 00000000  000F000E 000D000C 000B000A 00090008  00000000 00000000 00000000 00000000  000F000E 000D000C 000B000A 00090008  r1

After changing execution mask:
000F000E 000D000C 000B000A 00090008  00070006 00050004 00030002 00010000  000F000E 000D000C 000B000A 00090008  00070006 00050004 00030002 00010000  r1

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41409>
This commit is contained in:
Sagar Ghuge 2026-05-06 15:22:54 -07:00 committed by Marge Bot
parent 7b286abe33
commit 17f7e7f96b

View file

@ -931,7 +931,8 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
#if GFX_VERx10 >= 125
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3],
uint32_t max_shift)
{
unsigned total_shift = 0;
memset(local_shift, 0, 3);
@ -947,13 +948,13 @@ calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
total_shift++;
}
if (total_shift == 3)
if (total_shift == max_shift)
return;
}
} while(progress);
/* Assign whatever's left to x */
local_shift[0] += 3 - total_shift;
local_shift[0] += max_shift - total_shift;
}
static struct GENX(RT_SHADER_TABLE)
@ -1390,6 +1391,10 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
struct anv_address rtdg_addr =
anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
const struct brw_cs_prog_data *cs_prog_data =
brw_cs_prog_data_const(device->rt_trampoline->prog_data);
struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);
uint8_t local_size_log2[3];
uint32_t global_size[3] = {};
@ -1398,7 +1403,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
* will use a two-dimensional dispatch size. Worst case, our initial
* dispatch will be a little slower than it has to be.
*/
local_size_log2[0] = 2;
local_size_log2[0] = util_logbase2(dispatch.simd_size) - 1;
local_size_log2[1] = 1;
local_size_log2[2] = 0;
@ -1448,7 +1453,8 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMZ_num)), launch_size[2]);
} else {
calc_local_trace_size(local_size_log2, params->launch_size);
calc_local_trace_size(local_size_log2, params->launch_size,
util_logbase2(dispatch.simd_size));
for (unsigned i = 0; i < 3; i++) {
/* We have to be a bit careful here because DIV_ROUND_UP adds to the
@ -1459,11 +1465,6 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
}
}
const struct brw_cs_prog_data *cs_prog_data =
brw_cs_prog_data_const(device->rt_trampoline->prog_data);
struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);
const mesa_shader_stage s = MESA_SHADER_RAYGEN;
struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
struct anv_state *samplers = &cmd_buffer->state.samplers[s];
@ -1494,7 +1495,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
.ThreadGroupIDXDimension = global_size[0],
.ThreadGroupIDYDimension = global_size[1],
.ThreadGroupIDZDimension = global_size[2],
.ExecutionMask = 0xff,
.ExecutionMask = dispatch.right_mask,
.EmitInlineParameter = true,
.PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
#if GFX_VER >= 30