From 17f7e7f96bae980844c16d787155dedadd8befb2 Mon Sep 17 00:00:00 2001 From: Sagar Ghuge Date: Wed, 6 May 2026 15:22:54 -0700 Subject: [PATCH] anv: Set execution mask based on SIMD size Execution mask gets applied to last thread in the threadgroup to mask off simd lanes, But with BTD enabled, we are seeing only last 4 components has valid stack ID's and upper 4 components of the register are zero. Changing execution mask somehow populates the stack IDs properly. This is on simulator, before changing the execution mask: 00000000 00000000 00000000 00000000 000F000E 000D000C 000B000A 00090008 00000000 00000000 00000000 00000000 000F000E 000D000C 000B000A 00090008 r1 After changing execution mask: 000F000E 000D000C 000B000A 00090008 00070006 00050004 00030002 00010000 000F000E 000D000C 000B000A 00090008 00070006 00050004 00030002 00010000 r1 Signed-off-by: Sagar Ghuge Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/vulkan/genX_cmd_compute.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 9df9f8a0334..56fe5a8c481 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ b/src/intel/vulkan/genX_cmd_compute.c @@ -931,7 +931,8 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer) #if GFX_VERx10 >= 125 static void -calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3]) +calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3], + uint32_t max_shift) { unsigned total_shift = 0; memset(local_shift, 0, 3); @@ -947,13 +948,13 @@ calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3]) total_shift++; } - if (total_shift == 3) + if (total_shift == max_shift) return; } } while(progress); /* Assign whatever's left to x */ - local_shift[0] += 3 - total_shift; + local_shift[0] += max_shift - total_shift; } static struct GENX(RT_SHADER_TABLE) @@ -1390,6 +1391,10 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, struct anv_address rtdg_addr = anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state); + const struct brw_cs_prog_data *cs_prog_data = + brw_cs_prog_data_const(device->rt_trampoline->prog_data); + struct intel_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL); uint8_t local_size_log2[3]; uint32_t global_size[3] = {}; @@ -1398,7 +1403,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, * will use a two-dimensional dispatch size. Worst case, our initial * dispatch will be a little slower than it has to be. */ - local_size_log2[0] = 2; + local_size_log2[0] = util_logbase2(dispatch.simd_size) - 1; local_size_log2[1] = 1; local_size_log2[2] = 0; @@ -1448,7 +1453,8 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMZ_num)), launch_size[2]); } else { - calc_local_trace_size(local_size_log2, params->launch_size); + calc_local_trace_size(local_size_log2, params->launch_size, + util_logbase2(dispatch.simd_size)); for (unsigned i = 0; i < 3; i++) { /* We have to be a bit careful here because DIV_ROUND_UP adds to the @@ -1459,11 +1465,6 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, } } - const struct brw_cs_prog_data *cs_prog_data = - brw_cs_prog_data_const(device->rt_trampoline->prog_data); - struct intel_cs_dispatch_info dispatch = - brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL); - const mesa_shader_stage s = MESA_SHADER_RAYGEN; struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s]; struct anv_state *samplers = &cmd_buffer->state.samplers[s]; @@ -1494,7 +1495,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, .ThreadGroupIDXDimension = global_size[0], .ThreadGroupIDYDimension = global_size[1], .ThreadGroupIDZDimension = global_size[2], - .ExecutionMask = 0xff, + .ExecutionMask = dispatch.right_mask, .EmitInlineParameter = true, .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0), #if GFX_VER >= 30