anv/xe3+: Set RegistersPerThread for bindless shader dispatch.

v2: Use MOV and wrap in conditional during BTD spawn header setup
    (Lionel).  Remove references to SIMD8 (Tapani).

v3: Update brw_bsr() to specify number of registers per thread, don't
    initialize Registers Per Thread on BTD spawn header (Lionel).

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32664>
This commit is contained in:
Francisco Jerez 2025-01-29 11:05:49 -08:00 committed by Marge Bot
parent b25d0f899b
commit dd1712515b
4 changed files with 45 additions and 7 deletions

View file

@ -18,13 +18,15 @@
/**
 * Pack the fields of a bindless shader record into a single 64-bit value.
 *
 * The returned value is the bitwise OR of:
 *  - ptl_register_blocks(grf_used), shifted left by 60 bits;
 *  - offset, which is asserted to be a multiple of 64;
 *  - SET_BITS(simd_size == 8, 4, 4), i.e. a flag that is 1 only for SIMD8
 *    (NOTE(review): SET_BITS is defined elsewhere; presumably it places the
 *    value in the given high..low bit range -- confirm);
 *  - SET_BITS(local_arg_offset / 8, 2, 0), where local_arg_offset is
 *    asserted to be a multiple of 8.
 *
 * simd_size is asserted to be 8 or 16.  devinfo is not read in the body
 * shown here.
 */
static uint64_t
brw_bsr(const struct intel_device_info *devinfo,
uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset,
uint8_t grf_used)
{
assert(offset % 64 == 0);
assert(simd_size == 8 || simd_size == 16);
assert(local_arg_offset % 8 == 0);
return offset |
return ((uint64_t)ptl_register_blocks(grf_used) << 60) |
offset |
SET_BITS(simd_size == 8, 4, 4) |
SET_BITS(local_arg_offset / 8, 2, 0);
}
@ -69,7 +71,8 @@ compile_single_bs(const struct brw_compiler *compiler,
nir_shader *shader,
brw_generator *g,
struct brw_compile_stats *stats,
int *prog_offset)
int *prog_offset,
uint64_t *bsr)
{
const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);
@ -147,7 +150,10 @@ compile_single_bs(const struct brw_compiler *compiler,
else
assert(offset == 0);
if (!prog_offset)
if (bsr)
*bsr = brw_bsr(compiler->devinfo, offset, dispatch_width, 0,
selected->grf_used);
else
prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
selected->grf_used);
@ -185,7 +191,7 @@ brw_compile_bs(const struct brw_compiler *compiler,
prog_data->simd_size =
compile_single_bs(compiler, params, params->key, prog_data,
shader, &g, params->base.stats, NULL);
shader, &g, params->base.stats, NULL, NULL);
if (prog_data->simd_size == 0)
return NULL;
@ -206,12 +212,12 @@ brw_compile_bs(const struct brw_compiler *compiler,
int offset = 0;
uint8_t simd_size =
compile_single_bs(compiler, params, params->key,
prog_data, resume_shaders[i], &g, NULL, &offset);
prog_data, resume_shaders[i], &g, NULL, &offset,
&resume_sbt[i]);
if (simd_size == 0)
return NULL;
assert(offset > 0);
resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
}
/* We only have one constant data so we want to make sure they're all the

View file

@ -2214,6 +2214,14 @@ lower_btd_logical_send(const brw_builder &bld, fs_inst *inst)
global_addr.type = BRW_TYPE_UD;
global_addr.stride = 1;
ubld.group(2, 0).MOV(header, global_addr);
/* XXX - There is a Registers Per Thread field in the BTD spawn
* header starting on Xe3, but it doesn't appear to be needed
* by the hardware, so we don't set it. If it is ever needed,
* we will need some sort of reloc, since the field would have
* to be initialized based on the prog_data structure of the
* callee.
*/
break;
case SHADER_OPCODE_BTD_RETIRE_LOGICAL:

View file

@ -1,4 +1,12 @@
<?xml version='1.0' encoding='utf-8'?>
<genxml name="RT" gen="30">
<import name="gen200_rt.xml" />
<!--
  Bindless shader record layout declared for this genxml file (gen="30").
  Bit ranges, as given by the start/end attributes below:
    0-2    Offset To Local Arguments
    4      Bindless Shader Dispatch Mode (RT_SIMD16 = 0 is the only value listed)
    6-31   Kernel Start Pointer
    60-63  Registers Per Thread
  NOTE(review): presumably this replaces the BINDLESS_SHADER_RECORD imported
  from gen200_rt.xml, and length="2" is counted in dwords; confirm against
  the genxml tooling.
-->
<struct name="BINDLESS_SHADER_RECORD" length="2">
<field name="Offset To Local Arguments" start="0" end="2" type="uint" />
<field name="Bindless Shader Dispatch Mode" start="4" end="4" type="uint">
<value name="RT_SIMD16" value="0" />
</field>
<field name="Kernel Start Pointer" start="6" end="31" type="offset" />
<field name="Registers Per Thread" start="60" end="63" type="uint" />
</struct>
</genxml>

View file

@ -276,6 +276,21 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline);
void
genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline);
#if GFX_VERx10 >= 300
/* Build a GENX(BINDLESS_SHADER_RECORD) for the shader binary `bin`
 * (variant compiled when GFX_VERx10 >= 300).
 *
 *   bin              - pointer-like expression with `prog_data` and
 *                      `kernel.offset` members; it is expanded twice, so
 *                      it must not have side effects.
 *   local_arg_offset - must be a multiple of 8 (asserted); it is stored
 *                      divided by 8 and is also expanded twice.
 *
 * Asserts that the bindless shader prog_data reports simd_size == 16.
 * Registers Per Thread is computed with ptl_register_blocks() from the
 * prog_data's base.grf_used.
 *
 * Both macro parameters are parenthesized at every use so that arguments
 * which are not primary expressions still parse as intended.
 */
#define anv_shader_bin_get_bsr(bin, local_arg_offset) ({ \
   assert((local_arg_offset) % 8 == 0); \
   const struct brw_bs_prog_data *prog_data = \
      brw_bs_prog_data_const((bin)->prog_data); \
   assert(prog_data->simd_size == 16); \
   \
   (struct GENX(BINDLESS_SHADER_RECORD)) { \
      .OffsetToLocalArguments = (local_arg_offset) / 8, \
      .BindlessShaderDispatchMode = RT_SIMD16, \
      .KernelStartPointer = (bin)->kernel.offset, \
      .RegistersPerThread = ptl_register_blocks(prog_data->base.grf_used), \
   }; \
})
#else
#define anv_shader_bin_get_bsr(bin, local_arg_offset) ({ \
assert((local_arg_offset) % 8 == 0); \
const struct brw_bs_prog_data *prog_data = \
@ -289,6 +304,7 @@ genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline);
.KernelStartPointer = bin->kernel.offset, \
}; \
})
#endif
void
genX(batch_set_preemption)(struct anv_batch *batch,