brw/nir: rework inline_data_intel to work with compute

This intrinsic was initially dedicated to mesh/task shaders, but the
mechanism it exposes also exists for compute shaders on Gfx12.5+.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31508>
Lionel Landwerlin 2024-09-30 08:45:21 +03:00 committed by Marge Bot
parent 1dc125338e
commit 97b17aa0b1
12 changed files with 103 additions and 50 deletions

View file

@@ -230,7 +230,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_load_reloc_const_intel:
    case nir_intrinsic_load_btd_global_arg_addr_intel:
    case nir_intrinsic_load_btd_local_arg_addr_intel:
-   case nir_intrinsic_load_mesh_inline_data_intel:
+   case nir_intrinsic_load_inline_data_intel:
    case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
    case nir_intrinsic_load_lshs_vertex_stride_amd:
    case nir_intrinsic_load_esgs_vertex_stride_amd:

View file

@@ -2216,8 +2216,10 @@ load("ssbo_uniform_block_intel", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 # src[] = { offset }.
 load("shared_uniform_block_intel", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])

-# Intrinsics for Intel mesh shading
-system_value("mesh_inline_data_intel", 1, [ALIGN_OFFSET], bit_sizes=[32, 64])
+# Inline register delivery (available on Gfx12.5+ for CS/Mesh/Task stages)
+intrinsic("load_inline_data_intel", [], dest_comp=0,
+          indices=[BASE],
+          flags=[CAN_ELIMINATE, CAN_REORDER])

 # Intrinsics for Intel bindless thread dispatch
 # BASE=brw_topoloy_id
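
The reworked definition replaces the dedicated system value with a general
intrinsic: the destination size is unconstrained (dest_comp=0) and BASE is a
byte offset into the inline data. A minimal sketch of how a lowering pass
emits it through nir_builder (the offset and sizes here are illustrative,
not taken from the patch):

   /* Illustrative: read two 32-bit dwords starting at byte 8 of the
    * inline data block delivered with the dispatch. */
   nir_def *params = nir_load_inline_data_intel(b, 2, 32, .base = 8);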

View file

@@ -100,7 +100,7 @@ const unsigned *
 brw_compile_cs(const struct brw_compiler *compiler,
                struct brw_compile_cs_params *params)
 {
-   const nir_shader *nir = params->base.nir;
+   struct nir_shader *nir = params->base.nir;
    const struct brw_cs_prog_key *key = params->key;
    struct brw_cs_prog_data *prog_data = params->prog_data;
@@ -112,6 +112,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
    prog_data->base.total_shared = nir->info.shared_size;
    prog_data->base.ray_queries = nir->info.ray_queries;
    prog_data->base.total_scratch = 0;
+   prog_data->uses_inline_data = brw_nir_uses_inline_data(nir);
+   assert(compiler->devinfo->verx10 >= 125 || !prog_data->uses_inline_data);

    if (!nir->info.workgroup_size_variable) {
       prog_data->local_size[0] = nir->info.workgroup_size[0];
@@ -220,4 +222,3 @@ brw_compile_cs(const struct brw_compiler *compiler,

    return g.get_assembly();
 }

View file

@@ -47,33 +47,40 @@ brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
 static nir_def *
 brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
-                                 UNUSED void *data)
+                                 void *data)
 {
+   const struct intel_device_info *devinfo =
+      (const struct intel_device_info *)data;
+
    assert(instr->type == nir_instr_type_intrinsic);
    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
    assert(intrin->intrinsic == nir_intrinsic_load_uniform);

-   /* Read the first few 32-bit scalars from InlineData. */
-   if (nir_src_is_const(intrin->src[0]) &&
-       intrin->def.bit_size == 32 &&
-       intrin->def.num_components == 1) {
-      unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
-      unsigned off_dw = off / 4;
-      if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
-         off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
-         return nir_load_mesh_inline_data_intel(b, 32, off_dw);
+   /* Use the first few bytes of InlineData as push constants. */
+   if (nir_src_is_const(intrin->src[0])) {
+      int offset =
+         BRW_TASK_MESH_PUSH_CONSTANTS_START_DW * 4 +
+         nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
+      int range = intrin->def.num_components * intrin->def.bit_size / 8;
+      if ((offset + range) <= (int)(REG_SIZE * reg_unit(devinfo))) {
+         return nir_load_inline_data_intel(b,
+                                           intrin->def.num_components,
+                                           intrin->def.bit_size,
+                                           .base = offset);
       }
    }

    return brw_nir_load_global_const(b, intrin,
-                                    nir_load_mesh_inline_data_intel(b, 64, 0), 0);
+                                    nir_load_inline_data_intel(b, 1, 64, 0), 0);
 }

 static bool
-brw_nir_lower_load_uniforms(nir_shader *nir)
+brw_nir_lower_load_uniforms(nir_shader *nir,
+                            const struct intel_device_info *devinfo)
 {
    return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
-                                        brw_nir_lower_load_uniforms_impl, NULL);
+                                        brw_nir_lower_load_uniforms_impl,
+                                        (void *)devinfo);
 }

 static inline int
@@ -355,6 +362,9 @@ brw_compile_task(const struct brw_compiler *compiler,
    prog_data->uses_drawid =
       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

+   NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
+   prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
+
    brw_simd_selection_state simd_state{
       .devinfo = compiler->devinfo,
       .prog_data = &prog_data->base,
@@ -372,7 +382,6 @@ brw_compile_task(const struct brw_compiler *compiler,
       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
       brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
-      NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
       brw_postprocess_nir(shader, compiler, debug_enabled,
@@ -1633,6 +1642,9 @@ brw_compile_mesh(const struct brw_compiler *compiler,
    prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);

+   NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
+   prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
+
    brw_simd_selection_state simd_state{
       .devinfo = compiler->devinfo,
       .prog_data = &prog_data->base,
@@ -1661,7 +1673,6 @@ brw_compile_mesh(const struct brw_compiler *compiler,
       NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
       /* Load uniforms can do a better job for constants, so fold before it. */
       NIR_PASS(_, shader, nir_opt_constant_folding);
-      NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

View file

@@ -162,6 +162,14 @@ brw_shader_stage_requires_bindless_resources(gl_shader_stage stage)
    return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage);
 }

+static inline bool
+brw_shader_stage_has_inline_data(const struct intel_device_info *devinfo,
+                                 gl_shader_stage stage)
+{
+   return stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK ||
+          (stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125);
+}
+
 /**
  * Program key structures.
  *
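
A hedged usage sketch of the new helper (this call site and the guarded pass
name are hypothetical, for illustration only): callers can check the
stage/hardware combination before taking any inline-data path, since only
Mesh/Task and, on Gfx12.5+ (verx10 >= 125), Compute receive inline data:

   /* Hypothetical call site: skip inline-data lowering when the stage
    * cannot receive inline data on this hardware. */
   if (brw_shader_stage_has_inline_data(compiler->devinfo, nir->info.stage))
      lower_push_constants_to_inline_data(nir);   /* illustrative pass */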

View file

@@ -935,15 +935,7 @@ fs_visitor::assign_curb_setup()
    uint64_t used = 0;
    bool is_compute = gl_shader_stage_is_compute(stage);

-   if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
-      /* With COMPUTE_WALKER, we can push up to one register worth of data via
-       * the inline data parameter in the COMPUTE_WALKER command itself.
-       *
-       * TODO: Support inline data and push at the same time.
-       */
-      assert(devinfo->verx10 >= 125);
-      assert(uniform_push_length <= reg_unit(devinfo));
-   } else if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
+   if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
       assert(devinfo->has_lsc);
       fs_builder ubld = fs_builder(this, 1).exec_all().at(
          cfg->first_block(), cfg->first_block()->start());

View file

@@ -221,6 +221,8 @@ struct cs_thread_payload : public thread_payload {
    brw_reg local_invocation_id[3];

+   brw_reg inline_parameter;
+
 protected:
    brw_reg subgroup_id_;
 };
@@ -230,7 +232,6 @@ struct task_mesh_thread_payload : public cs_thread_payload {
    brw_reg extended_parameter_0;

    brw_reg local_index;
-   brw_reg inline_parameter;

    brw_reg urb_output;

View file

@@ -4522,6 +4522,19 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
       }
       break;

+   case nir_intrinsic_load_inline_data_intel: {
+      const cs_thread_payload &payload = s.cs_payload();
+      unsigned inline_stride = brw_type_size_bytes(dest.type);
+      for (unsigned c = 0; c < instr->def.num_components; c++)
+         bld.MOV(offset(dest, bld, c),
+                 retype(
+                    byte_offset(payload.inline_parameter,
+                                nir_intrinsic_base(instr) +
+                                c * inline_stride),
+                    dest.type));
+      break;
+   }
+
    case nir_intrinsic_load_subgroup_id:
       s.cs_payload().load_subgroup_id(bld, dest);
       break;
@@ -4884,20 +4897,21 @@ try_rebuild_source(nir_to_brw_state &ntb, const brw::fs_builder &bld,
       break;
    }

-   case nir_intrinsic_load_mesh_inline_data_intel: {
-      assert(ntb.s.stage == MESA_SHADER_MESH ||
-             ntb.s.stage == MESA_SHADER_TASK);
-      const task_mesh_thread_payload &payload = ntb.s.task_mesh_payload();
+   case nir_intrinsic_load_inline_data_intel: {
+      assert(brw_shader_stage_has_inline_data(ntb.devinfo, ntb.s.stage));
+      const cs_thread_payload &payload = ntb.s.cs_payload();
       enum brw_reg_type type =
          brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size);
       brw_reg dst_data = ubld.vgrf(type, intrin->def.num_components);
+      unsigned inline_stride = brw_type_size_bytes(type);
       for (unsigned c = 0; c < intrin->def.num_components; c++) {
-         brw_reg src = retype(
-            offset(payload.inline_parameter, 1,
-                   nir_intrinsic_align_offset(intrin) + c * intrin->def.bit_size / 8),
-            brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size));
-         fs_inst *inst = ubld.MOV(byte_offset(dst_data, c * grf_size), src);
+         fs_inst *inst = ubld.MOV(byte_offset(dst_data, c * grf_size),
+                                  retype(
+                                     byte_offset(payload.inline_parameter,
+                                                 nir_intrinsic_base(intrin) +
+                                                 c * inline_stride),
+                                     type));
          if (c == 0)
             ntb.resource_insts[def->index] = inst;
       }
@@ -5800,12 +5814,6 @@ fs_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld,
    dest = get_nir_def(ntb, instr->def);

    switch (instr->intrinsic) {
-   case nir_intrinsic_load_mesh_inline_data_intel: {
-      brw_reg data = offset(payload.inline_parameter, 1, nir_intrinsic_align_offset(instr));
-      bld.MOV(dest, retype(data, dest.type));
-      break;
-   }
-
    case nir_intrinsic_load_draw_id:
       dest = retype(dest, BRW_TYPE_UD);
       bld.MOV(dest, payload.extended_parameter_0);
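
Both emission paths read component c of the destination from byte
BASE + c * type_size of the inline parameter register. A standalone sketch
of that addressing (illustrative helper, not part of the patch):

   /* Byte address of component c within the inline data, for a load with
    * byte offset `base` and `bit_size`-bit components; mirrors the
    * `nir_intrinsic_base(instr) + c * inline_stride` expression above. */
   static unsigned
   inline_data_component_byte(unsigned base, unsigned c, unsigned bit_size)
   {
      return base + c * (bit_size / 8);
   }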

View file

@@ -379,6 +379,11 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
       /* TODO: Fill out uses_btd_stack_ids automatically */
       if (prog_data->uses_btd_stack_ids)
          r += reg_unit(v.devinfo);
+
+      if (v.stage == MESA_SHADER_COMPUTE && prog_data->uses_inline_data) {
+         inline_parameter = brw_ud1_grf(r, 0);
+         r += reg_unit(v.devinfo);
+      }
    }

    num_regs = r;
@@ -458,8 +463,11 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
    if (v.devinfo->ver < 20 && v.dispatch_width == 32)
       r += reg_unit(v.devinfo);

-   inline_parameter = brw_ud1_grf(r, 0);
-   r += reg_unit(v.devinfo);
+   struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
+   if (prog_data->uses_inline_data) {
+      inline_parameter = brw_ud1_grf(r, 0);
+      r += reg_unit(v.devinfo);
+   }

    num_regs = r;
 }
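
With these changes the inline parameter GRF is only reserved when the shader
actually loads inline data, so task/mesh shaders that never read it no
longer pay for a payload register. A simplified sketch of the allocation
pattern both constructors now share (reg_unit() is the number of 32-byte
register units per logical GRF, more than one on hardware whose GRFs span
multiple units):

   /* Simplified from both constructors above: append one logical GRF
    * for the inline parameter only when the shader reads inline data. */
   if (prog_data->uses_inline_data) {
      inline_parameter = brw_ud1_grf(r, 0);   /* scalar UD view of GRF r */
      r += reg_unit(v.devinfo);               /* advance past that GRF */
   }
   num_regs = r;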

View file

@@ -2265,3 +2265,24 @@ brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
    return type;
 }
+
+bool
+brw_nir_uses_inline_data(nir_shader *shader)
+{
+   nir_foreach_function_impl(impl, shader) {
+      nir_foreach_block(block, impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_inline_data_intel)
+               continue;
+
+            return true;
+         }
+      }
+   }
+
+   return false;
+}

View file

@@ -290,6 +290,8 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
    return indirect_mask;
 }

+bool brw_nir_uses_inline_data(nir_shader *shader);
+
 #ifdef __cplusplus
 }
 #endif

View file

@@ -399,9 +399,8 @@ static nir_def *
 build_load_uniform(nir_builder *b, unsigned offset,
                    unsigned num_components, unsigned bit_size)
 {
-   return nir_load_uniform(b, num_components, bit_size, nir_imm_int(b, 0),
-                           .base = offset,
-                           .range = num_components * bit_size / 8);
+   return nir_load_inline_data_intel(b, num_components, bit_size,
+                                     .base = offset);
 }

 #define load_trampoline_param(b, name, num_components, bit_size) \