brw/nir: rework inline_data_intel to work with compute

This intrinsic was initially dedicated to mesh/task shaders, but the
mechanism it exposes also exists in the compute shaders on Gfx12.5+.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31508>
This commit is contained in:
Lionel Landwerlin 2024-09-30 08:45:21 +03:00 committed by Marge Bot
parent 1dc125338e
commit 97b17aa0b1
12 changed files with 103 additions and 50 deletions

View file

@ -230,7 +230,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_reloc_const_intel: case nir_intrinsic_load_reloc_const_intel:
case nir_intrinsic_load_btd_global_arg_addr_intel: case nir_intrinsic_load_btd_global_arg_addr_intel:
case nir_intrinsic_load_btd_local_arg_addr_intel: case nir_intrinsic_load_btd_local_arg_addr_intel:
case nir_intrinsic_load_mesh_inline_data_intel: case nir_intrinsic_load_inline_data_intel:
case nir_intrinsic_load_ray_num_dss_rt_stacks_intel: case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
case nir_intrinsic_load_lshs_vertex_stride_amd: case nir_intrinsic_load_lshs_vertex_stride_amd:
case nir_intrinsic_load_esgs_vertex_stride_amd: case nir_intrinsic_load_esgs_vertex_stride_amd:

View file

@ -2216,8 +2216,10 @@ load("ssbo_uniform_block_intel", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET], [CA
# src[] = { offset }. # src[] = { offset }.
load("shared_uniform_block_intel", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE]) load("shared_uniform_block_intel", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
# Intrinsics for Intel mesh shading # Inline register delivery (available on Gfx12.5+ for CS/Mesh/Task stages)
system_value("mesh_inline_data_intel", 1, [ALIGN_OFFSET], bit_sizes=[32, 64]) intrinsic("load_inline_data_intel", [], dest_comp=0,
indices=[BASE],
flags=[CAN_ELIMINATE, CAN_REORDER])
# Intrinsics for Intel bindless thread dispatch # Intrinsics for Intel bindless thread dispatch
# BASE=brw_topoloy_id # BASE=brw_topoloy_id

View file

@ -100,7 +100,7 @@ const unsigned *
brw_compile_cs(const struct brw_compiler *compiler, brw_compile_cs(const struct brw_compiler *compiler,
struct brw_compile_cs_params *params) struct brw_compile_cs_params *params)
{ {
const nir_shader *nir = params->base.nir; struct nir_shader *nir = params->base.nir;
const struct brw_cs_prog_key *key = params->key; const struct brw_cs_prog_key *key = params->key;
struct brw_cs_prog_data *prog_data = params->prog_data; struct brw_cs_prog_data *prog_data = params->prog_data;
@ -112,6 +112,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
prog_data->base.total_shared = nir->info.shared_size; prog_data->base.total_shared = nir->info.shared_size;
prog_data->base.ray_queries = nir->info.ray_queries; prog_data->base.ray_queries = nir->info.ray_queries;
prog_data->base.total_scratch = 0; prog_data->base.total_scratch = 0;
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir);
assert(compiler->devinfo->verx10 >= 125 || !prog_data->uses_inline_data);
if (!nir->info.workgroup_size_variable) { if (!nir->info.workgroup_size_variable) {
prog_data->local_size[0] = nir->info.workgroup_size[0]; prog_data->local_size[0] = nir->info.workgroup_size[0];
@ -220,4 +222,3 @@ brw_compile_cs(const struct brw_compiler *compiler,
return g.get_assembly(); return g.get_assembly();
} }

View file

@ -47,33 +47,40 @@ brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
static nir_def * static nir_def *
brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr, brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
UNUSED void *data) void *data)
{ {
const struct intel_device_info *devinfo =
(const struct intel_device_info *)data;
assert(instr->type == nir_instr_type_intrinsic); assert(instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
assert(intrin->intrinsic == nir_intrinsic_load_uniform); assert(intrin->intrinsic == nir_intrinsic_load_uniform);
/* Read the first few 32-bit scalars from InlineData. */ /* Use the first few bytes of InlineData as push constants. */
if (nir_src_is_const(intrin->src[0]) && if (nir_src_is_const(intrin->src[0])) {
intrin->def.bit_size == 32 && int offset =
intrin->def.num_components == 1) { BRW_TASK_MESH_PUSH_CONSTANTS_START_DW * 4 +
unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]); nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
unsigned off_dw = off / 4; int range = intrin->def.num_components * intrin->def.bit_size / 8;
if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) { if ((offset + range) <= (int)(REG_SIZE * reg_unit(devinfo))) {
off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW; return nir_load_inline_data_intel(b,
return nir_load_mesh_inline_data_intel(b, 32, off_dw); intrin->def.num_components,
intrin->def.bit_size,
.base = offset);
} }
} }
return brw_nir_load_global_const(b, intrin, return brw_nir_load_global_const(b, intrin,
nir_load_mesh_inline_data_intel(b, 64, 0), 0); nir_load_inline_data_intel(b, 1, 64, 0), 0);
} }
static bool static bool
brw_nir_lower_load_uniforms(nir_shader *nir) brw_nir_lower_load_uniforms(nir_shader *nir,
const struct intel_device_info *devinfo)
{ {
return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter, return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
brw_nir_lower_load_uniforms_impl, NULL); brw_nir_lower_load_uniforms_impl,
(void *)devinfo);
} }
static inline int static inline int
@ -355,6 +362,9 @@ brw_compile_task(const struct brw_compiler *compiler,
prog_data->uses_drawid = prog_data->uses_drawid =
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID); BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
brw_simd_selection_state simd_state{ brw_simd_selection_state simd_state{
.devinfo = compiler->devinfo, .devinfo = compiler->devinfo,
.prog_data = &prog_data->base, .prog_data = &prog_data->base,
@ -372,7 +382,6 @@ brw_compile_task(const struct brw_compiler *compiler,
nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir); nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
brw_nir_apply_key(shader, compiler, &key->base, dispatch_width); brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
brw_postprocess_nir(shader, compiler, debug_enabled, brw_postprocess_nir(shader, compiler, debug_enabled,
@ -1633,6 +1642,9 @@ brw_compile_mesh(const struct brw_compiler *compiler,
prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map); prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);
NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
brw_simd_selection_state simd_state{ brw_simd_selection_state simd_state{
.devinfo = compiler->devinfo, .devinfo = compiler->devinfo,
.prog_data = &prog_data->base, .prog_data = &prog_data->base,
@ -1661,7 +1673,6 @@ brw_compile_mesh(const struct brw_compiler *compiler,
NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map); NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
/* Load uniforms can do a better job for constants, so fold before it. */ /* Load uniforms can do a better job for constants, so fold before it. */
NIR_PASS(_, shader, nir_opt_constant_folding); NIR_PASS(_, shader, nir_opt_constant_folding);
NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

View file

@ -162,6 +162,14 @@ brw_shader_stage_requires_bindless_resources(gl_shader_stage stage)
return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage); return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage);
} }
/* Returns true when the given shader stage can receive inline register
 * data with its dispatch: Mesh/Task always have it, Compute only gains
 * it with COMPUTE_WALKER on Gfx12.5+ (verx10 >= 125).
 */
static inline bool
brw_shader_stage_has_inline_data(const struct intel_device_info *devinfo,
                                 gl_shader_stage stage)
{
   switch (stage) {
   case MESA_SHADER_MESH:
   case MESA_SHADER_TASK:
      return true;
   case MESA_SHADER_COMPUTE:
      return devinfo->verx10 >= 125;
   default:
      return false;
   }
}
/** /**
* Program key structures. * Program key structures.
* *

View file

@ -935,15 +935,7 @@ fs_visitor::assign_curb_setup()
uint64_t used = 0; uint64_t used = 0;
bool is_compute = gl_shader_stage_is_compute(stage); bool is_compute = gl_shader_stage_is_compute(stage);
if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) { if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
/* With COMPUTE_WALKER, we can push up to one register worth of data via
* the inline data parameter in the COMPUTE_WALKER command itself.
*
* TODO: Support inline data and push at the same time.
*/
assert(devinfo->verx10 >= 125);
assert(uniform_push_length <= reg_unit(devinfo));
} else if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
assert(devinfo->has_lsc); assert(devinfo->has_lsc);
fs_builder ubld = fs_builder(this, 1).exec_all().at( fs_builder ubld = fs_builder(this, 1).exec_all().at(
cfg->first_block(), cfg->first_block()->start()); cfg->first_block(), cfg->first_block()->start());

View file

@ -221,6 +221,8 @@ struct cs_thread_payload : public thread_payload {
brw_reg local_invocation_id[3]; brw_reg local_invocation_id[3];
brw_reg inline_parameter;
protected: protected:
brw_reg subgroup_id_; brw_reg subgroup_id_;
}; };
@ -230,7 +232,6 @@ struct task_mesh_thread_payload : public cs_thread_payload {
brw_reg extended_parameter_0; brw_reg extended_parameter_0;
brw_reg local_index; brw_reg local_index;
brw_reg inline_parameter;
brw_reg urb_output; brw_reg urb_output;

View file

@ -4522,6 +4522,19 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
} }
break; break;
case nir_intrinsic_load_inline_data_intel: {
const cs_thread_payload &payload = s.cs_payload();
unsigned inline_stride = brw_type_size_bytes(dest.type);
for (unsigned c = 0; c < instr->def.num_components; c++)
bld.MOV(offset(dest, bld, c),
retype(
byte_offset(payload.inline_parameter,
nir_intrinsic_base(instr) +
c * inline_stride),
dest.type));
break;
}
case nir_intrinsic_load_subgroup_id: case nir_intrinsic_load_subgroup_id:
s.cs_payload().load_subgroup_id(bld, dest); s.cs_payload().load_subgroup_id(bld, dest);
break; break;
@ -4884,20 +4897,21 @@ try_rebuild_source(nir_to_brw_state &ntb, const brw::fs_builder &bld,
break; break;
} }
case nir_intrinsic_load_mesh_inline_data_intel: { case nir_intrinsic_load_inline_data_intel: {
assert(ntb.s.stage == MESA_SHADER_MESH || assert(brw_shader_stage_has_inline_data(ntb.devinfo, ntb.s.stage));
ntb.s.stage == MESA_SHADER_TASK); const cs_thread_payload &payload = ntb.s.cs_payload();
const task_mesh_thread_payload &payload = ntb.s.task_mesh_payload();
enum brw_reg_type type = enum brw_reg_type type =
brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size); brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size);
brw_reg dst_data = ubld.vgrf(type, intrin->def.num_components); brw_reg dst_data = ubld.vgrf(type, intrin->def.num_components);
unsigned inline_stride = brw_type_size_bytes(type);
for (unsigned c = 0; c < intrin->def.num_components; c++) { for (unsigned c = 0; c < intrin->def.num_components; c++) {
brw_reg src = retype( fs_inst *inst = ubld.MOV(byte_offset(dst_data, c * grf_size),
offset(payload.inline_parameter, 1, retype(
nir_intrinsic_align_offset(intrin) + c * intrin->def.bit_size / 8), byte_offset(payload.inline_parameter,
brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size)); nir_intrinsic_base(intrin) +
fs_inst *inst = ubld.MOV(byte_offset(dst_data, c * grf_size), src); c * inline_stride),
type));
if (c == 0) if (c == 0)
ntb.resource_insts[def->index] = inst; ntb.resource_insts[def->index] = inst;
} }
@ -5800,12 +5814,6 @@ fs_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld,
dest = get_nir_def(ntb, instr->def); dest = get_nir_def(ntb, instr->def);
switch (instr->intrinsic) { switch (instr->intrinsic) {
case nir_intrinsic_load_mesh_inline_data_intel: {
brw_reg data = offset(payload.inline_parameter, 1, nir_intrinsic_align_offset(instr));
bld.MOV(dest, retype(data, dest.type));
break;
}
case nir_intrinsic_load_draw_id: case nir_intrinsic_load_draw_id:
dest = retype(dest, BRW_TYPE_UD); dest = retype(dest, BRW_TYPE_UD);
bld.MOV(dest, payload.extended_parameter_0); bld.MOV(dest, payload.extended_parameter_0);

View file

@ -379,6 +379,11 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
/* TODO: Fill out uses_btd_stack_ids automatically */ /* TODO: Fill out uses_btd_stack_ids automatically */
if (prog_data->uses_btd_stack_ids) if (prog_data->uses_btd_stack_ids)
r += reg_unit(v.devinfo); r += reg_unit(v.devinfo);
if (v.stage == MESA_SHADER_COMPUTE && prog_data->uses_inline_data) {
inline_parameter = brw_ud1_grf(r, 0);
r += reg_unit(v.devinfo);
}
} }
num_regs = r; num_regs = r;
@ -458,8 +463,11 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
if (v.devinfo->ver < 20 && v.dispatch_width == 32) if (v.devinfo->ver < 20 && v.dispatch_width == 32)
r += reg_unit(v.devinfo); r += reg_unit(v.devinfo);
inline_parameter = brw_ud1_grf(r, 0); struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
r += reg_unit(v.devinfo); if (prog_data->uses_inline_data) {
inline_parameter = brw_ud1_grf(r, 0);
r += reg_unit(v.devinfo);
}
num_regs = r; num_regs = r;
} }

View file

@ -2265,3 +2265,24 @@ brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
return type; return type;
} }
/* Scan every instruction of the shader and report whether any of them is
 * a load_inline_data_intel intrinsic.  Used by the compilers to decide
 * whether an inline-data register must be reserved in the thread payload.
 */
bool
brw_nir_uses_inline_data(nir_shader *shader)
{
   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            /* Only intrinsic instructions can be the one we look for. */
            if (instr->type == nir_instr_type_intrinsic &&
                nir_instr_as_intrinsic(instr)->intrinsic ==
                nir_intrinsic_load_inline_data_intel)
               return true;
         }
      }
   }

   return false;
}

View file

@ -290,6 +290,8 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
return indirect_mask; return indirect_mask;
} }
bool brw_nir_uses_inline_data(nir_shader *shader);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View file

@ -399,9 +399,8 @@ static nir_def *
build_load_uniform(nir_builder *b, unsigned offset, build_load_uniform(nir_builder *b, unsigned offset,
unsigned num_components, unsigned bit_size) unsigned num_components, unsigned bit_size)
{ {
return nir_load_uniform(b, num_components, bit_size, nir_imm_int(b, 0), return nir_load_inline_data_intel(b, num_components, bit_size,
.base = offset, .base = offset);
.range = num_components * bit_size / 8);
} }
#define load_trampoline_param(b, name, num_components, bit_size) \ #define load_trampoline_param(b, name, num_components, bit_size) \