diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index 3a170597ff9..15171c1f92b 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -230,7 +230,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_load_reloc_const_intel:
    case nir_intrinsic_load_btd_global_arg_addr_intel:
    case nir_intrinsic_load_btd_local_arg_addr_intel:
-   case nir_intrinsic_load_mesh_inline_data_intel:
+   case nir_intrinsic_load_inline_data_intel:
    case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
    case nir_intrinsic_load_lshs_vertex_stride_amd:
    case nir_intrinsic_load_esgs_vertex_stride_amd:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index de24a64927d..31af10c320b 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -2216,8 +2216,10 @@ load("ssbo_uniform_block_intel", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET], [CA
 # src[] = { offset }.
 load("shared_uniform_block_intel", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 
-# Intrinsics for Intel mesh shading
-system_value("mesh_inline_data_intel", 1, [ALIGN_OFFSET], bit_sizes=[32, 64])
+# Inline register delivery (available on Gfx12.5+ for CS/Mesh/Task stages)
+intrinsic("load_inline_data_intel", [], dest_comp=0,
+          indices=[BASE],
+          flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # Intrinsics for Intel bindless thread dispatch
 # BASE=brw_topoloy_id
diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp
index 9f7a701733c..a7ba8cd2e4e 100644
--- a/src/intel/compiler/brw_compile_cs.cpp
+++ b/src/intel/compiler/brw_compile_cs.cpp
@@ -100,7 +100,7 @@ const unsigned *
 brw_compile_cs(const struct brw_compiler *compiler,
                struct brw_compile_cs_params *params)
 {
-   const nir_shader *nir = params->base.nir;
+   struct nir_shader *nir = params->base.nir;
    const struct brw_cs_prog_key *key = params->key;
    struct brw_cs_prog_data *prog_data = params->prog_data;
 
@@ -112,6 +112,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
    prog_data->base.total_shared = nir->info.shared_size;
    prog_data->base.ray_queries = nir->info.ray_queries;
    prog_data->base.total_scratch = 0;
+   prog_data->uses_inline_data = brw_nir_uses_inline_data(nir);
+   assert(compiler->devinfo->verx10 >= 125 || !prog_data->uses_inline_data);
 
    if (!nir->info.workgroup_size_variable) {
       prog_data->local_size[0] = nir->info.workgroup_size[0];
@@ -220,4 +222,3 @@ brw_compile_cs(const struct brw_compiler *compiler,
 
    return g.get_assembly();
 }
-
diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp
index bc97b97c9c8..d25eebef996 100644
--- a/src/intel/compiler/brw_compile_mesh.cpp
+++ b/src/intel/compiler/brw_compile_mesh.cpp
@@ -47,33 +47,40 @@ brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
 static nir_def *
 brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
-                                 UNUSED void *data)
+                                 void *data)
 {
+   const struct intel_device_info *devinfo =
+      (const struct intel_device_info *)data;
+
    assert(instr->type == nir_instr_type_intrinsic);
    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
    assert(intrin->intrinsic == nir_intrinsic_load_uniform);
 
-   /* Read the first few 32-bit scalars from InlineData. */
-   if (nir_src_is_const(intrin->src[0]) &&
-       intrin->def.bit_size == 32 &&
-       intrin->def.num_components == 1) {
-      unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
-      unsigned off_dw = off / 4;
-      if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
-         off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
-         return nir_load_mesh_inline_data_intel(b, 32, off_dw);
+   /* Use the first few bytes of InlineData as push constants. */
+   if (nir_src_is_const(intrin->src[0])) {
+      int offset =
+         BRW_TASK_MESH_PUSH_CONSTANTS_START_DW * 4 +
+         nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
+      int range = intrin->def.num_components * intrin->def.bit_size / 8;
+      if ((offset + range) <= (int)(REG_SIZE * reg_unit(devinfo))) {
+         return nir_load_inline_data_intel(b,
+                                           intrin->def.num_components,
+                                           intrin->def.bit_size,
+                                           .base = offset);
       }
    }
 
    return brw_nir_load_global_const(b, intrin,
-             nir_load_mesh_inline_data_intel(b, 64, 0), 0);
+             nir_load_inline_data_intel(b, 1, 64, 0), 0);
 }
 
 static bool
-brw_nir_lower_load_uniforms(nir_shader *nir)
+brw_nir_lower_load_uniforms(nir_shader *nir,
+                            const struct intel_device_info *devinfo)
 {
    return nir_shader_lower_instructions(nir,
                                         brw_nir_lower_load_uniforms_filter,
-                                        brw_nir_lower_load_uniforms_impl, NULL);
+                                        brw_nir_lower_load_uniforms_impl,
+                                        (void *)devinfo);
 }
 
 static inline int
@@ -355,6 +362,9 @@ brw_compile_task(const struct brw_compiler *compiler,
    prog_data->uses_drawid =
       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
 
+   NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
+   prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
+
    brw_simd_selection_state simd_state{
       .devinfo = compiler->devinfo,
       .prog_data = &prog_data->base,
@@ -372,7 +382,6 @@ brw_compile_task(const struct brw_compiler *compiler,
       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
       brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
 
-      NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
 
       brw_postprocess_nir(shader, compiler, debug_enabled,
@@ -1633,6 +1642,9 @@ brw_compile_mesh(const struct brw_compiler *compiler,
    prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir,
                                                            &prog_data->map);
 
+   NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
+   prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
+
    brw_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = &prog_data->base,
@@ -1661,7 +1673,6 @@ brw_compile_mesh(const struct brw_compiler *compiler,
      NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
      /* Load uniforms can do a better job for constants, so fold before it. */
      NIR_PASS(_, shader, nir_opt_constant_folding);
-     NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
 
      NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
 
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 462abb417c0..8183d720231 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -162,6 +162,14 @@ brw_shader_stage_requires_bindless_resources(gl_shader_stage stage)
    return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage);
 }
 
+static inline bool
+brw_shader_stage_has_inline_data(const struct intel_device_info *devinfo,
+                                 gl_shader_stage stage)
+{
+   return stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK ||
+          (stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125);
+}
+
 /**
  * Program key structures.
  *
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 50d459551d2..7cf0331927f 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -935,15 +935,7 @@ fs_visitor::assign_curb_setup()
    uint64_t used = 0;
    bool is_compute = gl_shader_stage_is_compute(stage);
 
-   if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
-      /* With COMPUTE_WALKER, we can push up to one register worth of data via
-       * the inline data parameter in the COMPUTE_WALKER command itself.
-       *
-       * TODO: Support inline data and push at the same time.
-       */
-      assert(devinfo->verx10 >= 125);
-      assert(uniform_push_length <= reg_unit(devinfo));
-   } else if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
+   if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
       assert(devinfo->has_lsc);
       fs_builder ubld = fs_builder(this, 1).exec_all().at(
          cfg->first_block(), cfg->first_block()->start());
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 98ab4136262..441f6c96733 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -221,6 +221,8 @@ struct cs_thread_payload : public thread_payload {
 
    brw_reg local_invocation_id[3];
 
+   brw_reg inline_parameter;
+
 protected:
    brw_reg subgroup_id_;
 };
@@ -230,7 +232,6 @@ struct task_mesh_thread_payload : public cs_thread_payload {
 
    brw_reg extended_parameter_0;
    brw_reg local_index;
-   brw_reg inline_parameter;
 
    brw_reg urb_output;
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 50abe318614..858f9383644 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4522,6 +4522,19 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
       }
       break;
 
+   case nir_intrinsic_load_inline_data_intel: {
+      const cs_thread_payload &payload = s.cs_payload();
+      unsigned inline_stride = brw_type_size_bytes(dest.type);
+      for (unsigned c = 0; c < instr->def.num_components; c++)
+         bld.MOV(offset(dest, bld, c),
+                 retype(
+                    byte_offset(payload.inline_parameter,
+                                nir_intrinsic_base(instr) +
+                                c * inline_stride),
+                    dest.type));
+      break;
+   }
+
    case nir_intrinsic_load_subgroup_id:
       s.cs_payload().load_subgroup_id(bld, dest);
       break;
@@ -4884,20 +4897,21 @@ try_rebuild_source(nir_to_brw_state &ntb, const brw::fs_builder &bld,
          break;
       }
 
-      case nir_intrinsic_load_mesh_inline_data_intel: {
-         assert(ntb.s.stage == MESA_SHADER_MESH ||
-                ntb.s.stage == MESA_SHADER_TASK);
-         const task_mesh_thread_payload &payload = ntb.s.task_mesh_payload();
+      case nir_intrinsic_load_inline_data_intel: {
+         assert(brw_shader_stage_has_inline_data(ntb.devinfo, ntb.s.stage));
+         const cs_thread_payload &payload = ntb.s.cs_payload();
 
          enum brw_reg_type type =
            brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size);
         brw_reg dst_data = ubld.vgrf(type, intrin->def.num_components);
+         unsigned inline_stride = brw_type_size_bytes(type);
         for (unsigned c = 0; c < intrin->def.num_components; c++) {
-            brw_reg src = retype(
-               offset(payload.inline_parameter, 1,
-                      nir_intrinsic_align_offset(intrin) + c * intrin->def.bit_size / 8),
-               brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size));
-            fs_inst *inst = ubld.MOV(byte_offset(dst_data, c * grf_size), src);
+            fs_inst *inst = ubld.MOV(byte_offset(dst_data, c * grf_size),
+                                     retype(
+                                        byte_offset(payload.inline_parameter,
+                                                    nir_intrinsic_base(intrin) +
+                                                    c * inline_stride),
+                                        type));
             if (c == 0)
                ntb.resource_insts[def->index] = inst;
          }
@@ -5800,12 +5814,6 @@ fs_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld,
       dest = get_nir_def(ntb, instr->def);
 
    switch (instr->intrinsic) {
-   case nir_intrinsic_load_mesh_inline_data_intel: {
-      brw_reg data = offset(payload.inline_parameter, 1, nir_intrinsic_align_offset(instr));
-      bld.MOV(dest, retype(data, dest.type));
-      break;
-   }
-
    case nir_intrinsic_load_draw_id:
       dest = retype(dest, BRW_TYPE_UD);
       bld.MOV(dest, payload.extended_parameter_0);
diff --git a/src/intel/compiler/brw_fs_thread_payload.cpp b/src/intel/compiler/brw_fs_thread_payload.cpp
index 5a6bca04965..9dd191b27a7 100644
--- a/src/intel/compiler/brw_fs_thread_payload.cpp
+++ b/src/intel/compiler/brw_fs_thread_payload.cpp
@@ -379,6 +379,11 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
       /* TODO: Fill out uses_btd_stack_ids automatically */
       if (prog_data->uses_btd_stack_ids)
          r += reg_unit(v.devinfo);
+
+      if (v.stage == MESA_SHADER_COMPUTE && prog_data->uses_inline_data) {
+         inline_parameter = brw_ud1_grf(r, 0);
+         r += reg_unit(v.devinfo);
+      }
    }
 
    num_regs = r;
@@ -458,8 +463,11 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
    if (v.devinfo->ver < 20 && v.dispatch_width == 32)
       r += reg_unit(v.devinfo);
 
-   inline_parameter = brw_ud1_grf(r, 0);
-   r += reg_unit(v.devinfo);
+   struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
+   if (prog_data->uses_inline_data) {
+      inline_parameter = brw_ud1_grf(r, 0);
+      r += reg_unit(v.devinfo);
+   }
 
    num_regs = r;
 }
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 4de65b347e6..0f55509aa0e 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -2265,3 +2265,24 @@ brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
 
    return type;
 }
+
+bool
+brw_nir_uses_inline_data(nir_shader *shader)
+{
+   nir_foreach_function_impl(impl, shader) {
+      nir_foreach_block(block, impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_inline_data_intel)
+               continue;
+
+            return true;
+         }
+      }
+   }
+
+   return false;
+}
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index b40345daa76..ba12fe7e25b 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -290,6 +290,8 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
    return indirect_mask;
 }
 
+bool brw_nir_uses_inline_data(nir_shader *shader);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/intel/compiler/brw_nir_rt.c b/src/intel/compiler/brw_nir_rt.c
index 5d2a89ce0d1..e01e5ab0415 100644
--- a/src/intel/compiler/brw_nir_rt.c
+++ b/src/intel/compiler/brw_nir_rt.c
@@ -399,9 +399,8 @@ static nir_def *
 build_load_uniform(nir_builder *b, unsigned offset, unsigned num_components,
                    unsigned bit_size)
 {
-   return nir_load_uniform(b, num_components, bit_size, nir_imm_int(b, 0),
-                           .base = offset,
-                           .range = num_components * bit_size / 8);
+   return nir_load_inline_data_intel(b, num_components, bit_size,
+                                     .base = offset);
 }
 
 #define load_trampoline_param(b, name, num_components, bit_size) \
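---
Reviewer note (not part of the patch): the sketch below shows how a lowering
pass is expected to emit the new intrinsic. It is illustrative only — the
helper name emit_inline_data_load is hypothetical — but the builder call, the
BASE index, and the one-GRF bound (REG_SIZE * reg_unit(devinfo)) all follow
the brw_nir_lower_load_uniforms_impl change above.

/* Hypothetical helper: rewrite a constant-offset load_uniform into
 * load_inline_data_intel, mirroring brw_nir_lower_load_uniforms_impl.
 * Returns NULL when the load does not fit in one GRF of inline data,
 * in which case the caller must fall back to a memory load.
 */
static nir_def *
emit_inline_data_load(nir_builder *b, nir_intrinsic_instr *intrin,
                      const struct intel_device_info *devinfo)
{
   if (!nir_src_is_const(intrin->src[0]))
      return NULL;

   const unsigned offset = nir_intrinsic_base(intrin) +
                           nir_src_as_uint(intrin->src[0]);
   const unsigned range =
      intrin->def.num_components * intrin->def.bit_size / 8;

   /* Inline data delivers at most one register's worth of bytes. */
   if (offset + range > REG_SIZE * reg_unit(devinfo))
      return NULL;

   return nir_load_inline_data_intel(b, intrin->def.num_components,
                                     intrin->def.bit_size, .base = offset);
}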