mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 20:20:18 +01:00
brw: enable A64 pulling of push constants
This will be useful for pulling constants in device bound shaders. A64 allows us to put the constants anywhere. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32895>
This commit is contained in:
parent
0808125914
commit
5c17299084
10 changed files with 111 additions and 29 deletions
|
|
@ -148,7 +148,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
|
|||
prog_data->base.total_shared = nir->info.shared_size;
|
||||
prog_data->base.ray_queries = nir->info.ray_queries;
|
||||
prog_data->base.total_scratch = 0;
|
||||
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir);
|
||||
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir) ||
|
||||
key->base.uses_inline_push_addr;
|
||||
assert(compiler->devinfo->verx10 >= 125 || !prog_data->uses_inline_data);
|
||||
|
||||
if (!nir->info.workgroup_size_variable) {
|
||||
|
|
|
|||
|
|
@ -386,7 +386,8 @@ brw_compile_task(const struct brw_compiler *compiler,
|
|||
prog_data->uses_drawid =
|
||||
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
|
||||
|
||||
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
|
||||
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir) ||
|
||||
key->base.uses_inline_push_addr;
|
||||
|
||||
brw_simd_selection_state simd_state{
|
||||
.devinfo = compiler->devinfo,
|
||||
|
|
@ -1696,7 +1697,8 @@ brw_compile_mesh(const struct brw_compiler *compiler,
|
|||
|
||||
prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);
|
||||
|
||||
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
|
||||
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir) ||
|
||||
key->base.uses_inline_push_addr;
|
||||
|
||||
brw_simd_selection_state simd_state{
|
||||
.devinfo = compiler->devinfo,
|
||||
|
|
|
|||
|
|
@ -208,7 +208,9 @@ struct brw_base_prog_key {
|
|||
|
||||
enum brw_robustness_flags robust_flags:2;
|
||||
|
||||
unsigned padding:22;
|
||||
bool uses_inline_push_addr:1;
|
||||
|
||||
unsigned padding:21;
|
||||
|
||||
/**
|
||||
* Apply workarounds for SIN and COS input range problems.
|
||||
|
|
@ -891,6 +893,10 @@ struct brw_cs_prog_data {
|
|||
bool uses_barrier;
|
||||
bool uses_num_work_groups;
|
||||
bool uses_inline_data;
|
||||
/** Whether inline push data is used to provide a 64bit pointer to push
|
||||
* constants
|
||||
*/
|
||||
bool uses_inline_push_addr;
|
||||
bool uses_btd_stack_ids;
|
||||
bool uses_systolic;
|
||||
uint8_t generate_local_id;
|
||||
|
|
@ -920,6 +926,11 @@ brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data,
|
|||
struct brw_bs_prog_data {
|
||||
struct brw_stage_prog_data base;
|
||||
|
||||
/** Whether inline push data is used to provide a 64bit pointer to push
|
||||
* constants
|
||||
*/
|
||||
bool uses_inline_push_addr;
|
||||
|
||||
/** SIMD size of the root shader */
|
||||
uint8_t simd_size;
|
||||
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@
|
|||
#include "brw_fs_live_variables.h"
|
||||
#include "brw_nir.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_rt.h"
|
||||
#include "brw_private.h"
|
||||
#include "intel_nir.h"
|
||||
#include "shader_enums.h"
|
||||
|
|
@ -205,20 +206,39 @@ fs_visitor::assign_curb_setup()
|
|||
prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
|
||||
|
||||
uint64_t used = 0;
|
||||
const bool shader_pulls_constants = devinfo->verx10 >= 125 &&
|
||||
(gl_shader_stage_is_compute(stage) || gl_shader_stage_is_mesh(stage));
|
||||
const bool pull_constants =
|
||||
devinfo->verx10 >= 125 &&
|
||||
(gl_shader_stage_is_compute(stage) ||
|
||||
gl_shader_stage_is_mesh(stage)) &&
|
||||
uniform_push_length;
|
||||
|
||||
if (shader_pulls_constants && uniform_push_length > 0) {
|
||||
if (pull_constants) {
|
||||
const bool pull_constants_a64 =
|
||||
(gl_shader_stage_is_rt(stage) &&
|
||||
brw_bs_prog_data(prog_data)->uses_inline_push_addr) ||
|
||||
((gl_shader_stage_is_compute(stage) ||
|
||||
gl_shader_stage_is_mesh(stage)) &&
|
||||
brw_cs_prog_data(prog_data)->uses_inline_push_addr);
|
||||
assert(devinfo->has_lsc);
|
||||
brw_builder ubld = brw_builder(this, 1).exec_all().at(
|
||||
cfg->first_block(), cfg->first_block()->start());
|
||||
|
||||
/* The base offset for our push data is passed in as R0.0[31:6]. We have
|
||||
* to mask off the bottom 6 bits.
|
||||
*/
|
||||
brw_reg base_addr =
|
||||
ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
|
||||
brw_imm_ud(INTEL_MASK(31, 6)));
|
||||
brw_reg base_addr;
|
||||
if (pull_constants_a64) {
|
||||
/* The address of the push constants is at offset 0 in the inline
|
||||
* parameter.
|
||||
*/
|
||||
base_addr =
|
||||
gl_shader_stage_is_rt(stage) ?
|
||||
retype(bs_payload().inline_parameter, BRW_TYPE_UQ) :
|
||||
retype(cs_payload().inline_parameter, BRW_TYPE_UQ);
|
||||
} else {
|
||||
/* The base offset for our push data is passed in as R0.0[31:6]. We
|
||||
* have to mask off the bottom 6 bits.
|
||||
*/
|
||||
base_addr = ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
|
||||
brw_imm_ud(INTEL_MASK(31, 6)));
|
||||
}
|
||||
|
||||
/* On Gfx12-HP we load constants at the start of the program using A32
|
||||
* stateless messages.
|
||||
|
|
@ -229,11 +249,31 @@ fs_visitor::assign_curb_setup()
|
|||
assert(num_regs > 0);
|
||||
num_regs = 1 << util_logbase2(num_regs);
|
||||
|
||||
/* This pass occurs after all of the optimization passes, so don't
|
||||
* emit an 'ADD addr, base_addr, 0' instruction.
|
||||
*/
|
||||
brw_reg addr = i == 0 ? base_addr :
|
||||
ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
|
||||
brw_reg addr;
|
||||
|
||||
if (i != 0) {
|
||||
if (pull_constants_a64) {
|
||||
/* We need to do the carry manually as when this pass is run,
|
||||
* we're not expecting any 64bit ALUs. Unfortunately all the
|
||||
* 64bit lowering is done in NIR.
|
||||
*/
|
||||
addr = ubld.vgrf(BRW_TYPE_UQ);
|
||||
brw_reg addr_ldw = subscript(addr, BRW_TYPE_UD, 0);
|
||||
brw_reg addr_udw = subscript(addr, BRW_TYPE_UD, 1);
|
||||
brw_reg base_addr_ldw = subscript(base_addr, BRW_TYPE_UD, 0);
|
||||
brw_reg base_addr_udw = subscript(base_addr, BRW_TYPE_UD, 1);
|
||||
ubld.ADD(addr_ldw, base_addr_ldw, brw_imm_ud(i * REG_SIZE));
|
||||
ubld.CMP(ubld.null_reg_d(), addr_ldw, base_addr_ldw, BRW_CONDITIONAL_L);
|
||||
set_predicate(BRW_PREDICATE_NORMAL,
|
||||
ubld.ADD(addr_udw, base_addr_udw, brw_imm_ud(1)));
|
||||
set_predicate_inv(BRW_PREDICATE_NORMAL, true,
|
||||
ubld.MOV(addr_udw, base_addr_udw));
|
||||
} else {
|
||||
addr = ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
|
||||
}
|
||||
} else {
|
||||
addr = base_addr;
|
||||
}
|
||||
|
||||
brw_reg srcs[4] = {
|
||||
brw_imm_ud(0), /* desc */
|
||||
|
|
@ -249,15 +289,20 @@ fs_visitor::assign_curb_setup()
|
|||
send->sfid = GFX12_SFID_UGM;
|
||||
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
||||
LSC_ADDR_SURFTYPE_FLAT,
|
||||
LSC_ADDR_SIZE_A32,
|
||||
pull_constants_a64 ?
|
||||
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32,
|
||||
LSC_DATA_SIZE_D32,
|
||||
num_regs * 8 /* num_channels */,
|
||||
true /* transpose */,
|
||||
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
||||
send->header_size = 0;
|
||||
send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
|
||||
send->mlen = lsc_msg_addr_len(
|
||||
devinfo, pull_constants_a64 ?
|
||||
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32, 1);
|
||||
send->size_written =
|
||||
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
|
||||
assert((payload().num_regs + i + send->size_written / REG_SIZE) <=
|
||||
(payload().num_regs + prog_data->curb_read_length));
|
||||
send->send_is_volatile = true;
|
||||
|
||||
send->src[0] = brw_imm_ud(desc |
|
||||
|
|
|
|||
|
|
@ -227,6 +227,8 @@ struct task_mesh_thread_payload : public cs_thread_payload {
|
|||
struct bs_thread_payload : public thread_payload {
|
||||
bs_thread_payload(const fs_visitor &v);
|
||||
|
||||
brw_reg inline_parameter;
|
||||
|
||||
brw_reg global_arg_ptr;
|
||||
brw_reg local_arg_ptr;
|
||||
|
||||
|
|
|
|||
|
|
@ -361,6 +361,8 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
|
|||
|
||||
unsigned r = reg_unit(v.devinfo);
|
||||
|
||||
prog_data->uses_inline_push_addr = v.key->uses_inline_push_addr;
|
||||
|
||||
/* See nir_setup_uniforms for subgroup_id in earlier versions. */
|
||||
if (v.devinfo->verx10 >= 125) {
|
||||
subgroup_id_ = brw_ud1_grf(0, 2);
|
||||
|
|
@ -380,10 +382,14 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
|
|||
if (prog_data->uses_btd_stack_ids)
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
if (v.stage == MESA_SHADER_COMPUTE && prog_data->uses_inline_data) {
|
||||
if (v.stage == MESA_SHADER_COMPUTE &&
|
||||
(prog_data->uses_inline_data ||
|
||||
prog_data->uses_inline_push_addr)) {
|
||||
inline_parameter = brw_ud1_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
}
|
||||
} else {
|
||||
assert(!prog_data->uses_inline_push_addr);
|
||||
}
|
||||
|
||||
num_regs = r;
|
||||
|
|
@ -464,7 +470,7 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
|
|||
r += reg_unit(v.devinfo);
|
||||
|
||||
struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
|
||||
if (prog_data->uses_inline_data) {
|
||||
if (prog_data->uses_inline_data || prog_data->uses_inline_push_addr) {
|
||||
inline_parameter = brw_ud1_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
}
|
||||
|
|
@ -474,6 +480,8 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
|
|||
|
||||
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
|
||||
{
|
||||
struct brw_bs_prog_data *prog_data = brw_bs_prog_data(v.prog_data);
|
||||
|
||||
unsigned r = 0;
|
||||
|
||||
/* R0: Thread header. */
|
||||
|
|
@ -483,6 +491,8 @@ bs_thread_payload::bs_thread_payload(const fs_visitor &v)
|
|||
r += reg_unit(v.devinfo);
|
||||
|
||||
/* R2: Inline Parameter. Used for argument addresses. */
|
||||
prog_data->uses_inline_push_addr = v.key->uses_inline_push_addr;
|
||||
inline_parameter = brw_ud1_grf(r, 0);
|
||||
global_arg_ptr = brw_ud1_grf(r, 0);
|
||||
local_arg_ptr = brw_ud1_grf(r, 2);
|
||||
r += reg_unit(v.devinfo);
|
||||
|
|
|
|||
|
|
@ -64,6 +64,7 @@ build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
|
|||
|
||||
static void
|
||||
lower_rt_intrinsics_impl(nir_function_impl *impl,
|
||||
const struct brw_base_prog_key *key,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
bool progress = false;
|
||||
|
|
@ -153,8 +154,14 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
|
|||
break;
|
||||
|
||||
case nir_intrinsic_load_uniform: {
|
||||
/* We don't want to lower this in the launch trampoline. */
|
||||
if (stage == MESA_SHADER_COMPUTE)
|
||||
/* We don't want to lower this in the launch trampoline.
|
||||
*
|
||||
* Also if the driver chooses to use an inline push address, we
|
||||
* can do all the loading of the push constant in
|
||||
* assign_curb_setup() (more efficient as we can do NoMask
|
||||
* instructions for address calculations).
|
||||
*/
|
||||
if (stage == MESA_SHADER_COMPUTE || key->uses_inline_push_addr)
|
||||
break;
|
||||
|
||||
sysval = brw_nir_load_global_const(b, intrin,
|
||||
|
|
@ -409,9 +416,10 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
|
|||
*/
|
||||
void
|
||||
brw_nir_lower_rt_intrinsics(nir_shader *nir,
|
||||
const struct brw_base_prog_key *key,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
lower_rt_intrinsics_impl(impl, devinfo);
|
||||
lower_rt_intrinsics_impl(impl, key, devinfo);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -495,7 +495,8 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
|
|||
struct brw_nir_compiler_opts opts = {};
|
||||
brw_preprocess_nir(compiler, nir, &opts);
|
||||
|
||||
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
|
||||
struct brw_cs_prog_key key = {};
|
||||
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, &key.base, devinfo);
|
||||
|
||||
b = nir_builder_create(nir_shader_get_entrypoint(b.shader));
|
||||
/* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ void brw_nir_lower_shader_returns(nir_shader *shader);
|
|||
bool brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key);
|
||||
|
||||
void brw_nir_lower_rt_intrinsics(nir_shader *shader,
|
||||
const struct brw_base_prog_key *key,
|
||||
const struct intel_device_info *devinfo);
|
||||
void brw_nir_lower_intersection_shader(nir_shader *intersection,
|
||||
const nir_shader *any_hit,
|
||||
|
|
|
|||
|
|
@ -3309,12 +3309,12 @@ compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline,
|
|||
NIR_PASS(_, nir, nir_lower_shader_calls, &opts,
|
||||
&resume_shaders, &num_resume_shaders, mem_ctx);
|
||||
NIR_PASS(_, nir, brw_nir_lower_shader_calls, &stage->key.bs);
|
||||
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
|
||||
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, &stage->key.base, devinfo);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < num_resume_shaders; i++) {
|
||||
NIR_PASS(_,resume_shaders[i], brw_nir_lower_shader_calls, &stage->key.bs);
|
||||
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, devinfo);
|
||||
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, &stage->key.base, devinfo);
|
||||
}
|
||||
|
||||
struct brw_compile_bs_params params = {
|
||||
|
|
@ -3853,7 +3853,8 @@ anv_device_init_rt_shaders(struct anv_device *device)
|
|||
nir_shader *trivial_return_nir =
|
||||
brw_nir_create_trivial_return_shader(device->physical->compiler, tmp_ctx);
|
||||
|
||||
NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, device->info);
|
||||
NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics,
|
||||
&return_key.key.base, device->info);
|
||||
|
||||
struct brw_bs_prog_data return_prog_data = { 0, };
|
||||
struct brw_compile_bs_params params = {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue