brw: enable A64 pulling of push constants

This will be useful for pulling constants in device bound shaders. A64
allows us to put the constants anywhere.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32895>
This commit is contained in:
Lionel Landwerlin 2024-04-24 16:14:16 +03:00 committed by Marge Bot
parent 0808125914
commit 5c17299084
10 changed files with 111 additions and 29 deletions

View file

@@ -148,7 +148,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
prog_data->base.total_shared = nir->info.shared_size;
prog_data->base.ray_queries = nir->info.ray_queries;
prog_data->base.total_scratch = 0;
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir);
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir) ||
key->base.uses_inline_push_addr;
assert(compiler->devinfo->verx10 >= 125 || !prog_data->uses_inline_data);
if (!nir->info.workgroup_size_variable) {

View file

@@ -386,7 +386,8 @@ brw_compile_task(const struct brw_compiler *compiler,
prog_data->uses_drawid =
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir) ||
key->base.uses_inline_push_addr;
brw_simd_selection_state simd_state{
.devinfo = compiler->devinfo,
@@ -1696,7 +1697,8 @@ brw_compile_mesh(const struct brw_compiler *compiler,
prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir) ||
key->base.uses_inline_push_addr;
brw_simd_selection_state simd_state{
.devinfo = compiler->devinfo,

View file

@@ -208,7 +208,9 @@ struct brw_base_prog_key {
enum brw_robustness_flags robust_flags:2;
unsigned padding:22;
bool uses_inline_push_addr:1;
unsigned padding:21;
/**
* Apply workarounds for SIN and COS input range problems.
@@ -891,6 +893,10 @@ struct brw_cs_prog_data {
bool uses_barrier;
bool uses_num_work_groups;
bool uses_inline_data;
/** Whether inline push data is used to provide a 64bit pointer to push
* constants
*/
bool uses_inline_push_addr;
bool uses_btd_stack_ids;
bool uses_systolic;
uint8_t generate_local_id;
@@ -920,6 +926,11 @@ brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data,
struct brw_bs_prog_data {
struct brw_stage_prog_data base;
/** Whether inline push data is used to provide a 64bit pointer to push
* constants
*/
bool uses_inline_push_addr;
/** SIMD size of the root shader */
uint8_t simd_size;

View file

@@ -34,6 +34,7 @@
#include "brw_fs_live_variables.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_rt.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
@@ -205,20 +206,39 @@ fs_visitor::assign_curb_setup()
prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
uint64_t used = 0;
const bool shader_pulls_constants = devinfo->verx10 >= 125 &&
(gl_shader_stage_is_compute(stage) || gl_shader_stage_is_mesh(stage));
const bool pull_constants =
devinfo->verx10 >= 125 &&
(gl_shader_stage_is_compute(stage) ||
gl_shader_stage_is_mesh(stage)) &&
uniform_push_length;
if (shader_pulls_constants && uniform_push_length > 0) {
if (pull_constants) {
const bool pull_constants_a64 =
(gl_shader_stage_is_rt(stage) &&
brw_bs_prog_data(prog_data)->uses_inline_push_addr) ||
((gl_shader_stage_is_compute(stage) ||
gl_shader_stage_is_mesh(stage)) &&
brw_cs_prog_data(prog_data)->uses_inline_push_addr);
assert(devinfo->has_lsc);
brw_builder ubld = brw_builder(this, 1).exec_all().at(
cfg->first_block(), cfg->first_block()->start());
/* The base offset for our push data is passed in as R0.0[31:6]. We have
* to mask off the bottom 6 bits.
*/
brw_reg base_addr =
ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 6)));
brw_reg base_addr;
if (pull_constants_a64) {
/* The address of the push constants is at offset 0 in the inline
* parameter.
*/
base_addr =
gl_shader_stage_is_rt(stage) ?
retype(bs_payload().inline_parameter, BRW_TYPE_UQ) :
retype(cs_payload().inline_parameter, BRW_TYPE_UQ);
} else {
/* The base offset for our push data is passed in as R0.0[31:6]. We
* have to mask off the bottom 6 bits.
*/
base_addr = ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 6)));
}
/* On Gfx12-HP we load constants at the start of the program using A32
* stateless messages.
@@ -229,11 +249,31 @@ fs_visitor::assign_curb_setup()
assert(num_regs > 0);
num_regs = 1 << util_logbase2(num_regs);
/* This pass occurs after all of the optimization passes, so don't
* emit an 'ADD addr, base_addr, 0' instruction.
*/
brw_reg addr = i == 0 ? base_addr :
ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
brw_reg addr;
if (i != 0) {
if (pull_constants_a64) {
/* We need to do the carry manually as when this pass is run,
* we're not expecting any 64bit ALUs. Unfortunately all the
* 64bit lowering is done in NIR.
*/
addr = ubld.vgrf(BRW_TYPE_UQ);
brw_reg addr_ldw = subscript(addr, BRW_TYPE_UD, 0);
brw_reg addr_udw = subscript(addr, BRW_TYPE_UD, 1);
brw_reg base_addr_ldw = subscript(base_addr, BRW_TYPE_UD, 0);
brw_reg base_addr_udw = subscript(base_addr, BRW_TYPE_UD, 1);
ubld.ADD(addr_ldw, base_addr_ldw, brw_imm_ud(i * REG_SIZE));
ubld.CMP(ubld.null_reg_d(), addr_ldw, base_addr_ldw, BRW_CONDITIONAL_L);
set_predicate(BRW_PREDICATE_NORMAL,
ubld.ADD(addr_udw, base_addr_udw, brw_imm_ud(1)));
set_predicate_inv(BRW_PREDICATE_NORMAL, true,
ubld.MOV(addr_udw, base_addr_udw));
} else {
addr = ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
}
} else {
addr = base_addr;
}
brw_reg srcs[4] = {
brw_imm_ud(0), /* desc */
@@ -249,15 +289,20 @@ fs_visitor::assign_curb_setup()
send->sfid = GFX12_SFID_UGM;
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_FLAT,
LSC_ADDR_SIZE_A32,
pull_constants_a64 ?
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
num_regs * 8 /* num_channels */,
true /* transpose */,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
send->header_size = 0;
send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
send->mlen = lsc_msg_addr_len(
devinfo, pull_constants_a64 ?
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32, 1);
send->size_written =
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
assert((payload().num_regs + i + send->size_written / REG_SIZE) <=
(payload().num_regs + prog_data->curb_read_length));
send->send_is_volatile = true;
send->src[0] = brw_imm_ud(desc |

View file

@@ -227,6 +227,8 @@ struct task_mesh_thread_payload : public cs_thread_payload {
struct bs_thread_payload : public thread_payload {
bs_thread_payload(const fs_visitor &v);
brw_reg inline_parameter;
brw_reg global_arg_ptr;
brw_reg local_arg_ptr;

View file

@@ -361,6 +361,8 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
unsigned r = reg_unit(v.devinfo);
prog_data->uses_inline_push_addr = v.key->uses_inline_push_addr;
/* See nir_setup_uniforms for subgroup_id in earlier versions. */
if (v.devinfo->verx10 >= 125) {
subgroup_id_ = brw_ud1_grf(0, 2);
@@ -380,10 +382,14 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
if (prog_data->uses_btd_stack_ids)
r += reg_unit(v.devinfo);
if (v.stage == MESA_SHADER_COMPUTE && prog_data->uses_inline_data) {
if (v.stage == MESA_SHADER_COMPUTE &&
(prog_data->uses_inline_data ||
prog_data->uses_inline_push_addr)) {
inline_parameter = brw_ud1_grf(r, 0);
r += reg_unit(v.devinfo);
}
} else {
assert(!prog_data->uses_inline_push_addr);
}
num_regs = r;
@@ -464,7 +470,7 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
r += reg_unit(v.devinfo);
struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
if (prog_data->uses_inline_data) {
if (prog_data->uses_inline_data || prog_data->uses_inline_push_addr) {
inline_parameter = brw_ud1_grf(r, 0);
r += reg_unit(v.devinfo);
}
@@ -474,6 +480,8 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
{
struct brw_bs_prog_data *prog_data = brw_bs_prog_data(v.prog_data);
unsigned r = 0;
/* R0: Thread header. */
@@ -483,6 +491,8 @@ bs_thread_payload::bs_thread_payload(const fs_visitor &v)
r += reg_unit(v.devinfo);
/* R2: Inline Parameter. Used for argument addresses. */
prog_data->uses_inline_push_addr = v.key->uses_inline_push_addr;
inline_parameter = brw_ud1_grf(r, 0);
global_arg_ptr = brw_ud1_grf(r, 0);
local_arg_ptr = brw_ud1_grf(r, 2);
r += reg_unit(v.devinfo);

View file

@@ -64,6 +64,7 @@ build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
const struct brw_base_prog_key *key,
const struct intel_device_info *devinfo)
{
bool progress = false;
@@ -153,8 +154,14 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
break;
case nir_intrinsic_load_uniform: {
/* We don't want to lower this in the launch trampoline. */
if (stage == MESA_SHADER_COMPUTE)
/* We don't want to lower this in the launch trampoline.
*
* Also if the driver chooses to use an inline push address, we
* can do all the loading of the push constant in
* assign_curb_setup() (more efficient as we can do NoMask
* instructions for address calculations).
*/
if (stage == MESA_SHADER_COMPUTE || key->uses_inline_push_addr)
break;
sysval = brw_nir_load_global_const(b, intrin,
@@ -409,9 +416,10 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
*/
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
const struct brw_base_prog_key *key,
const struct intel_device_info *devinfo)
{
nir_foreach_function_impl(impl, nir) {
lower_rt_intrinsics_impl(impl, devinfo);
lower_rt_intrinsics_impl(impl, key, devinfo);
}
}

View file

@@ -495,7 +495,8 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
struct brw_nir_compiler_opts opts = {};
brw_preprocess_nir(compiler, nir, &opts);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
struct brw_cs_prog_key key = {};
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, &key.base, devinfo);
b = nir_builder_create(nir_shader_get_entrypoint(b.shader));
/* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr

View file

@@ -56,6 +56,7 @@ void brw_nir_lower_shader_returns(nir_shader *shader);
bool brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key);
void brw_nir_lower_rt_intrinsics(nir_shader *shader,
const struct brw_base_prog_key *key,
const struct intel_device_info *devinfo);
void brw_nir_lower_intersection_shader(nir_shader *intersection,
const nir_shader *any_hit,

View file

@@ -3309,12 +3309,12 @@ compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline,
NIR_PASS(_, nir, nir_lower_shader_calls, &opts,
&resume_shaders, &num_resume_shaders, mem_ctx);
NIR_PASS(_, nir, brw_nir_lower_shader_calls, &stage->key.bs);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, &stage->key.base, devinfo);
}
for (unsigned i = 0; i < num_resume_shaders; i++) {
NIR_PASS(_,resume_shaders[i], brw_nir_lower_shader_calls, &stage->key.bs);
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, devinfo);
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, &stage->key.base, devinfo);
}
struct brw_compile_bs_params params = {
@@ -3853,7 +3853,8 @@ anv_device_init_rt_shaders(struct anv_device *device)
nir_shader *trivial_return_nir =
brw_nir_create_trivial_return_shader(device->physical->compiler, tmp_ctx);
NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, device->info);
NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics,
&return_key.key.base, device->info);
struct brw_bs_prog_data return_prog_data = { 0, };
struct brw_compile_bs_params params = {