tu: Move a bunch of program config to CRB.

This shows off how we don't need to pass an explicit size per CRB instance
in our non-growable CSes.

However, I don't like the additional indentation I added to make a CRB go
out of scope when I needed it to.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38762>
This commit is contained in:
Emma Anholt 2025-12-01 17:08:35 -08:00 committed by Marge Bot
parent c130c94bcb
commit 5f091af897
4 changed files with 85 additions and 93 deletions

View file

@ -871,11 +871,13 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
.cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
.gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);
tu_crb crb = cs->crb(2 * 5);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_VERTEX, vs);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_TESS_CTRL, NULL);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_TESS_EVAL, NULL);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_GEOMETRY, NULL);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_FRAGMENT, fs);
crb.flush();
struct tu_pvtmem_config pvtmem = {};
tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);

View file

@ -374,7 +374,7 @@ static const xs_config<CHIP> xs_configs[] = {
template <chip CHIP>
void
tu6_emit_xs_config(struct tu_cs *cs,
tu6_emit_xs_config(struct tu_crb &crb,
mesa_shader_stage stage, /* xs->type, but xs may be NULL */
const struct ir3_shader_variant *xs)
{
@ -382,28 +382,27 @@ tu6_emit_xs_config(struct tu_cs *cs,
if (!xs) {
/* shader stage disabled */
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
tu_cs_emit(cs, 0);
tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
tu_cs_emit(cs, 0);
crb.add(tu_reg_value { .reg = cfg->reg_sp_xs_config, .value = 0 });
crb.add(tu_reg_value { .reg = cfg->reg_hlsq_xs_ctrl, .value = 0 });
return;
}
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_UAV) |
COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
tu_cs_emit(cs, A6XX_SP_VS_CONST_CONFIG_CONSTLEN(xs->constlen) |
A6XX_SP_VS_CONST_CONFIG_ENABLED |
COND(xs->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE,
A7XX_SP_VS_CONST_CONFIG_READ_IMM_SHARED_CONSTS));
crb.add(tu_reg_value {
.reg = cfg->reg_sp_xs_config,
.value = A6XX_SP_VS_CONFIG_ENABLED |
COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_UAV) |
COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp) });
crb.add(tu_reg_value {
.reg = cfg->reg_hlsq_xs_ctrl,
.value = A6XX_SP_VS_CONST_CONFIG_CONSTLEN(xs->constlen) |
A6XX_SP_VS_CONST_CONFIG_ENABLED |
COND(xs->shader_options.push_consts_type ==
IR3_PUSH_CONSTS_SHARED_PREAMBLE,
A7XX_SP_VS_CONST_CONFIG_READ_IMM_SHARED_CONSTS) });
}
TU_GENX(tu6_emit_xs_config);
@ -466,18 +465,18 @@ tu6_emit_dynamic_offset(struct tu_cs *cs,
template <chip CHIP>
void
tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
tu6_emit_shared_consts_enable(struct tu_crb &crb, bool enable)
{
if (CHIP == A6XX) {
/* Enable/disable shared constants */
tu_cs_emit_regs(cs, HLSQ_SHARED_CONSTS(CHIP, .enable = enable));
crb.add(HLSQ_SHARED_CONSTS(CHIP, .enable = enable));
} else {
assert(!enable);
}
tu_cs_emit_regs(cs, A6XX_SP_MODE_CNTL(.constant_demotion_enable = true,
.isammode = ISAMMODE_GL,
.shared_consts_enable = enable));
crb.add(A6XX_SP_MODE_CNTL(.constant_demotion_enable = true,
.isammode = ISAMMODE_GL,
.shared_consts_enable = enable));
}
TU_GENX(tu6_emit_shared_consts_enable);
@ -1257,24 +1256,24 @@ tu6_emit_program_config(struct tu_cs *cs,
{
STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
tu_crb crb = cs->crb(0);
bool shared_consts_enable =
prog->shared_consts.type == IR3_PUSH_CONSTS_SHARED;
tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
tu6_emit_shared_consts_enable<CHIP>(crb, shared_consts_enable);
tu_cs_emit_regs(cs, SP_UPDATE_CNTL(CHIP,
.vs_state = true,
.hs_state = true,
.ds_state = true,
.gs_state = true,
.fs_state = true,
.gfx_uav = true,
.gfx_shared_const = shared_consts_enable));
crb.add(SP_UPDATE_CNTL(CHIP, .vs_state = true, .hs_state = true,
.ds_state = true, .gs_state = true,
.fs_state = true, .gfx_uav = true,
.gfx_shared_const = shared_consts_enable));
for (size_t stage_idx = MESA_SHADER_VERTEX;
stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
mesa_shader_stage stage = (mesa_shader_stage) stage_idx;
tu6_emit_xs_config<CHIP>(cs, stage, variants[stage]);
tu6_emit_xs_config<CHIP>(crb, stage, variants[stage]);
}
crb.flush();
for (size_t stage_idx = MESA_SHADER_VERTEX;
stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
mesa_shader_stage stage = (mesa_shader_stage) stage_idx;

View file

@ -303,13 +303,13 @@ struct tu_pvtmem_config {
template <chip CHIP>
void
tu6_emit_xs_config(struct tu_cs *cs,
tu6_emit_xs_config(struct tu_crb &crb,
mesa_shader_stage stage,
const struct ir3_shader_variant *xs);
template <chip CHIP>
void
tu6_emit_shared_consts_enable(struct tu_cs *cs, bool shared_consts_enable);
tu6_emit_shared_consts_enable(struct tu_crb &crb, bool shared_consts_enable);
template <chip CHIP>
void

View file

@ -1769,29 +1769,29 @@ tu6_emit_cs_config(struct tu_cs *cs,
{
bool shared_consts_enable =
ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
tu_cs_emit_regs(cs, SP_UPDATE_CNTL(CHIP,
.cs_state = true,
.cs_uav = true,
.cs_shared_const = shared_consts_enable));
with_crb (cs) {
tu6_emit_shared_consts_enable<CHIP>(crb, shared_consts_enable);
tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_COMPUTE, v);
crb.add(SP_UPDATE_CNTL(CHIP, .cs_state = true, .cs_uav = true,
.cs_shared_const = shared_consts_enable));
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_COMPUTE, v);
}
tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
tu_crb crb = cs->crb(0);
uint32_t shared_size = MAX2(((int) v->shared_size - 1) / 1024, 1);
enum a6xx_const_ram_mode mode =
v->constlen > 256 ? CONSTLEN_512 :
(v->constlen > 192 ? CONSTLEN_256 :
(v->constlen > 128 ? CONSTLEN_192 : CONSTLEN_128));
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_1, 1);
tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_SHARED_SIZE(shared_size) |
A6XX_SP_CS_CNTL_1_CONSTANTRAMMODE(mode));
crb.add(
A6XX_SP_CS_CNTL_1(.shared_size = shared_size, .constantrammode = mode));
if (CHIP == A6XX && cs->device->physical_device->info->props.has_lpac) {
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CTRL_REG1, 1);
tu_cs_emit(cs, A6XX_HLSQ_CS_CTRL_REG1_SHARED_SIZE(shared_size) |
A6XX_HLSQ_CS_CTRL_REG1_CONSTANTRAMMODE(mode));
crb.add(HLSQ_CS_CTRL_REG1(CHIP, .shared_size = shared_size,
.constantrammode = mode));
}
uint32_t local_invocation_id =
@ -1808,58 +1808,49 @@ tu6_emit_cs_config(struct tu_cs *cs,
enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->props
.supports_double_threadsize ? thrsz : THREAD128;
if (CHIP == A6XX) {
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CONST_CONFIG_0, 2);
tu_cs_emit(cs,
A6XX_SP_CS_CONST_CONFIG_0_WGIDCONSTID(work_group_id) |
A6XX_SP_CS_CONST_CONFIG_0_WGSIZECONSTID(regid(63, 0)) |
A6XX_SP_CS_CONST_CONFIG_0_WGOFFSETCONSTID(regid(63, 0)) |
A6XX_SP_CS_CONST_CONFIG_0_LOCALIDREGID(local_invocation_id));
tu_cs_emit(cs, A6XX_SP_CS_WGE_CNTL_LINEARLOCALIDREGID(regid(63, 0)) |
A6XX_SP_CS_WGE_CNTL_THREADSIZE(thrsz_cs));
crb.add(SP_CS_CONST_CONFIG_0(CHIP, .wgidconstid = work_group_id,
.wgsizeconstid = regid(63, 0),
.wgoffsetconstid = regid(63, 0),
.localidregid = local_invocation_id));
crb.add(SP_CS_WGE_CNTL(CHIP, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz_cs));
if (!cs->device->physical_device->info->props.supports_double_threadsize) {
tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_WAVE_CNTL, 1);
tu_cs_emit(cs, A6XX_SP_PS_WAVE_CNTL_THREADSIZE(thrsz));
crb.add(SP_PS_WAVE_CNTL(CHIP, .threadsize = thrsz));
}
if (cs->device->physical_device->info->props.has_lpac) {
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_WIE_CNTL_0, 2);
tu_cs_emit(cs,
A6XX_SP_CS_WIE_CNTL_0_WGIDCONSTID(work_group_id) |
A6XX_SP_CS_WIE_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
A6XX_SP_CS_WIE_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
A6XX_SP_CS_WIE_CNTL_0_LOCALIDREGID(local_invocation_id));
tu_cs_emit(cs, A6XX_SP_CS_WIE_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
A6XX_SP_CS_WIE_CNTL_1_THREADSIZE(thrsz));
crb.add(A6XX_SP_CS_WIE_CNTL_0(.wgidconstid = work_group_id,
.wgsizeconstid = regid(63, 0),
.wgoffsetconstid = regid(63, 0),
.localidregid = local_invocation_id));
crb.add(SP_CS_WIE_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz));
}
} else {
unsigned tile_height = (v->local_size[1] % 8 == 0) ? 3
: (v->local_size[1] % 4 == 0) ? 5
: (v->local_size[1] % 2 == 0) ? 9
: 17;
tu_cs_emit_regs(
cs, SP_CS_WGE_CNTL(CHIP,
.linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs,
.workgrouprastorderzfirsten = true,
.wgtilewidth = 4, .wgtileheight = tile_height));
crb.add(SP_CS_WGE_CNTL(CHIP, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz_cs,
.workgrouprastorderzfirsten = true,
.wgtilewidth = 4, .wgtileheight = tile_height));
tu_cs_emit_regs(cs, SP_PS_WAVE_CNTL(CHIP, .threadsize = THREAD64));
crb.add(SP_PS_WAVE_CNTL(CHIP, .threadsize = THREAD64));
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_WIE_CNTL_0, 1);
tu_cs_emit(cs, A6XX_SP_CS_WIE_CNTL_0_WGIDCONSTID(work_group_id) |
A6XX_SP_CS_WIE_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
A6XX_SP_CS_WIE_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
A6XX_SP_CS_WIE_CNTL_0_LOCALIDREGID(local_invocation_id));
crb.add(A6XX_SP_CS_WIE_CNTL_0(.wgidconstid = work_group_id,
.wgsizeconstid = regid(63, 0),
.wgoffsetconstid = regid(63, 0),
.localidregid = local_invocation_id));
tu_cs_emit_regs(cs,
SP_CS_WIE_CNTL_1(CHIP,
.linearlocalidregid = regid(63, 0),
.threadsize = thrsz_cs,
.workitemrastorder =
v->cs.force_linear_dispatch ?
WORKITEMRASTORDER_LINEAR :
WORKITEMRASTORDER_TILED, ));
crb.add(SP_CS_WIE_CNTL_1(
CHIP, .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs,
.workitemrastorder = v->cs.force_linear_dispatch
? WORKITEMRASTORDER_LINEAR
: WORKITEMRASTORDER_TILED));
tu_cs_emit_regs(cs, SP_CS_HYSTERESIS(CHIP, 0)); // Sometimes is 0x08000000
crb.add(SP_CS_HYSTERESIS(CHIP, 0)); // Sometimes is 0x08000000
}
}