tu: Move tu6_emit_xs() to use the CRB builder.
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

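This duplicates the register field setup for each shader stage, but lets us
use the typed register packers instead of the per-stage xs_config register
offset table and raw tu_cs_emit_pkt4()/tu_cs_emit() sequences.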

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38762>
This commit is contained in:
Emma Anholt 2025-12-08 11:28:07 -08:00 committed by Marge Bot
parent 780de476e7
commit 71b59563fe
3 changed files with 144 additions and 147 deletions

View file

@ -871,17 +871,16 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
.cs_bindless = CHIP == A6XX ? 0x1f : 0xff, .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
.gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,)); .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
tu_crb crb = cs->crb(2 * 5); tu_crb crb = cs->crb(2 * 5 + 2 * 11);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_VERTEX, vs); tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_VERTEX, vs);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_TESS_CTRL, NULL); tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_TESS_CTRL, NULL);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_TESS_EVAL, NULL); tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_TESS_EVAL, NULL);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_GEOMETRY, NULL); tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_GEOMETRY, NULL);
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_FRAGMENT, fs); tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_FRAGMENT, fs);
crb.flush();
struct tu_pvtmem_config pvtmem = {}; struct tu_pvtmem_config pvtmem = {};
tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova); tu6_emit_xs(crb, cs->device, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova); tu6_emit_xs(crb, cs->device, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
crb.flush();
tu6_emit_xs_constants(cs, MESA_SHADER_VERTEX, vs, vs_iova); tu6_emit_xs_constants(cs, MESA_SHADER_VERTEX, vs, vs_iova);
tu6_emit_xs_constants(cs, MESA_SHADER_FRAGMENT, fs, fs_iova); tu6_emit_xs_constants(cs, MESA_SHADER_FRAGMENT, fs, fs_iova);

View file

@ -1502,143 +1502,19 @@ tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
return size; return size;
} }
static const struct xs_config {
uint16_t reg_sp_xs_config;
uint16_t reg_sp_xs_instrlen;
uint16_t reg_sp_xs_first_exec_offset;
uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
uint16_t reg_sp_xs_vgpr_config;
} xs_config[] = {
[MESA_SHADER_VERTEX] = {
REG_A6XX_SP_VS_CONFIG,
REG_A6XX_SP_VS_INSTR_SIZE,
REG_A6XX_SP_VS_PROGRAM_COUNTER_OFFSET,
REG_A6XX_SP_VS_PVT_MEM_STACK_OFFSET,
REG_A7XX_SP_VS_VGS_CNTL,
},
[MESA_SHADER_TESS_CTRL] = {
REG_A6XX_SP_HS_CONFIG,
REG_A6XX_SP_HS_INSTR_SIZE,
REG_A6XX_SP_HS_PROGRAM_COUNTER_OFFSET,
REG_A6XX_SP_HS_PVT_MEM_STACK_OFFSET,
REG_A7XX_SP_HS_VGS_CNTL,
},
[MESA_SHADER_TESS_EVAL] = {
REG_A6XX_SP_DS_CONFIG,
REG_A6XX_SP_DS_INSTR_SIZE,
REG_A6XX_SP_DS_PROGRAM_COUNTER_OFFSET,
REG_A6XX_SP_DS_PVT_MEM_STACK_OFFSET,
REG_A7XX_SP_DS_VGS_CNTL,
},
[MESA_SHADER_GEOMETRY] = {
REG_A6XX_SP_GS_CONFIG,
REG_A6XX_SP_GS_INSTR_SIZE,
REG_A6XX_SP_GS_PROGRAM_COUNTER_OFFSET,
REG_A6XX_SP_GS_PVT_MEM_STACK_OFFSET,
REG_A7XX_SP_GS_VGS_CNTL,
},
[MESA_SHADER_FRAGMENT] = {
REG_A6XX_SP_PS_CONFIG,
REG_A6XX_SP_PS_INSTR_SIZE,
REG_A6XX_SP_PS_PROGRAM_COUNTER_OFFSET,
REG_A6XX_SP_PS_PVT_MEM_STACK_OFFSET,
REG_A7XX_SP_PS_VGS_CNTL,
},
[MESA_SHADER_COMPUTE] = {
REG_A6XX_SP_CS_CONFIG,
REG_A6XX_SP_CS_INSTR_SIZE,
REG_A6XX_SP_CS_PROGRAM_COUNTER_OFFSET,
REG_A6XX_SP_CS_PVT_MEM_STACK_OFFSET,
REG_A7XX_SP_CS_VGS_CNTL,
},
};
void void
tu6_emit_xs(struct tu_cs *cs, tu6_emit_xs(struct tu_crb &crb,
struct tu_device *device,
mesa_shader_stage stage, /* xs->type, but xs may be NULL */ mesa_shader_stage stage, /* xs->type, but xs may be NULL */
const struct ir3_shader_variant *xs, const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem, const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova) uint64_t binary_iova)
{ {
const struct xs_config *cfg = &xs_config[stage];
if (!xs) { if (!xs) {
/* shader stage disabled */ /* shader stage disabled */
return; return;
} }
enum a6xx_threadsize thrsz =
xs->info.double_threadsize ? THREAD128 : THREAD64;
switch (stage) {
case MESA_SHADER_VERTEX:
tu_cs_emit_regs(cs, A6XX_SP_VS_CNTL_0(
.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.mergedregs = xs->mergedregs,
.earlypreamble = xs->early_preamble,
));
break;
case MESA_SHADER_TESS_CTRL:
tu_cs_emit_regs(cs, A6XX_SP_HS_CNTL_0(
.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.earlypreamble = xs->early_preamble,
));
break;
case MESA_SHADER_TESS_EVAL:
tu_cs_emit_regs(cs, A6XX_SP_DS_CNTL_0(
.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.earlypreamble = xs->early_preamble,
));
break;
case MESA_SHADER_GEOMETRY:
tu_cs_emit_regs(cs, A6XX_SP_GS_CNTL_0(
.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.earlypreamble = xs->early_preamble,
));
break;
case MESA_SHADER_FRAGMENT:
tu_cs_emit_regs(cs, A6XX_SP_PS_CNTL_0(
.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.threadsize = thrsz,
.varying = xs->total_in != 0,
.lodpixmask = xs->need_full_quad,
/* inoutregoverlap had no effect on perf in anholt's testing:
* https://gitlab.freedesktop.org/anholt/mesa/-/commits/tu-inout-reg
*/
.inoutregoverlap = true,
.pixlodenable = xs->need_pixlod,
.earlypreamble = xs->early_preamble,
.mergedregs = xs->mergedregs,
));
break;
case MESA_SHADER_COMPUTE:
thrsz = cs->device->physical_device->info->props
.supports_double_threadsize ? thrsz : THREAD128;
tu_cs_emit_regs(cs, A6XX_SP_CS_CNTL_0(
.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.threadsize = thrsz,
.earlypreamble = xs->early_preamble,
.mergedregs = xs->mergedregs,
));
break;
default:
UNREACHABLE("bad shader stage");
}
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
tu_cs_emit(cs, xs->instrlen);
/* emit program binary & private memory layout /* emit program binary & private memory layout
* binary_iova should be aligned to 1 instrlen unit (128 bytes) * binary_iova should be aligned to 1 instrlen unit (128 bytes)
*/ */
@ -1646,21 +1522,140 @@ tu6_emit_xs(struct tu_cs *cs,
assert((binary_iova & 0x7f) == 0); assert((binary_iova & 0x7f) == 0);
assert((pvtmem->iova & 0x1f) == 0); assert((pvtmem->iova & 0x1f) == 0);
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7); enum a6xx_threadsize thrsz =
tu_cs_emit(cs, 0); xs->info.double_threadsize ? THREAD128 : THREAD64;
tu_cs_emit_qw(cs, binary_iova); switch (stage) {
tu_cs_emit(cs, case MESA_SHADER_VERTEX:
A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size)); crb.add(A6XX_SP_VS_CNTL_0(.halfregfootprint = xs->info.max_half_reg + 1,
tu_cs_emit_qw(cs, pvtmem->iova); .fullregfootprint = xs->info.max_reg + 1,
tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) | .branchstack = ir3_shader_branchstack_hw(xs),
COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); .mergedregs = xs->mergedregs,
.earlypreamble = xs->early_preamble, ));
crb.add(A6XX_SP_VS_INSTR_SIZE(xs->instrlen));
crb.add(A6XX_SP_VS_PROGRAM_COUNTER_OFFSET(0));
crb.add(A6XX_SP_VS_BASE(.qword = binary_iova));
crb.add(
A6XX_SP_VS_PVT_MEM_PARAM(.memsizeperitem = pvtmem->per_fiber_size));
crb.add(A6XX_SP_VS_PVT_MEM_BASE(.qword = pvtmem->iova));
crb.add(
A6XX_SP_VS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_VS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
crb.add(SP_VS_VGS_CNTL(A7XX, 0));
break;
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1); case MESA_SHADER_TESS_CTRL:
tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_STACK_OFFSET_OFFSET(pvtmem->per_sp_size)); crb.add(A6XX_SP_HS_CNTL_0(.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.earlypreamble = xs->early_preamble, ));
crb.add(A6XX_SP_HS_INSTR_SIZE(xs->instrlen));
crb.add(A6XX_SP_HS_PROGRAM_COUNTER_OFFSET(0));
crb.add(A6XX_SP_HS_BASE(.qword = binary_iova));
crb.add(
A6XX_SP_HS_PVT_MEM_PARAM(.memsizeperitem = pvtmem->per_fiber_size));
crb.add(A6XX_SP_HS_PVT_MEM_BASE(.qword = pvtmem->iova));
crb.add(
A6XX_SP_HS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_HS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
crb.add(SP_HS_VGS_CNTL(A7XX, 0));
if (cs->device->physical_device->info->chip >= A7XX) { break;
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vgpr_config, 1);
tu_cs_emit(cs, 0); case MESA_SHADER_TESS_EVAL:
crb.add(A6XX_SP_DS_CNTL_0(.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.earlypreamble = xs->early_preamble, ));
crb.add(A6XX_SP_DS_INSTR_SIZE(xs->instrlen));
crb.add(A6XX_SP_DS_PROGRAM_COUNTER_OFFSET(0));
crb.add(A6XX_SP_DS_BASE(.qword = binary_iova));
crb.add(
A6XX_SP_DS_PVT_MEM_PARAM(.memsizeperitem = pvtmem->per_fiber_size));
crb.add(A6XX_SP_DS_PVT_MEM_BASE(.qword = pvtmem->iova));
crb.add(
A6XX_SP_DS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_DS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
crb.add(SP_DS_VGS_CNTL(A7XX, 0));
break;
case MESA_SHADER_GEOMETRY:
crb.add(A6XX_SP_GS_CNTL_0(.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.earlypreamble = xs->early_preamble, ));
crb.add(A6XX_SP_GS_INSTR_SIZE(xs->instrlen));
crb.add(A6XX_SP_GS_PROGRAM_COUNTER_OFFSET(0));
crb.add(A6XX_SP_GS_BASE(.qword = binary_iova));
crb.add(
A6XX_SP_GS_PVT_MEM_PARAM(.memsizeperitem = pvtmem->per_fiber_size));
crb.add(A6XX_SP_GS_PVT_MEM_BASE(.qword = pvtmem->iova));
crb.add(
A6XX_SP_GS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_GS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
crb.add(SP_GS_VGS_CNTL(A7XX, 0));
break;
case MESA_SHADER_FRAGMENT:
crb.add(A6XX_SP_PS_CNTL_0(
.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs), .threadsize = thrsz,
.varying = xs->total_in != 0, .lodpixmask = xs->need_full_quad,
/* inoutregoverlap had no effect on perf in anholt's testing:
* https://gitlab.freedesktop.org/anholt/mesa/-/commits/tu-inout-reg
*/
.inoutregoverlap = true, .pixlodenable = xs->need_pixlod,
.earlypreamble = xs->early_preamble,
.mergedregs = xs->mergedregs, ));
crb.add(A6XX_SP_PS_INSTR_SIZE(xs->instrlen));
crb.add(A6XX_SP_PS_PROGRAM_COUNTER_OFFSET(0));
crb.add(A6XX_SP_PS_BASE(.qword = binary_iova));
crb.add(
A6XX_SP_PS_PVT_MEM_PARAM(.memsizeperitem = pvtmem->per_fiber_size));
crb.add(A6XX_SP_PS_PVT_MEM_BASE(.qword = pvtmem->iova));
crb.add(
A6XX_SP_PS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_PS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
crb.add(SP_PS_VGS_CNTL(A7XX, 0));
break;
case MESA_SHADER_COMPUTE:
thrsz = device->physical_device->info->props.supports_double_threadsize
? thrsz
: THREAD128;
crb.add(A6XX_SP_CS_CNTL_0(.halfregfootprint = xs->info.max_half_reg + 1,
.fullregfootprint = xs->info.max_reg + 1,
.branchstack = ir3_shader_branchstack_hw(xs),
.threadsize = thrsz,
.earlypreamble = xs->early_preamble,
.mergedregs = xs->mergedregs, ));
crb.add(A6XX_SP_CS_INSTR_SIZE(xs->instrlen));
crb.add(A6XX_SP_CS_PROGRAM_COUNTER_OFFSET(0));
crb.add(A6XX_SP_CS_BASE(.qword = binary_iova));
crb.add(
A6XX_SP_CS_PVT_MEM_PARAM(.memsizeperitem = pvtmem->per_fiber_size));
crb.add(A6XX_SP_CS_PVT_MEM_BASE(.qword = pvtmem->iova));
crb.add(
A6XX_SP_CS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_CS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
crb.add(SP_CS_VGS_CNTL(A7XX, 0));
break;
default:
UNREACHABLE("bad shader stage");
} }
} }
@ -1787,8 +1782,8 @@ tu6_emit_cs_config(struct tu_cs *cs,
crb.add(SP_UPDATE_CNTL(CHIP, .cs_state = true, .cs_uav = true, crb.add(SP_UPDATE_CNTL(CHIP, .cs_state = true, .cs_uav = true,
.cs_shared_const = shared_consts_enable)); .cs_shared_const = shared_consts_enable));
tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_COMPUTE, v); tu6_emit_xs_config<CHIP>(crb, MESA_SHADER_COMPUTE, v);
tu6_emit_xs(crb, cs->device, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
} }
tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
tu6_emit_xs_constants(cs, MESA_SHADER_COMPUTE, v, binary_iova); tu6_emit_xs_constants(cs, MESA_SHADER_COMPUTE, v, binary_iova);
tu_crb crb = cs->crb(0); tu_crb crb = cs->crb(0);
@ -2400,7 +2395,9 @@ tu6_emit_variant(struct tu_cs *cs,
return; return;
} }
tu6_emit_xs(cs, stage, xs, pvtmem_config, binary_iova); with_crb(cs) {
tu6_emit_xs(crb, cs->device, stage, xs, pvtmem_config, binary_iova);
}
switch (stage) { switch (stage) {
case MESA_SHADER_VERTEX: case MESA_SHADER_VERTEX:

View file

@ -152,7 +152,8 @@ tu_spirv_to_nir(struct tu_device *dev,
mesa_shader_stage stage); mesa_shader_stage stage);
void void
tu6_emit_xs(struct tu_cs *cs, tu6_emit_xs(struct tu_crb &crb,
struct tu_device *device,
mesa_shader_stage stage, mesa_shader_stage stage,
const struct ir3_shader_variant *xs, const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem, const struct tu_pvtmem_config *pvtmem,