mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-11 08:00:13 +01:00
freedreno/computerator: Convert to new cs builders
Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36589>
This commit is contained in:
parent
5a3607fb3d
commit
2bff78dcab
2 changed files with 243 additions and 233 deletions
|
|
@ -76,7 +76,7 @@ struct fd6_control {
|
|||
};
|
||||
|
||||
#define control_ptr(a6xx_backend, member) \
|
||||
(a6xx_backend)->control_mem, offsetof(struct fd6_control, member), 0, 0
|
||||
(a6xx_backend)->control_mem, offsetof(struct fd6_control, member)
|
||||
|
||||
struct PACKED fd6_query_sample {
|
||||
uint64_t start;
|
||||
|
|
@ -88,8 +88,7 @@ struct PACKED fd6_query_sample {
|
|||
#define query_sample_idx(a6xx_backend, idx, field) \
|
||||
(a6xx_backend)->query_mem, \
|
||||
(idx * sizeof(struct fd6_query_sample)) + \
|
||||
offsetof(struct fd6_query_sample, field), \
|
||||
0, 0
|
||||
offsetof(struct fd6_query_sample, field)
|
||||
|
||||
/*
|
||||
* Backend implementation:
|
||||
|
|
@ -112,27 +111,25 @@ a6xx_disassemble(struct kernel *kernel, FILE *out)
|
|||
|
||||
template<chip CHIP>
|
||||
static void
|
||||
cs_restore_emit(struct fd_ringbuffer *ring, struct a6xx_backend *a6xx_backend)
|
||||
cs_restore_emit(fd_cs &cs, struct a6xx_backend *a6xx_backend)
|
||||
{
|
||||
OUT_PKT4(ring, REG_A6XX_SP_PERFCTR_SHADER_MASK, 1);
|
||||
OUT_RING(ring, A6XX_SP_PERFCTR_SHADER_MASK_CS);
|
||||
fd_ncrb<CHIP> ncrb(cs, 2 + ARRAY_SIZE(a6xx_backend->info->a6xx.magic_raw));
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_NC_MODE_CNTL_2, 1);
|
||||
OUT_RING(ring, 0);
|
||||
ncrb.add(A6XX_SP_PERFCTR_SHADER_MASK(.cs = true));
|
||||
ncrb.add(A6XX_SP_NC_MODE_CNTL_2());
|
||||
|
||||
for (size_t i = 0; i < ARRAY_SIZE(a6xx_backend->info->a6xx.magic_raw); i++) {
|
||||
auto magic_reg = a6xx_backend->info->a6xx.magic_raw[i];
|
||||
if (!magic_reg.reg)
|
||||
break;
|
||||
|
||||
OUT_PKT4(ring, magic_reg.reg, 1);
|
||||
OUT_RING(ring, magic_reg.value);
|
||||
ncrb.add({magic_reg.reg, magic_reg.value});
|
||||
}
|
||||
}
|
||||
|
||||
template<chip CHIP>
|
||||
static void
|
||||
cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
||||
cs_program_emit_regs(fd_cs &cs, struct kernel *kernel)
|
||||
{
|
||||
struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
|
||||
struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
|
||||
|
|
@ -140,12 +137,15 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
|||
const unsigned *local_size = kernel->local_size;
|
||||
const struct ir3_info *i = &v->info;
|
||||
enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64;
|
||||
fd_crb crb(cs, 25);
|
||||
|
||||
OUT_REG(ring, A6XX_SP_MODE_CNTL(.constant_demotion_enable = true,
|
||||
.isammode = ISAMMODE_GL,
|
||||
.shared_consts_enable = false));
|
||||
crb.add(A6XX_SP_MODE_CNTL(
|
||||
.constant_demotion_enable = true,
|
||||
.isammode = ISAMMODE_GL,
|
||||
.shared_consts_enable = false,
|
||||
));
|
||||
|
||||
OUT_REG(ring, SP_UPDATE_CNTL(CHIP,
|
||||
crb.add(SP_UPDATE_CNTL(CHIP,
|
||||
.vs_state = true,
|
||||
.hs_state = true,
|
||||
.ds_state = true,
|
||||
|
|
@ -156,30 +156,32 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
|||
));
|
||||
|
||||
unsigned constlen = align(v->constlen, 4);
|
||||
OUT_REG(ring, SP_CS_CONST_CONFIG(CHIP, .constlen = constlen, .enabled = true, ));
|
||||
crb.add(SP_CS_CONST_CONFIG(CHIP, .constlen = constlen, .enabled = true, ));
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2);
|
||||
OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED |
|
||||
A6XX_SP_CS_CONFIG_NUAV(kernel->num_bufs) |
|
||||
A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |
|
||||
A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */
|
||||
OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */
|
||||
crb.add(A6XX_SP_CS_CONFIG(
|
||||
.enabled = true,
|
||||
.ntex = v->num_samp,
|
||||
.nsamp = v->num_samp,
|
||||
.nuav = kernel->num_bufs,
|
||||
));
|
||||
crb.add(A6XX_SP_CS_INSTR_SIZE(v->instrlen));
|
||||
|
||||
crb.add(A6XX_SP_CS_CNTL_0(
|
||||
.halfregfootprint = i->max_half_reg + 1,
|
||||
.fullregfootprint = i->max_reg + 1,
|
||||
.branchstack = ir3_shader_branchstack_hw(v),
|
||||
.threadsize = thrsz,
|
||||
.earlypreamble = v->early_preamble,
|
||||
.mergedregs = v->mergedregs,
|
||||
));
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 1);
|
||||
OUT_RING(ring,
|
||||
A6XX_SP_CS_CNTL_0_THREADSIZE(thrsz) |
|
||||
A6XX_SP_CS_CNTL_0_FULLREGFOOTPRINT(i->max_reg + 1) |
|
||||
A6XX_SP_CS_CNTL_0_HALFREGFOOTPRINT(i->max_half_reg + 1) |
|
||||
COND(v->mergedregs, A6XX_SP_CS_CNTL_0_MERGEDREGS) |
|
||||
COND(v->early_preamble, A6XX_SP_CS_CNTL_0_EARLYPREAMBLE) |
|
||||
A6XX_SP_CS_CNTL_0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
|
||||
if (CHIP == A7XX) {
|
||||
OUT_REG(ring, SP_PS_WAVE_CNTL(CHIP, .threadsize = THREAD64));
|
||||
crb.add(SP_PS_WAVE_CNTL(CHIP, .threadsize = THREAD64));
|
||||
|
||||
OUT_REG(ring, SP_REG_PROG_ID_0(CHIP, .dword = 0xfcfcfcfc),
|
||||
SP_REG_PROG_ID_1(CHIP, .dword = 0xfcfcfcfc),
|
||||
SP_REG_PROG_ID_2(CHIP, .dword = 0xfcfcfcfc),
|
||||
SP_REG_PROG_ID_3(CHIP, .dword = 0x0000fc00), );
|
||||
crb.add(SP_REG_PROG_ID_0(CHIP, .dword = 0xfcfcfcfc));
|
||||
crb.add(SP_REG_PROG_ID_1(CHIP, .dword = 0xfcfcfcfc));
|
||||
crb.add(SP_REG_PROG_ID_2(CHIP, .dword = 0xfcfcfcfc));
|
||||
crb.add(SP_REG_PROG_ID_3(CHIP, .dword = 0x0000fc00));
|
||||
}
|
||||
|
||||
uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
|
||||
|
|
@ -187,14 +189,10 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
|||
v->constlen > 256 ? CONSTLEN_512 :
|
||||
(v->constlen > 192 ? CONSTLEN_256 :
|
||||
(v->constlen > 128 ? CONSTLEN_192 : CONSTLEN_128));
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_1, 1);
|
||||
OUT_RING(ring, A6XX_SP_CS_CNTL_1_SHARED_SIZE(shared_size) |
|
||||
A6XX_SP_CS_CNTL_1_CONSTANTRAMMODE(mode));
|
||||
crb.add(A6XX_SP_CS_CNTL_1(.shared_size = shared_size, .constantrammode = mode));
|
||||
|
||||
if (CHIP == A6XX && a6xx_backend->info->a6xx.has_lpac) {
|
||||
OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CTRL_REG1, 1);
|
||||
OUT_RING(ring, A6XX_HLSQ_CS_CTRL_REG1_SHARED_SIZE(1) |
|
||||
A6XX_HLSQ_CS_CTRL_REG1_CONSTANTRAMMODE(mode));
|
||||
crb.add(A6XX_HLSQ_CS_CTRL_REG1(.shared_size = 1, .constantrammode = mode));
|
||||
}
|
||||
|
||||
uint32_t local_invocation_id, work_group_id;
|
||||
|
|
@ -203,67 +201,58 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
|||
work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
|
||||
|
||||
if (CHIP == A6XX) {
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_CONST_CONFIG_0, 2);
|
||||
OUT_RING(ring, A6XX_SP_CS_CONST_CONFIG_0_WGIDCONSTID(work_group_id) |
|
||||
A6XX_SP_CS_CONST_CONFIG_0_WGSIZECONSTID(regid(63, 0)) |
|
||||
A6XX_SP_CS_CONST_CONFIG_0_WGOFFSETCONSTID(regid(63, 0)) |
|
||||
A6XX_SP_CS_CONST_CONFIG_0_LOCALIDREGID(local_invocation_id));
|
||||
OUT_RING(ring, A6XX_SP_CS_WGE_CNTL_LINEARLOCALIDREGID(regid(63, 0)) |
|
||||
A6XX_SP_CS_WGE_CNTL_THREADSIZE(thrsz));
|
||||
crb.add(A6XX_SP_CS_CONST_CONFIG_0(
|
||||
.wgidconstid = work_group_id,
|
||||
.wgsizeconstid = INVALID_REG,
|
||||
.wgoffsetconstid = INVALID_REG,
|
||||
.localidregid = local_invocation_id,
|
||||
));
|
||||
crb.add(SP_CS_WGE_CNTL(CHIP,
|
||||
.linearlocalidregid = INVALID_REG,
|
||||
.threadsize = thrsz,
|
||||
));
|
||||
} else {
|
||||
unsigned tile_height = (local_size[1] % 8 == 0) ? 3
|
||||
: (local_size[1] % 4 == 0) ? 5
|
||||
: (local_size[1] % 2 == 0) ? 9
|
||||
: 17;
|
||||
|
||||
OUT_REG(ring,
|
||||
SP_CS_WGE_CNTL(CHIP,
|
||||
.linearlocalidregid = regid(63, 0),
|
||||
.threadsize = thrsz,
|
||||
.workgrouprastorderzfirsten = true,
|
||||
.wgtilewidth = 4,
|
||||
.wgtileheight = tile_height,
|
||||
)
|
||||
);
|
||||
crb.add(SP_CS_WGE_CNTL(CHIP,
|
||||
.linearlocalidregid = INVALID_REG,
|
||||
.threadsize = thrsz,
|
||||
.workgrouprastorderzfirsten = true,
|
||||
.wgtilewidth = 4,
|
||||
.wgtileheight = tile_height,
|
||||
));
|
||||
}
|
||||
|
||||
if (CHIP == A7XX || a6xx_backend->info->a6xx.has_lpac) {
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_WIE_CNTL_0, 1);
|
||||
OUT_RING(ring, A6XX_SP_CS_WIE_CNTL_0_WGIDCONSTID(work_group_id) |
|
||||
A6XX_SP_CS_WIE_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
|
||||
A6XX_SP_CS_WIE_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
|
||||
A6XX_SP_CS_WIE_CNTL_0_LOCALIDREGID(local_invocation_id));
|
||||
crb.add(A6XX_SP_CS_WIE_CNTL_0(
|
||||
.wgidconstid = work_group_id,
|
||||
.wgsizeconstid = INVALID_REG,
|
||||
.wgoffsetconstid = INVALID_REG,
|
||||
.localidregid = local_invocation_id,
|
||||
));
|
||||
|
||||
if (CHIP == A7XX) {
|
||||
/* TODO allow the shader to control the tiling */
|
||||
OUT_REG(ring,
|
||||
SP_CS_WIE_CNTL_1(A7XX, .linearlocalidregid = regid(63, 0),
|
||||
.threadsize = thrsz,
|
||||
.workitemrastorder = WORKITEMRASTORDER_LINEAR));
|
||||
crb.add(SP_CS_WIE_CNTL_1(CHIP,
|
||||
.linearlocalidregid = INVALID_REG,
|
||||
.threadsize = thrsz,
|
||||
.workitemrastorder = WORKITEMRASTORDER_LINEAR,
|
||||
));
|
||||
} else {
|
||||
OUT_REG(ring,
|
||||
SP_CS_WIE_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
|
||||
.threadsize = thrsz));
|
||||
crb.add(SP_CS_WIE_CNTL_1(CHIP,
|
||||
.linearlocalidregid = INVALID_REG,
|
||||
.threadsize = thrsz,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_BASE, 2);
|
||||
OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_BASE_LO/HI */
|
||||
crb.attach_bo(v->bo);
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_INSTR_SIZE, 1);
|
||||
OUT_RING(ring, v->instrlen);
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_BASE, 2);
|
||||
OUT_RELOC(ring, v->bo, 0, 0, 0);
|
||||
|
||||
uint32_t shader_preload_size =
|
||||
MIN2(v->instrlen, a6xx_backend->info->a6xx.instr_cache_size);
|
||||
OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
|
||||
OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
|
||||
CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
|
||||
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
|
||||
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
|
||||
CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
|
||||
OUT_RELOC(ring, v->bo, 0, 0, 0);
|
||||
crb.add(A6XX_SP_CS_BASE(v->bo));
|
||||
crb.add(A6XX_SP_CS_INSTR_SIZE(v->instrlen));
|
||||
|
||||
if (v->pvtmem_size > 0) {
|
||||
uint32_t per_fiber_size = v->pvtmem_size;
|
||||
|
|
@ -272,52 +261,68 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
|||
uint32_t total_size = per_sp_size * a6xx_backend->info->num_sp_cores;
|
||||
|
||||
struct fd_bo *pvtmem = fd_bo_new(a6xx_backend->dev, total_size, 0, "pvtmem");
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_PVT_MEM_PARAM, 4);
|
||||
OUT_RING(ring, A6XX_SP_CS_PVT_MEM_PARAM_MEMSIZEPERITEM(per_fiber_size));
|
||||
OUT_RELOC(ring, pvtmem, 0, 0, 0);
|
||||
OUT_RING(ring, A6XX_SP_CS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) |
|
||||
COND(v->pvtmem_per_wave,
|
||||
A6XX_SP_CS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
|
||||
crb.add(A6XX_SP_CS_PVT_MEM_PARAM(.memsizeperitem = per_fiber_size));
|
||||
crb.add(A6XX_SP_CS_PVT_MEM_BASE(pvtmem));
|
||||
crb.add(A6XX_SP_CS_PVT_MEM_SIZE(
|
||||
.totalpvtmemsize = per_sp_size,
|
||||
.perwavememlayout = v->pvtmem_per_wave,
|
||||
));
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_PVT_MEM_STACK_OFFSET, 1);
|
||||
OUT_RING(ring, A6XX_SP_CS_PVT_MEM_STACK_OFFSET_OFFSET(per_sp_size));
|
||||
crb.add(A6XX_SP_CS_PVT_MEM_STACK_OFFSET(.offset = per_sp_size));
|
||||
}
|
||||
}
|
||||
|
||||
template<chip CHIP>
|
||||
static void
|
||||
emit_const(struct fd_ringbuffer *ring, uint32_t regid, uint32_t sizedwords,
|
||||
const uint32_t *dwords)
|
||||
cs_program_emit(fd_cs &cs, struct kernel *kernel)
|
||||
{
|
||||
struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
|
||||
struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
|
||||
struct ir3_shader_variant *v = ir3_kernel->v;
|
||||
|
||||
cs_program_emit_regs<CHIP>(cs, kernel);
|
||||
|
||||
uint32_t shader_preload_size =
|
||||
MIN2(v->instrlen, a6xx_backend->info->a6xx.instr_cache_size);
|
||||
|
||||
fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 3)
|
||||
.add(CP_LOAD_STATE6_0(
|
||||
.state_type = ST6_SHADER,
|
||||
.state_src = SS6_INDIRECT,
|
||||
.state_block = SB6_CS_SHADER,
|
||||
.num_unit = shader_preload_size,
|
||||
))
|
||||
.add(CP_LOAD_STATE6_EXT_SRC_ADDR(v->bo));
|
||||
}
|
||||
|
||||
template<chip CHIP>
|
||||
static void
|
||||
emit_const(fd_cs &cs, uint32_t regid, uint32_t sizedwords, const uint32_t *dwords)
|
||||
{
|
||||
uint32_t zero[4] = {};
|
||||
uint32_t align_sz;
|
||||
|
||||
assert((regid % 4) == 0);
|
||||
|
||||
align_sz = align(sizedwords, 4);
|
||||
|
||||
OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3 + align_sz);
|
||||
OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid / 4) |
|
||||
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
|
||||
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
|
||||
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
|
||||
CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4)));
|
||||
OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
|
||||
OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
|
||||
|
||||
for (uint32_t i = 0; i < sizedwords; i++) {
|
||||
OUT_RING(ring, dwords[i]);
|
||||
}
|
||||
|
||||
/* Zero-pad to multiple of 4 dwords */
|
||||
for (uint32_t i = sizedwords; i < align_sz; i++) {
|
||||
OUT_RING(ring, 0);
|
||||
}
|
||||
fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + align_sz)
|
||||
.add(CP_LOAD_STATE6_0(
|
||||
.dst_off = regid / 4,
|
||||
.state_type = ST6_CONSTANTS,
|
||||
.state_src = SS6_DIRECT,
|
||||
.state_block = SB6_CS_SHADER,
|
||||
.num_unit = DIV_ROUND_UP(sizedwords, 4)
|
||||
))
|
||||
.add(CP_LOAD_STATE6_EXT_SRC_ADDR())
|
||||
.add(dwords, sizedwords)
|
||||
/* Zero-pad to multiple of 4 dwords */
|
||||
.add(zero, align_sz - sizedwords);
|
||||
}
|
||||
|
||||
template<chip CHIP>
|
||||
static void
|
||||
cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,
|
||||
uint32_t grid[3])
|
||||
cs_const_emit(fd_cs &cs, struct kernel *kernel, uint32_t grid[3])
|
||||
{
|
||||
struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
|
||||
struct ir3_shader_variant *v = ir3_kernel->v;
|
||||
|
|
@ -357,7 +362,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,
|
|||
size *= 4;
|
||||
|
||||
if (size > 0) {
|
||||
emit_const<CHIP>(ring, base, size, imm_state->values);
|
||||
emit_const<CHIP>(cs, base, size, imm_state->values);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -377,8 +382,7 @@ kernel_num_bufs(struct kernel *kernel, enum kernel_buf_type buf_type)
|
|||
|
||||
template<chip CHIP>
|
||||
static void
|
||||
cs_uav_emit(struct fd_ringbuffer *ring, struct fd_device *dev,
|
||||
struct kernel *kernel)
|
||||
cs_uav_emit(fd_cs &cs, struct fd_device *dev, struct kernel *kernel)
|
||||
{
|
||||
unsigned num_bufs = kernel_num_bufs(kernel, KERNEL_BUF_UAV);
|
||||
|
||||
|
|
@ -390,7 +394,7 @@ cs_uav_emit(struct fd_ringbuffer *ring, struct fd_device *dev,
|
|||
FD_BO_GPUREADONLY | FD_BO_HINT_COMMAND,
|
||||
"tex_desc");
|
||||
|
||||
fd_ringbuffer_attach_bo(ring, state);
|
||||
cs.attach_bo(state);
|
||||
|
||||
uint32_t *buf = (uint32_t *)fd_bo_map(state);
|
||||
|
||||
|
|
@ -399,7 +403,7 @@ cs_uav_emit(struct fd_ringbuffer *ring, struct fd_device *dev,
|
|||
continue;
|
||||
}
|
||||
|
||||
fd_ringbuffer_attach_bo(ring, kernel->bufs[i]);
|
||||
cs.attach_bo(kernel->bufs[i]);
|
||||
|
||||
/* size is encoded with low 15b in WIDTH and high bits in HEIGHT,
|
||||
* in units of elements:
|
||||
|
|
@ -424,29 +428,25 @@ cs_uav_emit(struct fd_ringbuffer *ring, struct fd_device *dev,
|
|||
buf += 16;
|
||||
}
|
||||
|
||||
OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
|
||||
OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
|
||||
CP_LOAD_STATE6_0_STATE_TYPE(ST6_UAV) |
|
||||
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
|
||||
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
|
||||
CP_LOAD_STATE6_0_NUM_UNIT(num_bufs));
|
||||
OUT_RELOC(ring, state, 0);
|
||||
fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 3)
|
||||
.add(CP_LOAD_STATE6_0(
|
||||
.state_type = ST6_UAV,
|
||||
.state_src = SS6_INDIRECT,
|
||||
.state_block = SB6_CS_SHADER,
|
||||
.num_unit = num_bufs,
|
||||
))
|
||||
.add(CP_LOAD_STATE6_EXT_SRC_ADDR(state));
|
||||
|
||||
if (CHIP == A6XX) {
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_UAV_BASE, 2);
|
||||
} else {
|
||||
OUT_PKT4(ring, REG_A7XX_SP_CS_UAV_BASE, 2);
|
||||
}
|
||||
OUT_RELOC(ring, state, 0);
|
||||
fd_crb crb(cs, 3);
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_USIZE, 1);
|
||||
OUT_RING(ring, num_bufs);
|
||||
crb.add(SP_CS_UAV_BASE(CHIP, state));
|
||||
crb.add(A6XX_SP_CS_USIZE(num_bufs));
|
||||
|
||||
fd_bo_del(state);
|
||||
}
|
||||
|
||||
static void
|
||||
cs_ubo_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
||||
cs_ubo_emit(fd_cs &cs, struct kernel *kernel)
|
||||
{
|
||||
unsigned num_bufs = kernel_num_bufs(kernel, KERNEL_BUF_UBO);
|
||||
|
||||
|
|
@ -459,17 +459,20 @@ cs_ubo_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
|||
continue;
|
||||
}
|
||||
|
||||
OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 5);
|
||||
OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(offset) |
|
||||
CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
|
||||
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
|
||||
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
|
||||
CP_LOAD_STATE6_0_NUM_UNIT(1));
|
||||
OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
|
||||
OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
|
||||
cs.attach_bo(kernel->bufs[i]);
|
||||
|
||||
unsigned size_vec4s = DIV_ROUND_UP(kernel->buf_sizes[i], 4);
|
||||
OUT_RELOC(ring, kernel->bufs[i], 0,
|
||||
(uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, 0);
|
||||
|
||||
fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 5)
|
||||
.add(CP_LOAD_STATE6_0(
|
||||
.dst_off = offset,
|
||||
.state_type = ST6_UBO,
|
||||
.state_src = SS6_DIRECT,
|
||||
.state_block = SB6_CS_SHADER,
|
||||
.num_unit = 1,
|
||||
))
|
||||
.add(CP_LOAD_STATE6_EXT_SRC_ADDR())
|
||||
.add(A6XX_UBO_DESC(0, kernel->bufs[i], 0, size_vec4s));
|
||||
|
||||
offset++;
|
||||
}
|
||||
|
|
@ -477,28 +480,28 @@ cs_ubo_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
|||
|
||||
template<chip CHIP>
|
||||
static inline unsigned
|
||||
event_write(struct fd_ringbuffer *ring, struct kernel *kernel,
|
||||
enum vgt_event_type evt, bool timestamp)
|
||||
event_write(fd_cs &cs, struct kernel *kernel, enum vgt_event_type evt, bool timestamp)
|
||||
{
|
||||
unsigned seqno = 0;
|
||||
unsigned len = timestamp ? 4 : 1;
|
||||
|
||||
fd_pkt7 pkt(cs, CP_EVENT_WRITE, len);
|
||||
|
||||
if (CHIP == A6XX) {
|
||||
OUT_PKT7(ring, CP_EVENT_WRITE, timestamp ? 4 : 1);
|
||||
OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt));
|
||||
pkt.add(CP_EVENT_WRITE_0_EVENT(evt) |
|
||||
COND(timestamp, CP_EVENT_WRITE_0_TIMESTAMP));
|
||||
} else {
|
||||
OUT_PKT7(ring, CP_EVENT_WRITE7, timestamp ? 4 : 1);
|
||||
OUT_RING(ring,
|
||||
CP_EVENT_WRITE7_0_EVENT(evt) |
|
||||
COND(timestamp, CP_EVENT_WRITE7_0_WRITE_ENABLED |
|
||||
CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B)));
|
||||
pkt.add(CP_EVENT_WRITE7_0_EVENT(evt) |
|
||||
CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
|
||||
COND(timestamp, CP_EVENT_WRITE7_0_WRITE_ENABLED));
|
||||
}
|
||||
|
||||
if (timestamp) {
|
||||
struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
|
||||
struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
|
||||
seqno = ++a6xx_backend->seqno;
|
||||
OUT_RELOC(ring, control_ptr(a6xx_backend, seqno)); /* ADDR_LO/HI */
|
||||
OUT_RING(ring, seqno);
|
||||
pkt.add(CP_EVENT_WRITE_ADDR(control_ptr(a6xx_backend, seqno)));
|
||||
pkt.add(seqno);
|
||||
}
|
||||
|
||||
return seqno;
|
||||
|
|
@ -506,31 +509,30 @@ event_write(struct fd_ringbuffer *ring, struct kernel *kernel,
|
|||
|
||||
template<chip CHIP>
|
||||
static inline void
|
||||
cache_flush(struct fd_ringbuffer *ring, struct kernel *kernel)
|
||||
cache_flush(fd_cs &cs, struct kernel *kernel)
|
||||
{
|
||||
struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
|
||||
struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
|
||||
unsigned seqno;
|
||||
|
||||
seqno = event_write<CHIP>(ring, kernel, RB_DONE_TS, true);
|
||||
seqno = event_write<CHIP>(cs, kernel, RB_DONE_TS, true);
|
||||
|
||||
OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
|
||||
OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
|
||||
CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
|
||||
OUT_RELOC(ring, control_ptr(a6xx_backend, seqno));
|
||||
OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));
|
||||
OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));
|
||||
OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
|
||||
fd_pkt7(cs, CP_WAIT_REG_MEM, 6)
|
||||
.add(CP_WAIT_REG_MEM_0(.function = WRITE_EQ, .poll = POLL_MEMORY))
|
||||
.add(CP_WAIT_REG_MEM_POLL_ADDR(control_ptr(a6xx_backend, seqno)))
|
||||
.add(CP_WAIT_REG_MEM_3(.ref = seqno))
|
||||
.add(CP_WAIT_REG_MEM_4(.mask = ~0))
|
||||
.add(CP_WAIT_REG_MEM_5(.delay_loop_cycles = 16));
|
||||
|
||||
if (CHIP == A6XX) {
|
||||
seqno = event_write<CHIP>(ring, kernel, CACHE_FLUSH_TS, true);
|
||||
seqno = event_write<CHIP>(cs, kernel, CACHE_FLUSH_TS, true);
|
||||
|
||||
OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);
|
||||
OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));
|
||||
OUT_RELOC(ring, control_ptr(a6xx_backend, seqno));
|
||||
OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));
|
||||
fd_pkt7(cs, CP_WAIT_MEM_GTE, 4)
|
||||
.add(CP_WAIT_MEM_GTE_0())
|
||||
.add(CP_WAIT_MEM_GTE_POLL_ADDR(control_ptr(a6xx_backend, seqno)))
|
||||
.add(CP_WAIT_MEM_GTE_3(.ref = seqno));
|
||||
} else {
|
||||
event_write<CHIP>(ring, kernel, CACHE_FLUSH7, false);
|
||||
event_write<CHIP>(cs, kernel, CACHE_FLUSH7, false);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -541,18 +543,19 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],
|
|||
{
|
||||
struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
|
||||
struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
|
||||
struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
|
||||
submit, 0,
|
||||
(enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE));
|
||||
fd_cs cs(fd_submit_new_ringbuffer(submit, 0,
|
||||
(enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE)));
|
||||
|
||||
cs_restore_emit<CHIP>(ring, a6xx_backend);
|
||||
cs_program_emit<CHIP>(ring, kernel);
|
||||
cs_const_emit<CHIP>(ring, kernel, grid);
|
||||
cs_uav_emit<CHIP>(ring, a6xx_backend->dev, kernel);
|
||||
cs_ubo_emit(ring, kernel);
|
||||
cs.attach_bo(a6xx_backend->control_mem);
|
||||
|
||||
OUT_PKT7(ring, CP_SET_MARKER, 1);
|
||||
OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
|
||||
cs_restore_emit<CHIP>(cs, a6xx_backend);
|
||||
cs_program_emit<CHIP>(cs, kernel);
|
||||
cs_const_emit<CHIP>(cs, kernel, grid);
|
||||
cs_uav_emit<CHIP>(cs, a6xx_backend->dev, kernel);
|
||||
cs_ubo_emit(cs, kernel);
|
||||
|
||||
fd_pkt7(cs, CP_SET_MARKER, 1)
|
||||
.add(A6XX_CP_SET_MARKER_0(.mode = RM6_COMPUTE));
|
||||
|
||||
const unsigned *local_size = kernel->local_size;
|
||||
const unsigned *num_groups = grid;
|
||||
|
|
@ -564,35 +567,40 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],
|
|||
work_dim++;
|
||||
}
|
||||
|
||||
OUT_REG(ring, SP_CS_NDRANGE_0(CHIP,
|
||||
.kerneldim = work_dim,
|
||||
.localsizex = local_size[0] - 1,
|
||||
.localsizey = local_size[1] - 1,
|
||||
.localsizez = local_size[2] - 1,
|
||||
));
|
||||
if (CHIP == A7XX) {
|
||||
OUT_REG(ring, A7XX_SP_CS_NDRANGE_7(.localsizex = local_size[0] - 1,
|
||||
.localsizey = local_size[1] - 1,
|
||||
.localsizez = local_size[2] - 1, ));
|
||||
with_crb (cs, 11) {
|
||||
crb.add(SP_CS_NDRANGE_0(CHIP,
|
||||
.kerneldim = work_dim,
|
||||
.localsizex = local_size[0] - 1,
|
||||
.localsizey = local_size[1] - 1,
|
||||
.localsizez = local_size[2] - 1,
|
||||
));
|
||||
|
||||
if (CHIP == A7XX) {
|
||||
crb.add(A7XX_SP_CS_NDRANGE_7(
|
||||
.localsizex = local_size[0] - 1,
|
||||
.localsizey = local_size[1] - 1,
|
||||
.localsizez = local_size[2] - 1,
|
||||
));
|
||||
}
|
||||
|
||||
crb.add(SP_CS_NDRANGE_1(CHIP,
|
||||
.globalsize_x = local_size[0] * num_groups[0],
|
||||
));
|
||||
crb.add(SP_CS_NDRANGE_2(CHIP, 0));
|
||||
crb.add(SP_CS_NDRANGE_3(CHIP,
|
||||
.globalsize_y = local_size[1] * num_groups[1],
|
||||
));
|
||||
crb.add(SP_CS_NDRANGE_4(CHIP, 0));
|
||||
crb.add(SP_CS_NDRANGE_5(CHIP,
|
||||
.globalsize_z = local_size[2] * num_groups[2],
|
||||
));
|
||||
crb.add(SP_CS_NDRANGE_6(CHIP, 0));
|
||||
|
||||
crb.add(SP_CS_KERNEL_GROUP_X(CHIP, 1));
|
||||
crb.add(SP_CS_KERNEL_GROUP_Y(CHIP, 1));
|
||||
crb.add(SP_CS_KERNEL_GROUP_Z(CHIP, 1));
|
||||
}
|
||||
|
||||
OUT_REG(ring, SP_CS_NDRANGE_1(CHIP,
|
||||
.globalsize_x = local_size[0] * num_groups[0],
|
||||
));
|
||||
OUT_REG(ring, SP_CS_NDRANGE_2(CHIP, 0));
|
||||
OUT_REG(ring, SP_CS_NDRANGE_3(CHIP,
|
||||
.globalsize_y = local_size[1] * num_groups[1],
|
||||
));
|
||||
OUT_REG(ring, SP_CS_NDRANGE_4(CHIP, 0));
|
||||
OUT_REG(ring, SP_CS_NDRANGE_5(CHIP,
|
||||
.globalsize_z = local_size[2] * num_groups[2],
|
||||
));
|
||||
OUT_REG(ring, SP_CS_NDRANGE_6(CHIP, 0));
|
||||
|
||||
OUT_REG(ring, SP_CS_KERNEL_GROUP_X(CHIP, 1));
|
||||
OUT_REG(ring, SP_CS_KERNEL_GROUP_Y(CHIP, 1));
|
||||
OUT_REG(ring, SP_CS_KERNEL_GROUP_Z(CHIP, 1));
|
||||
|
||||
if (a6xx_backend->num_perfcntrs > 0) {
|
||||
a6xx_backend->query_mem = fd_bo_new(
|
||||
a6xx_backend->dev,
|
||||
|
|
@ -604,55 +612,55 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],
|
|||
for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
|
||||
const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
|
||||
|
||||
OUT_PKT4(ring, counter->select_reg, 1);
|
||||
OUT_RING(ring, counter->selector);
|
||||
fd_pkt4(cs, 1).add({
|
||||
.reg = counter->select_reg,
|
||||
.value = counter->selector,
|
||||
});
|
||||
}
|
||||
|
||||
OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
|
||||
fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
|
||||
|
||||
/* and snapshot the start values: */
|
||||
for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
|
||||
const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
|
||||
|
||||
OUT_PKT7(ring, CP_REG_TO_MEM, 3);
|
||||
OUT_RING(ring, CP_REG_TO_MEM_0_64B |
|
||||
CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
|
||||
OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start));
|
||||
fd_pkt7(cs, CP_REG_TO_MEM, 3)
|
||||
.add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true))
|
||||
.add(CP_REG_TO_MEM_DEST(query_sample_idx(a6xx_backend, i, start)));
|
||||
}
|
||||
}
|
||||
|
||||
OUT_PKT7(ring, CP_EXEC_CS, 4);
|
||||
OUT_RING(ring, 0x00000000);
|
||||
OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(grid[0]));
|
||||
OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(grid[1]));
|
||||
OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(grid[2]));
|
||||
fd_pkt7(cs, CP_EXEC_CS, 4)
|
||||
.add(CP_EXEC_CS_0())
|
||||
.add(CP_EXEC_CS_1(.ngroups_x = grid[0]))
|
||||
.add(CP_EXEC_CS_2(.ngroups_y = grid[1]))
|
||||
.add(CP_EXEC_CS_3(.ngroups_z = grid[2]));
|
||||
|
||||
OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
|
||||
fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
|
||||
|
||||
if (a6xx_backend->num_perfcntrs > 0) {
|
||||
/* snapshot the end values: */
|
||||
for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
|
||||
const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
|
||||
|
||||
OUT_PKT7(ring, CP_REG_TO_MEM, 3);
|
||||
OUT_RING(ring, CP_REG_TO_MEM_0_64B |
|
||||
CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
|
||||
OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop));
|
||||
fd_pkt7(cs, CP_REG_TO_MEM, 3)
|
||||
.add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true))
|
||||
.add(CP_REG_TO_MEM_DEST(query_sample_idx(a6xx_backend, i, stop)));
|
||||
}
|
||||
|
||||
/* and compute the result: */
|
||||
for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
|
||||
/* result += stop - start: */
|
||||
OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
|
||||
OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
|
||||
OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result)); /* dst */
|
||||
OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result)); /* srcA */
|
||||
OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop)); /* srcB */
|
||||
OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start)); /* srcC */
|
||||
fd_pkt7(cs, CP_MEM_TO_MEM, 9)
|
||||
.add(CP_MEM_TO_MEM_0(.neg_c = true, ._double = true))
|
||||
.add(CP_MEM_TO_MEM_DST(query_sample_idx(a6xx_backend, i, result)))
|
||||
.add(CP_MEM_TO_MEM_SRC_A(query_sample_idx(a6xx_backend, i, result)))
|
||||
.add(CP_MEM_TO_MEM_SRC_B(query_sample_idx(a6xx_backend, i, stop)))
|
||||
.add(CP_MEM_TO_MEM_SRC_C(query_sample_idx(a6xx_backend, i, start)));
|
||||
}
|
||||
}
|
||||
|
||||
cache_flush<CHIP>(ring, kernel);
|
||||
cache_flush<CHIP>(cs, kernel);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ computerator_files = [
|
|||
|
||||
computerator_cpp_args = cpp.get_supported_arguments([
|
||||
'-Wno-array-bounds',
|
||||
'-Wno-c++11-narrowing',
|
||||
'-Wno-narrowing',
|
||||
'-Wno-sign-compare',
|
||||
])
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue