tu: gen8 support
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39167>
This commit is contained in:
Rob Clark 2025-12-16 13:07:45 -08:00 committed by Marge Bot
parent 77e83d1449
commit a9f05399ae
11 changed files with 404 additions and 152 deletions

View file

@ -10,7 +10,7 @@ tu_entrypoints = custom_target(
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'tu',
'--include', 'adreno_common.xml.h',
'--tmpl-prefix', 'tu', '--tmpl-param', 'chip CHIP',
'--tmpl-variants', '<A6XX>', '<A7XX>',
'--tmpl-variants', '<A6XX>', '<A7XX>', '<A8XX>',
'--beta', with_vulkan_beta.to_string(),
'--device-prefix', 'tu_rmv',
],

View file

@ -375,8 +375,12 @@ r2d_src_buffer_unaligned(struct tu_cmd_buffer *cmd,
enum a6xx_format color_format = fmt.fmt;
fixup_src_format(&format, dst_format, &color_format);
uint32_t offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
va &= ~0x3f;
uint32_t offset_texels = 0;
if (CHIP < A8XX) {
offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
va &= ~0x3f;
}
tu_cs_emit_regs(cs, TPL1_A2D_BLT_CNTL(CHIP,
.raw_copy = false,
.type = A6XX_TEX_IMG_BUFFER,
@ -877,18 +881,18 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
.cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
.gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
with_crb (cs, 2 * 5 + 2 * 11) {
with_crb (cs, 2 * 5 + 2 * 12) {
tu6_emit_xs_config<CHIP>(crb, { .vs = vs, .fs = fs });
struct tu_pvtmem_config pvtmem = {};
tu6_emit_xs(crb, cs->device, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
tu6_emit_xs(crb, cs->device, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
}
tu6_emit_xs_constants(cs, MESA_SHADER_VERTEX, vs, vs_iova);
tu6_emit_xs_constants(cs, MESA_SHADER_FRAGMENT, fs, fs_iova);
tu_cs_emit_regs(cs, PC_CNTL(CHIP));
if (CHIP == A7XX) {
if (CHIP >= A7XX) {
tu_cs_emit_regs(cs, VPC_PC_CNTL(CHIP));
}
@ -916,11 +920,16 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
.persp_division_disable = 1,));
tu_cs_emit_regs(cs, GRAS_SU_CNTL(CHIP)); // XXX msaa enable?
if (CHIP >= A8XX) {
tu_cs_emit_regs(cs, GRAS_SU_STEREO_CNTL(CHIP));
}
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL(CHIP));
if (CHIP == A6XX) {
tu_cs_emit_regs(cs, VPC_UNKNOWN_9107(CHIP));
} else {
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP));
if (CHIP == A7XX)
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP));
tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP,
.raster_mode = TYPE_TILED,
@ -1361,7 +1370,16 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
tu_desc_set_tex_line_offset<CHIP>(desc, cmd->state.tiling->tile0.width * cpp);
tu_desc_set_array_slice_offset<CHIP>(desc, 0);
tu_desc_set_depth<CHIP>(desc, 1);
tu_desc_set_addr<CHIP>(desc, cmd->device->physical_device->gmem_base + gmem_offset);
uint64_t va = gmem_offset;
if (CHIP < A8XX) {
/* For gen8, address is simply gmem_offset if tile_mode is gmem
* tiling (TILE6_2)
*/
va += cmd->device->physical_device->gmem_base;
}
tu_desc_set_addr<CHIP>(desc, va);
/* patch the format so that depth/stencil get the right format and swizzle */
tu_desc_set_format<CHIP>(desc, fmt);
@ -1642,6 +1660,10 @@ r3d_setup(struct tu_cmd_buffer *cmd,
tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
.component_enable = aspect_write_mask(dst_format, aspect_mask)));
if (CHIP >= A8XX) {
tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, 0,
.component_write_mask = aspect_write_mask(dst_format, aspect_mask)));
}
tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
@ -4192,11 +4214,17 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
tu_cs_emit_regs(cs,
A6XX_RB_PS_MRT_CNTL(.mrt = mrt_count));
tu_cs_emit_regs(cs, SP_BLEND_CNTL(CHIP));
tu_cs_emit_regs(cs, SP_BLEND_CNTL(CHIP,
.independent_blend_en = true,
));
tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
for (uint32_t i = 0; i < mrt_count; i++) {
tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
.component_enable = COND(clear_rts & (1 << i), 0xf)));
if (CHIP >= A8XX) {
tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, i,
.component_write_mask = COND(clear_rts & (1 << i), 0xf)));
}
}
tu_cs_emit_regs(cs, GRAS_LRZ_CNTL(CHIP, 0));
@ -5247,6 +5275,14 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
src_height += tiling->tile0.height;
}
uint64_t va = gmem_offset;
if (CHIP < A8XX) {
/* For gen8, address is simply gmem_offset if tile_mode is gmem
* tiling (TILE6_2)
*/
va += cmd->device->physical_device->gmem_base;
}
tu_cs_emit_regs(cs,
TPL1_A2D_SRC_TEXTURE_INFO(CHIP,
.color_format = format,
@ -5262,7 +5298,7 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
TPL1_A2D_SRC_TEXTURE_SIZE(CHIP,
.width = src_width,
.height = src_height),
TPL1_A2D_SRC_TEXTURE_BASE(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
TPL1_A2D_SRC_TEXTURE_BASE(CHIP, .qword = va),
TPL1_A2D_SRC_TEXTURE_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
/* sync GMEM writes with CACHE. */

View file

@ -266,9 +266,14 @@ static void
tu_set_render_mode(struct tu_cs *cs, tu_set_render_mode args)
{
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(args.mode) |
COND(args.uses_gmem, A6XX_CP_SET_MARKER_0_USES_GMEM) |
COND(args.shader_uses_rt, A6XX_CP_SET_MARKER_0_SHADER_USES_RT));
if (CHIP >= A8XX) {
tu_cs_emit(cs, A8XX_CP_SET_MARKER_0_MODE(args.mode) |
COND(args.uses_gmem, A8XX_CP_SET_MARKER_0_USES_GMEM));
} else {
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(args.mode) |
COND(args.uses_gmem, A6XX_CP_SET_MARKER_0_USES_GMEM) |
COND(args.shader_uses_rt, A6XX_CP_SET_MARKER_0_SHADER_USES_RT));
}
}
/* This workaround, copied from the blob, seems to ensure that the BVH node
@ -347,7 +352,7 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
/* Invalidating UCHE seems to also invalidate CCHE */
!(flushes & TU_CMD_FLAG_CACHE_INVALIDATE))
tu_cs_emit_pkt7(cs, CP_CCHE_INVALIDATE, 0);
if (CHIP >= A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
if (CHIP == A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
cmd_buffer->device->physical_device->info->props.has_rt_workaround)
tu_emit_rt_workaround<CHIP>(cmd_buffer, cs);
if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
@ -523,16 +528,24 @@ emit_vpc_attr_buf(struct tu_cs *cs, struct tu_device *dev, bool gmem)
if (!dev->physical_device->info->props.has_gmem_vpc_attr_buf)
return;
tu_crb crb(cs, 9);
const struct fd6_gmem_config *cfg = gmem ?
&dev->physical_device->config_gmem :
&dev->physical_device->config_sysmem;
tu_cs_emit_regs(cs,
VPC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size),
VPC_ATTR_BUF_GMEM_BASE(CHIP, cfg->vpc_attr_buf_offset),
);
crb.add(VPC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
crb.add(VPC_ATTR_BUF_GMEM_BASE(CHIP, cfg->vpc_attr_buf_offset));
crb.add(PC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
tu_cs_emit_regs(cs, PC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
if (CHIP >= A8XX) {
crb.add(VPC_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_pos_buf_size));
crb.add(VPC_POS_BUF_GMEM_BASE(CHIP, cfg->vpc_pos_buf_offset));
crb.add(PC_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_pos_buf_size));
crb.add(VPC_BV_POS_BUF_GMEM_BASE(CHIP, cfg->vpc_bv_pos_buf_offset));
crb.add(VPC_BV_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_bv_pos_buf_size));
crb.add(PC_BV_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_bv_pos_buf_size));
}
}
template <chip CHIP>
@ -575,7 +588,14 @@ emit_rb_ccu_cntl(struct tu_cs *cs, struct tu_device *dev, bool gmem)
enum a6xx_ccu_cache_size color_cache_size = !gmem ? CCU_CACHE_SIZE_FULL : !gmem ? CCU_CACHE_SIZE_FULL :
(a6xx_ccu_cache_size)(dev->physical_device->info->props.gmem_ccu_color_cache_fraction);
if (CHIP == A7XX) {
if (CHIP == A8XX) {
tu_cs_emit_regs(cs, RB_CCU_CACHE_CNTL(CHIP,
.depth_cache_size = (enum a6xx_ccu_cache_size)cfg->depth_cache_fraction,
.depth_offset = cfg->depth_ccu_offset,
.color_cache_size = (enum a6xx_ccu_cache_size)cfg->color_cache_fraction,
.color_offset = cfg->color_ccu_offset,
));
} else if (CHIP == A7XX) {
tu_cs_emit_regs(cs, RB_CCU_CACHE_CNTL(CHIP,
.depth_offset_hi = depth_offset_hi,
.color_offset_hi = color_offset_hi,
@ -584,7 +604,7 @@ emit_rb_ccu_cntl(struct tu_cs *cs, struct tu_device *dev, bool gmem)
.color_cache_size = color_cache_size,
.color_offset = color_offset
));
} else {
} else if (CHIP == A6XX) {
tu_cs_emit_regs(cs, RB_CCU_CNTL(CHIP,
.gmem_fast_clear_disable =
!dev->physical_device->info->props.has_gmem_fast_clear,
@ -865,24 +885,14 @@ tu6_emit_bin_size(struct tu_cs *cs,
uint32_t bin_h,
struct tu_bin_size_params &&p)
{
if (CHIP == A6XX) {
tu_cs_emit_regs(
cs, GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
.binh = bin_h,
.render_mode = p.render_mode,
.force_lrz_write_dis = p.force_lrz_write_dis,
.buffers_location = p.buffers_location,
.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, ));
} else {
tu_cs_emit_regs(cs,
GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
.binh = bin_h,
.render_mode = p.render_mode,
.force_lrz_write_dis = p.force_lrz_write_dis,
.lrz_feedback_zmode_mask =
p.lrz_feedback_zmode_mask,
.force_lrz_dis = p.force_lrz_dis));
}
tu_cs_emit_regs(
cs, GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
.binh = bin_h,
.render_mode = p.render_mode,
.force_lrz_write_dis = p.force_lrz_write_dis,
.buffers_location = p.buffers_location,
.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
.force_lrz_dis = p.force_lrz_dis));
tu_cs_emit_regs(cs, RB_CNTL(CHIP,
.binw = bin_w,
@ -892,6 +902,32 @@ tu6_emit_bin_size(struct tu_cs *cs,
.buffers_location = p.buffers_location,
.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, ));
if (CHIP >= A8XX) {
tu_crb crb = cs->crb(13);
crb.add(TPL1_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
crb.add(TPL1_A2D_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
crb.add(SP_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
for (int i = 0; i < 8; i++) {
// gen8 TODO: 0x0 if !cbuf_cpp[i]
crb.add(RB_MRT_GMEM_DIMENSION_REG(CHIP, i,
.width = bin_w,
.height = bin_h,
));
}
// gen8 TODO: 0x0 if !zsbuf_cpp[0]
crb.add(RB_DEPTH_GMEM_DIMENSION(CHIP,
.width = bin_w,
.height = bin_h,
));
// gen8 TODO: 0x0 if !(zsbuf_cpp[0] || zsbuf_cpp[1])
crb.add(RB_STENCIL_GMEM_DIMENSION(CHIP,
.width = bin_w,
.height = bin_h,
));
}
/* no flag for RB_RESOLVE_CNTL_3... */
tu_cs_emit_regs(cs, RB_RESOLVE_CNTL_3(CHIP, .binw = bin_w, .binh = bin_h));
}
@ -1682,7 +1718,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
frag_offsets[i].y = y1 - y1 / tile->frag_areas[view].height;
}
with_crb (cs, 6) {
with_crb (cs, 26) {
crb.add(GRAS_BIN_FOVEAT(CHIP,
.binscaleen = bin_scale_en,
.xscale_0 = (enum a7xx_bin_scale)util_logbase2(frag_areas[0].width),
@ -1697,24 +1733,46 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
.yscale_4 = (enum a7xx_bin_scale)util_logbase2(frag_areas[4].height),
.xscale_5 = (enum a7xx_bin_scale)util_logbase2(frag_areas[5].width),
.yscale_5 = (enum a7xx_bin_scale)util_logbase2(frag_areas[5].height)))
.add(GRAS_BIN_FOVEAT_OFFSET_0(CHIP,
.xoffset_0 = frag_offsets[0].x,
.xoffset_1 = frag_offsets[1].x,
.xoffset_2 = frag_offsets[2].x))
.add(GRAS_BIN_FOVEAT_OFFSET_1(CHIP,
.xoffset_3 = frag_offsets[3].x,
.xoffset_4 = frag_offsets[4].x,
.xoffset_5 = frag_offsets[5].x))
.add(GRAS_BIN_FOVEAT_OFFSET_2(CHIP,
.yoffset_0 = frag_offsets[0].y,
.yoffset_1 = frag_offsets[1].y,
.yoffset_2 = frag_offsets[2].y))
.add(GRAS_BIN_FOVEAT_OFFSET_3(CHIP,
.yoffset_3 = frag_offsets[3].y,
.yoffset_4 = frag_offsets[4].y,
.yoffset_5 = frag_offsets[5].y))
.add(RB_BIN_FOVEAT(CHIP,
.add(RB_BIN_FOVEAT(CHIP,
.binscaleen = bin_scale_en));
if (CHIP >= A8XX) {
for (unsigned i = 0; i < MAX_HW_SCALED_VIEWS; i++) {
crb.add(GRAS_BIN_FOVEAT_XY_OFFSET(CHIP, i,
.xoffset = frag_offsets[i].x,
.yoffset = frag_offsets[i].y,
));
crb.add(RB_BIN_FOVEAT_XY_OFFSET(CHIP, i,
.xoffset = frag_offsets[i].x,
.yoffset = frag_offsets[i].y,
));
crb.add(GRAS_BIN_FOVEAT_XY_FDM_OFFSET(CHIP, i,
.xoffset = frag_offsets[i].x,
.yoffset = frag_offsets[i].y,
));
crb.add(RB_BIN_FOVEAT_XY_FDM_OFFSET(CHIP, i,
.xoffset = frag_offsets[i].x,
.yoffset = frag_offsets[i].y,
));
}
} else {
crb.add(GRAS_BIN_FOVEAT_OFFSET_0(CHIP,
.xoffset_0 = frag_offsets[0].x,
.xoffset_1 = frag_offsets[1].x,
.xoffset_2 = frag_offsets[2].x))
.add(GRAS_BIN_FOVEAT_OFFSET_1(CHIP,
.xoffset_3 = frag_offsets[3].x,
.xoffset_4 = frag_offsets[4].x,
.xoffset_5 = frag_offsets[5].x))
.add(GRAS_BIN_FOVEAT_OFFSET_2(CHIP,
.yoffset_0 = frag_offsets[0].y,
.yoffset_1 = frag_offsets[1].y,
.yoffset_2 = frag_offsets[2].y))
.add(GRAS_BIN_FOVEAT_OFFSET_3(CHIP,
.yoffset_3 = frag_offsets[3].y,
.yoffset_4 = frag_offsets[4].y,
.yoffset_5 = frag_offsets[5].y));
}
}
} else {
@ -2017,7 +2075,7 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
{
const struct tu_physical_device *phys_dev = dev->physical_device;
if (CHIP >= A7XX) {
if (CHIP == A7XX) {
/* On A7XX, RB_CCU_CNTL was broken into two registers, RB_CCU_CNTL which has
* static properties that can be set once, this requires a WFI to take effect.
* While the newly introduced register RB_CCU_CACHE_CNTL has properties that may
@ -2071,13 +2129,15 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
}
if (dev->physical_device->info->props.has_attachment_shading_rate) {
tu_cs_emit_write_reg(cs, REG_A7XX_GRAS_LRZ_QUALITY_LOOKUP_TABLE(0),
fd_gras_shading_rate_lut(0));
tu_cs_emit_write_reg(cs, REG_A7XX_GRAS_LRZ_QUALITY_LOOKUP_TABLE(1),
fd_gras_shading_rate_lut(1));
tu_cs_emit_regs(cs, GRAS_LRZ_QUALITY_LOOKUP_TABLE_REG(CHIP, 0,
fd_gras_shading_rate_lut(0)));
tu_cs_emit_regs(cs, GRAS_LRZ_QUALITY_LOOKUP_TABLE_REG(CHIP, 1,
fd_gras_shading_rate_lut(1)));
}
tu_cs_emit_write_reg(cs, REG_A6XX_SP_NC_MODE_CNTL_2, 0);
if (CHIP < A8XX) {
tu_cs_emit_write_reg(cs, REG_A6XX_SP_NC_MODE_CNTL_2, 0);
}
tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_SHADER_MASK, 0x3f);
if (CHIP == A6XX && !cs->device->physical_device->info->props.is_a702)
tu_cs_emit_regs(cs, TPL1_UNKNOWN_B605(CHIP, .dword = 0x44));
@ -2113,7 +2173,6 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
tu_cs_emit_regs(cs, RB_UNKNOWN_88F0(CHIP));
}
tu_cs_emit_regs(cs, VPC_REPLACE_MODE_CNTL(CHIP, false));
tu_cs_emit_regs(cs, VPC_ROTATION_CNTL(CHIP));
@ -2129,8 +2188,12 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
tu_cs_emit_regs(cs, VPC_UNKNOWN_9210(CHIP));
tu_cs_emit_regs(cs, VPC_UNKNOWN_9211(CHIP));
}
tu_cs_emit_regs(cs, VPC_LB_MODE_CNTL(CHIP));
tu_cs_emit_regs(cs, PC_CONTEXT_SWITCH_GFX_PREEMPTION_MODE(CHIP));
if (CHIP < A8XX) {
tu_cs_emit_regs(cs, VPC_LB_MODE_CNTL(CHIP));
tu_cs_emit_regs(cs, PC_CONTEXT_SWITCH_GFX_PREEMPTION_MODE(CHIP));
}
tu_cs_emit_regs(cs, TPL1_MODE_CNTL(CHIP, .isammode = ISAMMODE_GL,
.texcoordroundmode = dev->instance->use_tex_coord_round_nearest_even_mode
? COORD_ROUND_NEAREST_EVEN
@ -2143,21 +2206,26 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
tu_cs_emit_write_reg(cs, REG_A6XX_VFD_RENDER_MODE, 0x00000000);
tu_cs_emit_regs(cs, A6XX_RB_ALPHA_TEST_CNTL()); /* always disable alpha test */
if (CHIP >= A8XX)
tu_cs_emit_regs(cs, SP_ALPHA_TEST_CNTL(CHIP));
tu_cs_emit_regs(cs, A6XX_TPL1_GFX_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor)));
tu_cs_emit_regs(cs, A6XX_TPL1_CS_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor)));
/* BR-only registers */
if (CHIP >= A7XX)
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
CP_COND_REG_EXEC_0_BR);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL,
phys_dev->info->magic.RB_DBG_ECO_CNTL);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_RBP_CNTL,
phys_dev->info->magic.RB_RBP_CNTL);
if (CHIP >= A7XX) {
tu_cs_emit_regs(cs, RB_UNKNOWN_8E09(CHIP, 0x7));
tu_cond_exec_end(cs);
/* non-ctx regs programmed by KMD (and blocked from UMD) on gen8+ */
if (CHIP < A8XX) {
if (CHIP == A7XX)
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
CP_COND_REG_EXEC_0_BR);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL,
phys_dev->info->magic.RB_DBG_ECO_CNTL);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_RBP_CNTL,
phys_dev->info->magic.RB_RBP_CNTL);
if (CHIP == A7XX) {
tu_cs_emit_regs(cs, RB_UNKNOWN_8E09(CHIP, 0x7));
tu_cond_exec_end(cs);
}
}
if (CHIP == A7XX) {
@ -2170,12 +2238,12 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
if (CHIP >= A7XX) {
/* Blob sets these two per draw. */
tu_cs_emit_regs(cs, PC_HS_BUFFER_SIZE(CHIP, TU_TESS_PARAM_SIZE));
tu_cs_emit_regs(cs, PC_HS_BUFFER_SIZE(CHIP, TU_TESS<CHIP>::PARAM_SIZE));
/* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes)
* but the meaning of this additional space is not known,
* so we play safe and don't add it.
*/
tu_cs_emit_regs(cs, PC_TF_BUFFER_SIZE(CHIP, TU_TESS_FACTOR_SIZE));
tu_cs_emit_regs(cs, PC_TF_BUFFER_SIZE(CHIP, TU_TESS<CHIP>::FACTOR_SIZE));
}
/* There is an optimization to skip executing draw states for draws with no
@ -2224,7 +2292,7 @@ tu_emit_bin_preamble(struct tu_device *dev, struct tu_cs *cs, bool bv)
emit_rb_ccu_cntl<CHIP>(cs, dev, true);
emit_vpc_attr_buf<CHIP>(cs, dev, true);
if (CHIP == A7XX && !bv) {
if (CHIP >= A7XX && !bv) {
tu7_emit_tile_render_begin_regs<CHIP>(cs);
}
@ -2742,7 +2810,16 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
} else {
tu_desc_set_array_slice_offset<CHIP>(dst, 0);
}
tu_desc_set_addr<CHIP>(dst, cmd->device->physical_device->gmem_base + gmem_offset);
uint64_t va = gmem_offset;
if (CHIP < A8XX) {
/* For gen8, address is simply gmem_offset if tile_mode is gmem
* tiling (TILE6_2)
*/
va += cmd->device->physical_device->gmem_base;
}
tu_desc_set_addr<CHIP>(dst, va);
memcpy(&texture.map[i * FDL6_TEX_CONST_DWORDS], dst, sizeof(dst));
}
@ -3070,7 +3147,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
: LRZ_FEEDBACK_NONE,
});
if (CHIP == A7XX) {
if (CHIP >= A7XX) {
tu7_emit_sysmem_render_begin_regs<CHIP>(cmd, cs);
}
@ -5233,7 +5310,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
/* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */
tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH(0)) |
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(tu_scratch_reg<CHIP>(0).reg) |
COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) |
0x40000 | /* ??? */
CP_MEM_TO_REG_0_UNK31 |
@ -5242,14 +5319,14 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
if (offset) {
tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH(0)) |
tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(tu_scratch_reg<CHIP>(0).reg) |
CP_REG_RMW_0_SRC1_ADD);
tu_cs_emit(cs, 0xffffffff);
tu_cs_emit(cs, -offset);
}
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH(0)) |
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(tu_scratch_reg<CHIP>(0).reg) |
CP_REG_TO_MEM_0_CNT(1));
tu_cs_emit_qw(cs, vk_buffer_address(&buf->vk, counter_buffer_offset));
}
@ -7471,13 +7548,14 @@ tu6_emit_shared_consts(struct tu_cs *cs,
}
}
template <chip CHIP>
static void
tu7_emit_shared_preamble_consts(
struct tu_cs *cs,
const struct tu_push_constant_range *shared_consts,
uint32_t *push_constants)
{
tu_cs_emit_pkt4(cs, REG_A7XX_SP_SHARED_CONSTANT_GFX(shared_consts->lo_dwords),
tu_cs_emit_pkt4(cs, SP_SHARED_CONSTANT_GFX_REG(CHIP, shared_consts->lo_dwords).reg,
shared_consts->dwords);
tu_cs_emit_array(cs, push_constants + shared_consts->lo_dwords,
shared_consts->dwords);
@ -7508,6 +7586,7 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
return dwords;
}
template <chip CHIP>
static struct tu_draw_state
tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
{
@ -7527,7 +7606,7 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
tu6_emit_shared_consts(&cs, shared_consts, cmd->push_constants, compute);
} else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
tu7_emit_shared_preamble_consts(&cs, shared_consts, cmd->push_constants);
tu7_emit_shared_preamble_consts<CHIP>(&cs, shared_consts, cmd->push_constants);
}
if (compute) {
@ -8118,7 +8197,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
.provoking_vtx_last = provoking_vtx_last)
.value;
tu_cs_emit_regs(cs, PC_CNTL(CHIP, .dword = primitive_cntl_0));
if (CHIP == A7XX) {
if (CHIP >= A7XX) {
tu_cs_emit_regs(cs, VPC_PC_CNTL(CHIP, .dword = primitive_cntl_0));
}
}
@ -8156,11 +8235,15 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
const struct tu_shader *tcs = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
/* maximum number of patches that can fit in tess factor/param buffers */
uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
uint32_t subdraw_size = MIN2(TU_TESS<CHIP>::FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
TU_TESS<CHIP>::PARAM_SIZE / (tcs->variant->output_size * 4));
/* convert from # of patches to draw count */
subdraw_size *= cmd->vk.dynamic_graphics_state.ts.patch_control_points;
/* For gen8 tess_bo is sized for two draws, adjust subdraw size accordingly: */
if (CHIP >= A8XX)
subdraw_size /= 2;
tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
tu_cs_emit(cs, subdraw_size);
}
@ -8213,7 +8296,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
}
if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
cmd->state.shader_const = tu_emit_consts(cmd, false);
cmd->state.shader_const = tu_emit_consts<CHIP>(cmd, false);
if (dirty & TU_CMD_DIRTY_DESC_SETS)
tu6_emit_descriptor_sets<CHIP>(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
@ -9120,7 +9203,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
tu_emit_cache_flush<CHIP>(cmd);
/* note: no reason to have this in a separate IB */
tu_cs_emit_state_ib(cs, tu_emit_consts(cmd, true));
tu_cs_emit_state_ib(cs, tu_emit_consts<CHIP>(cmd, true));
tu_emit_compute_driver_params<CHIP>(cmd, cs, info);

View file

@ -802,4 +802,15 @@ private:
#define with_crb(...) \
for (tu_crb crb(__VA_ARGS__); crb.first; crb.first = false)
/* Return the register pair for CP scratch register `idx` (optionally with
 * an initial value).  On a8xx+ the scratch registers exposed to the UMD are
 * the "global" variant, so select the register bank by generation.
 */
template <chip CHIP>
static inline fd_reg_pair
tu_scratch_reg(int idx, uint32_t val = 0)
{
   return (CHIP >= A8XX) ? CP_SCRATCH_GLOBAL_REG(CHIP, idx, val)
                         : CP_SCRATCH_REG(CHIP, idx, val);
}
#endif /* TU_CS_H */

View file

@ -911,12 +911,17 @@ tu_get_physical_device_properties_1_2(struct tu_physical_device *pdevice,
p->roundingModeIndependence =
VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
p->shaderDenormFlushToZeroFloat16 = true;
p->shaderDenormPreserveFloat16 = false;
if (pdevice->info->chip >= A8XX) {
p->shaderDenormFlushToZeroFloat16 = false;
p->shaderDenormPreserveFloat16 = true;
} else {
p->shaderDenormFlushToZeroFloat16 = true;
p->shaderDenormPreserveFloat16 = false;
}
p->shaderRoundingModeRTEFloat16 = true;
p->shaderRoundingModeRTZFloat16 = false;
p->shaderSignedZeroInfNanPreserveFloat16 = true;
p->shaderDenormFlushToZeroFloat32 = true;
/* FP32 denorm preserve has to be emulated via soft-float. Normal
@ -1579,7 +1584,8 @@ tu_physical_device_init(struct tu_physical_device *device,
switch (fd_dev_gen(&device->dev_id)) {
case 6:
case 7: {
case 7:
case 8: {
device->dev_info = info;
device->info = &device->dev_info;
@ -2046,6 +2052,7 @@ tu_GetPhysicalDeviceFragmentShadingRatesKHR(
uint32_t *pFragmentShadingRateCount,
VkPhysicalDeviceFragmentShadingRateKHR *pFragmentShadingRates)
{
VK_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice);
VK_OUTARRAY_MAKE_TYPED(VkPhysicalDeviceFragmentShadingRateKHR, out,
pFragmentShadingRates, pFragmentShadingRateCount);
@ -2063,6 +2070,9 @@ tu_GetPhysicalDeviceFragmentShadingRatesKHR(
append_rate(4, 4, VK_SAMPLE_COUNT_1_BIT);
append_rate(4, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT);
/* Apparently hw didn't actually have this rate in a7xx: */
if (physical_device->info->chip >= A8XX)
append_rate(2, 4, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT);
append_rate(2, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
append_rate(2, 1, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
append_rate(1, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
@ -2686,6 +2696,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
case 7:
vk_device_dispatch_table_from_entrypoints(
&dispatch_table, &tu_device_entrypoints_a7xx, false);
break;
case 8:
/* gen8 TODO: */
tu_env.debug |= TU_DEBUG_NOLRZ; /* WRITE iova faults from UCHE */
tu_env.debug |= TU_DEBUG_FLUSHALL; /* dEQP-VK.draw.\*from_compute\* */
vk_device_dispatch_table_from_entrypoints(
&dispatch_table, &tu_device_entrypoints_a8xx, false);
}
vk_device_dispatch_table_from_entrypoints(
@ -2954,8 +2971,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
goto fail_prepare_perfcntrs_pass_cs;
}
/* TODO: a8xx */
tu_cs_emit_regs(&sub_cs, CP_SCRATCH_REG(A6XX, PERF_CNTRS_REG, 1 << i));
tu_cs_emit_regs(&sub_cs, TU_CALLX(device, tu_scratch_reg)(PERF_CNTRS_REG, 1 << i));
tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0);
device->perfcntrs_pass_cs_entries[i] =

View file

@ -375,10 +375,6 @@ struct tu_device
struct tu_suballocator vis_stream_suballocator;
mtx_t vis_stream_suballocator_mtx;
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
/* Lazily allocated, protected by the device mutex. */
struct tu_bo *tess_bo;
@ -500,6 +496,25 @@ struct tu_device
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
/* Per-generation tess factor/param buffer sizing, selected by chip range
 * partial specialization.  Used as TU_TESS<CHIP>::FACTOR_SIZE (etc.) when
 * allocating/addressing the lazily-created tess_bo.
 */
template <chip_range_support>
struct TU_TESS;

/* a6xx/a7xx sizing */
template <chip CHIP>
struct TU_TESS<chip_range(CHIP <= A7XX)> {
   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
   static const size_t FACTOR_SIZE = 8 * 1024;
   static const size_t PARAM_SIZE = 128 * 1024;
   static const size_t BO_SIZE = FACTOR_SIZE + PARAM_SIZE;
};

/* a8xx+ sizing */
template <chip CHIP>
struct TU_TESS<chip_range(CHIP >= A8XX)> {
   /* for gen8, buffers are sized for two draws: */
   static const size_t FACTOR_SIZE = 0x4040;
   static const size_t PARAM_SIZE = 0x40000;
   static const size_t BO_SIZE = FACTOR_SIZE + PARAM_SIZE;
};
struct tu_device_memory
{
struct vk_device_memory vk;

View file

@ -109,7 +109,7 @@ template <chip CHIP>
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
tu_crb crb = cs->crb(7);
tu_crb crb = cs->crb(8);
if (!depth_image) {
crb.add(GRAS_LRZ_BUFFER_BASE(CHIP, 0))
@ -121,6 +121,10 @@ tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
crb.add(GRAS_LRZ_CB_CNTL(CHIP));
}
if (CHIP >= A8XX) {
crb.add(GRAS_LRZ_BUFFER_SLICE_PITCH(CHIP));
}
return;
}
@ -142,6 +146,12 @@ tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
crb.add(GRAS_LRZ_CB_CNTL(CHIP, .double_buffer_pitch =
depth_image->lrz_layout.lrz_buffer_size));
}
if (CHIP >= A8XX) {
crb.add(GRAS_LRZ_BUFFER_SLICE_PITCH(CHIP,
depth_image->lrz_layout.lrz_slice_pitch
));
}
}
static void

View file

@ -1167,6 +1167,7 @@ tu6_emit_vs_params(struct tu_cs *cs,
ARRAY_SIZE(vs_params), vs_params);
}
template <chip CHIP>
static void
tu_get_tess_iova(struct tu_device *dev,
uint64_t *tess_factor_iova,
@ -1176,14 +1177,14 @@ tu_get_tess_iova(struct tu_device *dev,
if (!dev->tess_bo) {
mtx_lock(&dev->mutex);
if (!dev->tess_bo) {
tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS<CHIP>::BO_SIZE,
TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
}
mtx_unlock(&dev->mutex);
}
*tess_factor_iova = dev->tess_bo->iova;
*tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
*tess_param_iova = dev->tess_bo->iova + TU_TESS<CHIP>::FACTOR_SIZE;
}
static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
@ -1235,7 +1236,7 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
patch_control_points);
uint64_t tess_factor_iova, tess_param_iova;
tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
tu_get_tess_iova<CHIP>(dev, &tess_factor_iova, &tess_param_iova);
uint32_t hs_params[HS_PARAMS_SIZE] = {
vs->variant->output_size * patch_control_points * 4, /* hs primitive stride */
@ -1289,6 +1290,7 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
tu_cs_emit(cs, wave_input_size);
}
template <chip CHIP>
static void
tu6_emit_geom_tess_consts(struct tu_cs *cs,
const struct ir3_shader_variant *vs,
@ -1305,7 +1307,7 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
if (hs) {
uint64_t tess_factor_iova, tess_param_iova;
tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
tu_get_tess_iova<CHIP>(dev, &tess_factor_iova, &tess_param_iova);
uint32_t ds_params[8] = {
gs ? ds->output_size * gs->gs.vertices_in * 4 : 0, /* ds primitive stride */
@ -1407,7 +1409,7 @@ tu6_emit_program_config(struct tu_cs *cs,
}
if (gs || hs) {
tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
tu6_emit_geom_tess_consts<CHIP>(cs, vs, hs, ds, gs);
}
}
@ -2531,7 +2533,11 @@ template <chip CHIP>
static unsigned
tu6_viewport_nregs(const struct vk_viewport_state *vp)
{
return 10 * vp->viewport_count + 3;
if (CHIP >= A8XX) {
return 12 * vp->viewport_count + 1;
} else {
return 10 * vp->viewport_count + 3;
}
}
template <chip CHIP>
@ -2636,7 +2642,10 @@ tu6_emit_viewport(struct tu_cs *cs,
crb.add(GRAS_CL_VIEWPORT_ZCLAMP_MIN(CHIP, i, zmin));
crb.add(GRAS_CL_VIEWPORT_ZCLAMP_MAX(CHIP, i, zmax));
if (i == 0) {
if (CHIP >= A8XX) {
crb.add(RB_VIEWPORT_ZCLAMP_MIN_REG(CHIP, i, zmin));
crb.add(RB_VIEWPORT_ZCLAMP_MAX_REG(CHIP, i, zmax));
} else if (i == 0) {
/* TODO: what to do about this and multi viewport ? */
crb.add(RB_VIEWPORT_ZCLAMP_MIN(CHIP, zmin));
crb.add(RB_VIEWPORT_ZCLAMP_MAX(CHIP, zmax));
@ -3232,7 +3241,7 @@ tu6_blend_size(struct tu_device *dev,
{
unsigned num_rts = alpha_to_coverage_enable ?
MAX2(cb->attachment_count, 1) : cb->attachment_count;
return 8 + 3 * num_rts;
return 8 + 5 * num_rts;
}
template <chip CHIP>
@ -3281,7 +3290,8 @@ tu6_emit_blend(struct tu_cs *cs,
.dual_color_in_enable =
dual_src_blend,
.alpha_to_coverage =
alpha_to_coverage_enable));
alpha_to_coverage_enable,
.alpha_to_one = alpha_to_one_enable));
/* TODO: set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled?
*
* We could also set blend_reads_dest more conservatively, but it didn't show
@ -3340,10 +3350,19 @@ tu6_emit_blend(struct tu_cs *cs,
.alpha_src_factor = src_alpha_factor,
.alpha_blend_opcode = alpha_op,
.alpha_dest_factor = dst_alpha_factor));
if (CHIP >= A8XX) {
tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, remapped_idx,
.color_blend_en = blend_enable,
.alpha_blend_en = blend_enable,
.component_write_mask = att->write_mask));
}
} else {
tu_cs_emit_regs(cs,
A6XX_RB_MRT_CONTROL(remapped_idx,),
A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
tu_cs_emit_regs(cs,
A6XX_RB_MRT_CONTROL(remapped_idx,),
A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
if (CHIP >= A8XX) {
tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, remapped_idx,));
}
}
}
tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_remapped_rts));
@ -3400,7 +3419,7 @@ tu6_rast_size(struct tu_device *dev,
} else if (CHIP == A6XX) {
return 15 + (dev->physical_device->info->props.has_legacy_pipeline_shading_rate ? 8 : 0);
} else {
return 27;
return 30;
}
}
@ -3428,6 +3447,13 @@ tu6_emit_rast(struct tu_cs *cs,
.rendertargetindexincr = multiview,
.viewportindexincr = multiview && per_view_viewport));
if (CHIP >= A8XX) {
tu_cs_emit_regs(cs, GRAS_SU_STEREO_CNTL(CHIP,
.rendertargetindexincr = multiview,
.viewportindexincr = multiview && per_view_viewport,
));
}
bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
tu_cs_emit_regs(cs,
@ -3437,7 +3463,7 @@ tu6_emit_rast(struct tu_cs *cs,
/* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
.z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
.zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
.vp_clip_code_ignore = 1));;
.vp_clip_code_ignore = 1));
enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
@ -3446,7 +3472,10 @@ tu6_emit_rast(struct tu_cs *cs,
tu_cs_emit_regs(cs,
PC_DGEN_RAST_CNTL(CHIP, polygon_mode));
if (CHIP == A7XX || cs->device->physical_device->info->props.is_a702) {
if (CHIP >= A8XX)
tu_cs_emit_regs(cs, GRAS_RAST_CNTL(CHIP, polygon_mode));
if (CHIP >= A7XX || cs->device->physical_device->info->props.is_a702) {
tu_cs_emit_regs(cs, VPC_PS_RAST_CNTL(CHIP, polygon_mode));
}
@ -3457,9 +3486,11 @@ tu6_emit_rast(struct tu_cs *cs,
tu_cs_emit_regs(cs, VPC_UNKNOWN_9107(CHIP,
.raster_discard = rs->rasterizer_discard_enable));
} else {
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP,
.stream = rs->rasterization_stream,
.discard = rs->rasterizer_discard_enable));
if (CHIP == A7XX) {
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP,
.stream = rs->rasterization_stream,
.discard = rs->rasterizer_discard_enable));
}
bool conservative_ras_en =
rs->conservative_mode ==
@ -3641,7 +3672,9 @@ tu6_emit_rb_depth_cntl(struct tu_cs *cs,
.z_read_enable =
(ds->depth.test_enable && (zfunc != FUNC_NEVER && zfunc != FUNC_ALWAYS)) ||
ds->depth.bounds_test.enable,
.z_bounds_enable = ds->depth.bounds_test.enable));
.z_bounds_enable = ds->depth.bounds_test.enable,
.o_depth_01_clamp_en = CHIP >= A8XX,
));
tu_cs_emit_regs(cs, GRAS_SU_DEPTH_CNTL(CHIP, depth_test));
} else {
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
@ -3775,6 +3808,7 @@ tu6_emit_fragment_shading_rate(struct tu_cs *cs,
.frag_size_y = util_logbase2(frag_height),
.combiner_op_1 = (a6xx_fsr_combiner) fsr->combiner_ops[0],
.combiner_op_2 = (a6xx_fsr_combiner) fsr->combiner_ops[1],
.combiner_clamp_mode = (CHIP >= A8XX) ? FSR_COMBINER_CLAMP_16_SAMP : FSR_COMBINER_CLAMP_4x4,
.attachment_fsr_enable = enable_att_fsr,
.primitive_fsr_enable = enable_prim_fsr));
}

View file

@ -541,6 +541,18 @@ is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
}
/* Emit the synchronization required before reading HW counter registers
 * with CP_REG_TO_MEM.  A WFI suffices on earlier gens; on a8xx a
 * CP_BARRIER is additionally emitted (NOTE(review): presumably to order
 * the counter reads against outstanding work — confirm against kgsl/fw
 * docs).
 */
template <chip CHIP>
static inline void
emit_counter_barrier(struct tu_cs *cs)
{
   tu_cs_emit_wfi(cs);

   if (CHIP < A8XX)
      return;

   tu_cs_emit_pkt7(cs, CP_BARRIER, 1);
   tu_cs_emit(cs, 1);
}
/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
@ -1165,7 +1177,7 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_COMPUTE_CTRS);
}
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(IAVERTICES)) |
@ -1174,12 +1186,13 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, begin_iova);
}
template <chip CHIP>
static void
emit_perfcntrs_pass_start(bool has_pred_bit, struct tu_cs *cs, uint32_t pass)
{
tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
REG_A6XX_CP_SCRATCH(PERF_CNTRS_REG)) |
tu_scratch_reg<CHIP>(PERF_CNTRS_REG).reg) |
A6XX_CP_REG_TEST_0_BIT(pass) |
(has_pred_bit ?
A6XX_CP_REG_TEST_0_PRED_BIT(TU_PREDICATE_PERFCNTRS) : 0) |
@ -1222,7 +1235,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
* stream below CP_COND_REG_EXEC.
*/
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
/* Keep preemption disabled for the duration of this query. This way
* changes in perfcounter values should only apply to work done during
@ -1242,7 +1255,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
@ -1256,7 +1269,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
tu_cond_exec_end(cs);
last_pass = ~0;
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
struct tu_perf_query_raw_data *data = &perf_query->data[i];
@ -1266,7 +1279,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
@ -1291,7 +1304,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
/* Keep preemption disabled for the duration of this query. This way
* changes in perfcounter values should only apply to work done during
@ -1311,7 +1324,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit(cs, countable);
}
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
/* Collect the enabled perfcntrs. Emit CP_ALWAYS_COUNT collection last, if necessary. */
for (uint32_t i = 0; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
@ -1383,7 +1396,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(CINVOCATIONS)) |
@ -1539,7 +1552,7 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
.write_accum_sample_count_diff = true).value);
tu_cs_emit_qw(cs, begin_iova);
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
if (cmdbuf->device->physical_device->info->props.has_generic_clear) {
/* If the next renderpass uses the same depth attachment, clears it
@ -1651,7 +1664,7 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_COMPUTE_CTRS);
}
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(IAVERTICES)) |
@ -1705,7 +1718,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
/* Wait for the profiled work to finish so that collected counter values
* are as accurate as possible.
*/
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
struct tu_perf_query_raw_data *data = &perf_query->data[i];
@ -1715,7 +1728,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
@ -1731,7 +1744,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
tu_cond_exec_end(cs);
last_pass = ~0;
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
struct tu_perf_query_raw_data *data = &perf_query->data[i];
@ -1742,7 +1755,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
result_iova = query_result_iova(pool, query, struct perfcntr_query_slot,
@ -1796,7 +1809,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
/* Wait for the profiled work to finish so that collected counter values
* are as accurate as possible.
*/
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
/* Collect the enabled perfcntrs. Emit CP_ALWAYS_COUNT collection first, if necessary. */
if (perf_query->collection->cp_always_count_enabled) {
@ -1822,7 +1835,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, end_iova);
}
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
for (uint32_t i = 0; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
uint64_t result_iova = perf_query_derived_perfcntr_iova(pool, query, result, i);
@ -1884,7 +1897,7 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_regs(cs, VPC_SO_QUERY_BASE(CHIP, .qword = end_iova));
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_CLEAN);
/* Set the count of written primitives */
@ -1936,7 +1949,7 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
CP_COND_REG_EXEC_0_BINNING);
}
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(CINVOCATIONS)) |
@ -2085,7 +2098,7 @@ tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
* there's a better solution that allows all 48 bits of precision
* because CP_EVENT_WRITE doesn't support 64-bit timestamps.
*/
tu_cs_emit_wfi(cs);
emit_counter_barrier<CHIP>(cs);
}
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);

View file

@ -1017,7 +1017,7 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
/* Disable pushing constants for this stage if none were loaded in the
* shader. If all stages don't load their declared push constants, as
* is often the case under zink, then we could additionally skip
* emitting REG_A7XX_SP_SHARED_CONSTANT_GFX entirely.
* emitting SP_SHARED_CONSTANT_GFX entirely.
*/
if (!shader_uses_push_consts(shader))
const_state->push_consts = (struct tu_push_constant_range) {};
@ -1502,6 +1502,7 @@ tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
return size;
}
template <chip CHIP>
void
tu6_emit_xs(struct tu_crb &crb,
struct tu_device *device,
@ -1541,7 +1542,7 @@ tu6_emit_xs(struct tu_crb &crb,
A6XX_SP_VS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_VS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
if (CHIP >= A7XX)
crb.add(SP_VS_VGS_CNTL(A7XX, 0));
break;
@ -1560,7 +1561,7 @@ tu6_emit_xs(struct tu_crb &crb,
A6XX_SP_HS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_HS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
if (CHIP >= A7XX)
crb.add(SP_HS_VGS_CNTL(A7XX, 0));
break;
@ -1580,7 +1581,7 @@ tu6_emit_xs(struct tu_crb &crb,
A6XX_SP_DS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_DS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
if (CHIP >= A7XX)
crb.add(SP_DS_VGS_CNTL(A7XX, 0));
break;
@ -1599,7 +1600,7 @@ tu6_emit_xs(struct tu_crb &crb,
A6XX_SP_GS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_GS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
if (CHIP >= A7XX)
crb.add(SP_GS_VGS_CNTL(A7XX, 0));
break;
@ -1615,6 +1616,12 @@ tu6_emit_xs(struct tu_crb &crb,
.inoutregoverlap = true, .pixlodenable = xs->need_pixlod,
.earlypreamble = xs->early_preamble,
.mergedregs = xs->mergedregs, ));
if (CHIP >= A8XX) {
crb.add(RB_PS_CNTL(CHIP,
.pixlodenable = xs->need_pixlod,
.lodpixmask = xs->need_full_quad,
));
}
crb.add(A6XX_SP_PS_INSTR_SIZE(xs->instrlen));
crb.add(A6XX_SP_PS_PROGRAM_COUNTER_OFFSET(0));
crb.add(A6XX_SP_PS_BASE(.qword = binary_iova));
@ -1625,7 +1632,7 @@ tu6_emit_xs(struct tu_crb &crb,
A6XX_SP_PS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_PS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
if (CHIP >= A7XX)
crb.add(SP_PS_VGS_CNTL(A7XX, 0));
break;
@ -1650,7 +1657,7 @@ tu6_emit_xs(struct tu_crb &crb,
A6XX_SP_CS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
.perwavememlayout = xs->pvtmem_per_wave));
crb.add(A6XX_SP_CS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
if (device->physical_device->info->chip >= A7XX)
if (CHIP >= A7XX)
crb.add(SP_CS_VGS_CNTL(A7XX, 0));
break;
@ -1658,6 +1665,7 @@ tu6_emit_xs(struct tu_crb &crb,
UNREACHABLE("bad shader stage");
}
}
TU_GENX(tu6_emit_xs);
void
tu6_emit_xs_constants(
@ -1782,7 +1790,7 @@ tu6_emit_cs_config(struct tu_cs *cs,
crb.add(SP_UPDATE_CNTL(CHIP, .cs_state = true, .cs_uav = true,
.cs_shared_const = shared_consts_enable));
tu6_emit_xs_config<CHIP>(crb, { .cs = v });
tu6_emit_xs(crb, cs->device, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
}
tu6_emit_xs_constants(cs, MESA_SHADER_COMPUTE, v, binary_iova);
@ -1863,6 +1871,7 @@ tu6_emit_cs_config(struct tu_cs *cs,
#define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
template <chip CHIP>
static void
tu6_emit_vfd_dest(struct tu_cs *cs,
const struct ir3_shader_variant *vs)
@ -1888,6 +1897,25 @@ tu6_emit_vfd_dest(struct tu_cs *cs,
.fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
.decode_cnt = attr_count));
if (CHIP >= A8XX) {
const uint32_t vertexid_regid =
ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
const uint32_t instanceid_regid =
ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
const uint32_t viewid_regid =
ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
unsigned sideband_count =
(vertexid_regid != INVALID_REG) +
(instanceid_regid != INVALID_REG) +
(viewid_regid != INVALID_REG);
tu_cs_emit_regs(cs, PC_VS_INPUT_CNTL(CHIP,
.instr_cnt = attr_count,
.sideband_cnt = sideband_count,
));
}
if (attr_count)
tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
@ -1990,6 +2018,11 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
SP_REG_PROG_ID_3(CHIP, .linelengthregid = 0xfc,
.foveationqualityregid = shading_rate_regid), );
if (CHIP >= A8XX) {
tu_cs_emit_regs(cs, RB_LB_PARAM_LIMIT(CHIP,
cs->device->physical_device->info->props.prim_alloc_threshold));
}
if (CHIP >= A7XX) {
uint32_t sysval_regs = 0;
for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
@ -2245,7 +2278,7 @@ tu6_emit_vs(struct tu_cs *cs,
tu_cs_emit_regs(cs, VPC_STEREO_RENDERING_VIEWMASK(CHIP, view_mask));
}
tu6_emit_vfd_dest(cs, vs);
tu6_emit_vfd_dest<CHIP>(cs, vs);
const uint32_t vertexid_regid =
ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
@ -2404,7 +2437,7 @@ tu6_emit_variant(struct tu_cs *cs,
}
with_crb(cs) {
tu6_emit_xs(crb, cs->device, stage, xs, pvtmem_config, binary_iova);
tu6_emit_xs<CHIP>(crb, cs->device, stage, xs, pvtmem_config, binary_iova);
}
switch (stage) {

View file

@ -151,6 +151,7 @@ tu_spirv_to_nir(struct tu_device *dev,
const struct tu_shader_key *key,
mesa_shader_stage stage);
template <chip CHIP>
void
tu6_emit_xs(struct tu_crb &crb,
struct tu_device *device,