mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-20 09:50:37 +01:00
tu: gen8 support
Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39167>
This commit is contained in:
parent
77e83d1449
commit
a9f05399ae
11 changed files with 404 additions and 152 deletions
|
|
@ -10,7 +10,7 @@ tu_entrypoints = custom_target(
|
|||
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'tu',
|
||||
'--include', 'adreno_common.xml.h',
|
||||
'--tmpl-prefix', 'tu', '--tmpl-param', 'chip CHIP',
|
||||
'--tmpl-variants', '<A6XX>', '<A7XX>',
|
||||
'--tmpl-variants', '<A6XX>', '<A7XX>', '<A8XX>',
|
||||
'--beta', with_vulkan_beta.to_string(),
|
||||
'--device-prefix', 'tu_rmv',
|
||||
],
|
||||
|
|
|
|||
|
|
@ -375,8 +375,12 @@ r2d_src_buffer_unaligned(struct tu_cmd_buffer *cmd,
|
|||
enum a6xx_format color_format = fmt.fmt;
|
||||
fixup_src_format(&format, dst_format, &color_format);
|
||||
|
||||
uint32_t offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
|
||||
va &= ~0x3f;
|
||||
uint32_t offset_texels = 0;
|
||||
if (CHIP < A8XX) {
|
||||
offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
|
||||
va &= ~0x3f;
|
||||
}
|
||||
|
||||
tu_cs_emit_regs(cs, TPL1_A2D_BLT_CNTL(CHIP,
|
||||
.raw_copy = false,
|
||||
.type = A6XX_TEX_IMG_BUFFER,
|
||||
|
|
@ -877,18 +881,18 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
|
|||
.cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
|
||||
.gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
|
||||
|
||||
with_crb (cs, 2 * 5 + 2 * 11) {
|
||||
with_crb (cs, 2 * 5 + 2 * 12) {
|
||||
tu6_emit_xs_config<CHIP>(crb, { .vs = vs, .fs = fs });
|
||||
struct tu_pvtmem_config pvtmem = {};
|
||||
tu6_emit_xs(crb, cs->device, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
|
||||
tu6_emit_xs(crb, cs->device, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
|
||||
tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
|
||||
tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
|
||||
}
|
||||
|
||||
tu6_emit_xs_constants(cs, MESA_SHADER_VERTEX, vs, vs_iova);
|
||||
tu6_emit_xs_constants(cs, MESA_SHADER_FRAGMENT, fs, fs_iova);
|
||||
|
||||
tu_cs_emit_regs(cs, PC_CNTL(CHIP));
|
||||
if (CHIP == A7XX) {
|
||||
if (CHIP >= A7XX) {
|
||||
tu_cs_emit_regs(cs, VPC_PC_CNTL(CHIP));
|
||||
}
|
||||
|
||||
|
|
@ -916,11 +920,16 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
|
|||
.persp_division_disable = 1,));
|
||||
tu_cs_emit_regs(cs, GRAS_SU_CNTL(CHIP)); // XXX msaa enable?
|
||||
|
||||
if (CHIP >= A8XX) {
|
||||
tu_cs_emit_regs(cs, GRAS_SU_STEREO_CNTL(CHIP));
|
||||
}
|
||||
|
||||
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL(CHIP));
|
||||
if (CHIP == A6XX) {
|
||||
tu_cs_emit_regs(cs, VPC_UNKNOWN_9107(CHIP));
|
||||
} else {
|
||||
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP));
|
||||
if (CHIP == A7XX)
|
||||
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP));
|
||||
|
||||
tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP,
|
||||
.raster_mode = TYPE_TILED,
|
||||
|
|
@ -1361,7 +1370,16 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
|
|||
tu_desc_set_tex_line_offset<CHIP>(desc, cmd->state.tiling->tile0.width * cpp);
|
||||
tu_desc_set_array_slice_offset<CHIP>(desc, 0);
|
||||
tu_desc_set_depth<CHIP>(desc, 1);
|
||||
tu_desc_set_addr<CHIP>(desc, cmd->device->physical_device->gmem_base + gmem_offset);
|
||||
|
||||
uint64_t va = gmem_offset;
|
||||
if (CHIP < A8XX) {
|
||||
/* For gen8, address is simply gmem_offset if tile_mode is gmem
|
||||
* tiling (TILE6_2)
|
||||
*/
|
||||
va += cmd->device->physical_device->gmem_base;
|
||||
}
|
||||
|
||||
tu_desc_set_addr<CHIP>(desc, va);
|
||||
|
||||
/* patch the format so that depth/stencil get the right format and swizzle */
|
||||
tu_desc_set_format<CHIP>(desc, fmt);
|
||||
|
|
@ -1642,6 +1660,10 @@ r3d_setup(struct tu_cmd_buffer *cmd,
|
|||
|
||||
tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
|
||||
.component_enable = aspect_write_mask(dst_format, aspect_mask)));
|
||||
if (CHIP >= A8XX) {
|
||||
tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, 0,
|
||||
.component_write_mask = aspect_write_mask(dst_format, aspect_mask)));
|
||||
}
|
||||
tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
|
||||
tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
|
||||
|
||||
|
|
@ -4192,11 +4214,17 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
|
|||
tu_cs_emit_regs(cs,
|
||||
A6XX_RB_PS_MRT_CNTL(.mrt = mrt_count));
|
||||
|
||||
tu_cs_emit_regs(cs, SP_BLEND_CNTL(CHIP));
|
||||
tu_cs_emit_regs(cs, SP_BLEND_CNTL(CHIP,
|
||||
.independent_blend_en = true,
|
||||
));
|
||||
tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
|
||||
for (uint32_t i = 0; i < mrt_count; i++) {
|
||||
tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
|
||||
.component_enable = COND(clear_rts & (1 << i), 0xf)));
|
||||
if (CHIP >= A8XX) {
|
||||
tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, i,
|
||||
.component_write_mask = COND(clear_rts & (1 << i), 0xf)));
|
||||
}
|
||||
}
|
||||
|
||||
tu_cs_emit_regs(cs, GRAS_LRZ_CNTL(CHIP, 0));
|
||||
|
|
@ -5247,6 +5275,14 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
|
|||
src_height += tiling->tile0.height;
|
||||
}
|
||||
|
||||
uint64_t va = gmem_offset;
|
||||
if (CHIP < A8XX) {
|
||||
/* For gen8, address is simply gmem_offset if tile_mode is gmem
|
||||
* tiling (TILE6_2)
|
||||
*/
|
||||
va += cmd->device->physical_device->gmem_base;
|
||||
}
|
||||
|
||||
tu_cs_emit_regs(cs,
|
||||
TPL1_A2D_SRC_TEXTURE_INFO(CHIP,
|
||||
.color_format = format,
|
||||
|
|
@ -5262,7 +5298,7 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
|
|||
TPL1_A2D_SRC_TEXTURE_SIZE(CHIP,
|
||||
.width = src_width,
|
||||
.height = src_height),
|
||||
TPL1_A2D_SRC_TEXTURE_BASE(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
|
||||
TPL1_A2D_SRC_TEXTURE_BASE(CHIP, .qword = va),
|
||||
TPL1_A2D_SRC_TEXTURE_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
|
||||
|
||||
/* sync GMEM writes with CACHE. */
|
||||
|
|
|
|||
|
|
@ -266,9 +266,14 @@ static void
|
|||
tu_set_render_mode(struct tu_cs *cs, tu_set_render_mode args)
|
||||
{
|
||||
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
|
||||
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(args.mode) |
|
||||
COND(args.uses_gmem, A6XX_CP_SET_MARKER_0_USES_GMEM) |
|
||||
COND(args.shader_uses_rt, A6XX_CP_SET_MARKER_0_SHADER_USES_RT));
|
||||
if (CHIP >= A8XX) {
|
||||
tu_cs_emit(cs, A8XX_CP_SET_MARKER_0_MODE(args.mode) |
|
||||
COND(args.uses_gmem, A8XX_CP_SET_MARKER_0_USES_GMEM));
|
||||
} else {
|
||||
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(args.mode) |
|
||||
COND(args.uses_gmem, A6XX_CP_SET_MARKER_0_USES_GMEM) |
|
||||
COND(args.shader_uses_rt, A6XX_CP_SET_MARKER_0_SHADER_USES_RT));
|
||||
}
|
||||
}
|
||||
|
||||
/* This workaround, copied from the blob, seems to ensure that the BVH node
|
||||
|
|
@ -347,7 +352,7 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
|
|||
/* Invalidating UCHE seems to also invalidate CCHE */
|
||||
!(flushes & TU_CMD_FLAG_CACHE_INVALIDATE))
|
||||
tu_cs_emit_pkt7(cs, CP_CCHE_INVALIDATE, 0);
|
||||
if (CHIP >= A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
|
||||
if (CHIP == A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
|
||||
cmd_buffer->device->physical_device->info->props.has_rt_workaround)
|
||||
tu_emit_rt_workaround<CHIP>(cmd_buffer, cs);
|
||||
if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
|
||||
|
|
@ -523,16 +528,24 @@ emit_vpc_attr_buf(struct tu_cs *cs, struct tu_device *dev, bool gmem)
|
|||
if (!dev->physical_device->info->props.has_gmem_vpc_attr_buf)
|
||||
return;
|
||||
|
||||
tu_crb crb(cs, 9);
|
||||
|
||||
const struct fd6_gmem_config *cfg = gmem ?
|
||||
&dev->physical_device->config_gmem :
|
||||
&dev->physical_device->config_sysmem;
|
||||
|
||||
tu_cs_emit_regs(cs,
|
||||
VPC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size),
|
||||
VPC_ATTR_BUF_GMEM_BASE(CHIP, cfg->vpc_attr_buf_offset),
|
||||
);
|
||||
crb.add(VPC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
|
||||
crb.add(VPC_ATTR_BUF_GMEM_BASE(CHIP, cfg->vpc_attr_buf_offset));
|
||||
crb.add(PC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
|
||||
|
||||
tu_cs_emit_regs(cs, PC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
|
||||
if (CHIP >= A8XX) {
|
||||
crb.add(VPC_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_pos_buf_size));
|
||||
crb.add(VPC_POS_BUF_GMEM_BASE(CHIP, cfg->vpc_pos_buf_offset));
|
||||
crb.add(PC_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_pos_buf_size));
|
||||
crb.add(VPC_BV_POS_BUF_GMEM_BASE(CHIP, cfg->vpc_bv_pos_buf_offset));
|
||||
crb.add(VPC_BV_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_bv_pos_buf_size));
|
||||
crb.add(PC_BV_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_bv_pos_buf_size));
|
||||
}
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
|
|
@ -575,7 +588,14 @@ emit_rb_ccu_cntl(struct tu_cs *cs, struct tu_device *dev, bool gmem)
|
|||
enum a6xx_ccu_cache_size color_cache_size = !gmem ? CCU_CACHE_SIZE_FULL : !gmem ? CCU_CACHE_SIZE_FULL :
|
||||
(a6xx_ccu_cache_size)(dev->physical_device->info->props.gmem_ccu_color_cache_fraction);
|
||||
|
||||
if (CHIP == A7XX) {
|
||||
if (CHIP == A8XX) {
|
||||
tu_cs_emit_regs(cs, RB_CCU_CACHE_CNTL(CHIP,
|
||||
.depth_cache_size = (enum a6xx_ccu_cache_size)cfg->depth_cache_fraction,
|
||||
.depth_offset = cfg->depth_ccu_offset,
|
||||
.color_cache_size = (enum a6xx_ccu_cache_size)cfg->color_cache_fraction,
|
||||
.color_offset = cfg->color_ccu_offset,
|
||||
));
|
||||
} else if (CHIP == A7XX) {
|
||||
tu_cs_emit_regs(cs, RB_CCU_CACHE_CNTL(CHIP,
|
||||
.depth_offset_hi = depth_offset_hi,
|
||||
.color_offset_hi = color_offset_hi,
|
||||
|
|
@ -584,7 +604,7 @@ emit_rb_ccu_cntl(struct tu_cs *cs, struct tu_device *dev, bool gmem)
|
|||
.color_cache_size = color_cache_size,
|
||||
.color_offset = color_offset
|
||||
));
|
||||
} else {
|
||||
} else if (CHIP == A6XX) {
|
||||
tu_cs_emit_regs(cs, RB_CCU_CNTL(CHIP,
|
||||
.gmem_fast_clear_disable =
|
||||
!dev->physical_device->info->props.has_gmem_fast_clear,
|
||||
|
|
@ -865,24 +885,14 @@ tu6_emit_bin_size(struct tu_cs *cs,
|
|||
uint32_t bin_h,
|
||||
struct tu_bin_size_params &&p)
|
||||
{
|
||||
if (CHIP == A6XX) {
|
||||
tu_cs_emit_regs(
|
||||
cs, GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
|
||||
.binh = bin_h,
|
||||
.render_mode = p.render_mode,
|
||||
.force_lrz_write_dis = p.force_lrz_write_dis,
|
||||
.buffers_location = p.buffers_location,
|
||||
.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, ));
|
||||
} else {
|
||||
tu_cs_emit_regs(cs,
|
||||
GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
|
||||
.binh = bin_h,
|
||||
.render_mode = p.render_mode,
|
||||
.force_lrz_write_dis = p.force_lrz_write_dis,
|
||||
.lrz_feedback_zmode_mask =
|
||||
p.lrz_feedback_zmode_mask,
|
||||
.force_lrz_dis = p.force_lrz_dis));
|
||||
}
|
||||
tu_cs_emit_regs(
|
||||
cs, GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
|
||||
.binh = bin_h,
|
||||
.render_mode = p.render_mode,
|
||||
.force_lrz_write_dis = p.force_lrz_write_dis,
|
||||
.buffers_location = p.buffers_location,
|
||||
.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
|
||||
.force_lrz_dis = p.force_lrz_dis));
|
||||
|
||||
tu_cs_emit_regs(cs, RB_CNTL(CHIP,
|
||||
.binw = bin_w,
|
||||
|
|
@ -892,6 +902,32 @@ tu6_emit_bin_size(struct tu_cs *cs,
|
|||
.buffers_location = p.buffers_location,
|
||||
.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, ));
|
||||
|
||||
if (CHIP >= A8XX) {
|
||||
tu_crb crb = cs->crb(13);
|
||||
|
||||
crb.add(TPL1_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
|
||||
crb.add(TPL1_A2D_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
|
||||
crb.add(SP_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// gen8 TODO: 0x0 if !cbuf_cpp[i]
|
||||
crb.add(RB_MRT_GMEM_DIMENSION_REG(CHIP, i,
|
||||
.width = bin_w,
|
||||
.height = bin_h,
|
||||
));
|
||||
}
|
||||
// gen8 TODO: 0x0 if !zsbuf_cpp[0]
|
||||
crb.add(RB_DEPTH_GMEM_DIMENSION(CHIP,
|
||||
.width = bin_w,
|
||||
.height = bin_h,
|
||||
));
|
||||
// gen8 TODO: 0x0 if !(zsbuf_cpp[0] || zsbuf_cpp[1])
|
||||
crb.add(RB_STENCIL_GMEM_DIMENSION(CHIP,
|
||||
.width = bin_w,
|
||||
.height = bin_h,
|
||||
));
|
||||
}
|
||||
|
||||
/* no flag for RB_RESOLVE_CNTL_3... */
|
||||
tu_cs_emit_regs(cs, RB_RESOLVE_CNTL_3(CHIP, .binw = bin_w, .binh = bin_h));
|
||||
}
|
||||
|
|
@ -1682,7 +1718,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
|||
frag_offsets[i].y = y1 - y1 / tile->frag_areas[view].height;
|
||||
}
|
||||
|
||||
with_crb (cs, 6) {
|
||||
with_crb (cs, 26) {
|
||||
crb.add(GRAS_BIN_FOVEAT(CHIP,
|
||||
.binscaleen = bin_scale_en,
|
||||
.xscale_0 = (enum a7xx_bin_scale)util_logbase2(frag_areas[0].width),
|
||||
|
|
@ -1697,24 +1733,46 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
|||
.yscale_4 = (enum a7xx_bin_scale)util_logbase2(frag_areas[4].height),
|
||||
.xscale_5 = (enum a7xx_bin_scale)util_logbase2(frag_areas[5].width),
|
||||
.yscale_5 = (enum a7xx_bin_scale)util_logbase2(frag_areas[5].height)))
|
||||
.add(GRAS_BIN_FOVEAT_OFFSET_0(CHIP,
|
||||
.xoffset_0 = frag_offsets[0].x,
|
||||
.xoffset_1 = frag_offsets[1].x,
|
||||
.xoffset_2 = frag_offsets[2].x))
|
||||
.add(GRAS_BIN_FOVEAT_OFFSET_1(CHIP,
|
||||
.xoffset_3 = frag_offsets[3].x,
|
||||
.xoffset_4 = frag_offsets[4].x,
|
||||
.xoffset_5 = frag_offsets[5].x))
|
||||
.add(GRAS_BIN_FOVEAT_OFFSET_2(CHIP,
|
||||
.yoffset_0 = frag_offsets[0].y,
|
||||
.yoffset_1 = frag_offsets[1].y,
|
||||
.yoffset_2 = frag_offsets[2].y))
|
||||
.add(GRAS_BIN_FOVEAT_OFFSET_3(CHIP,
|
||||
.yoffset_3 = frag_offsets[3].y,
|
||||
.yoffset_4 = frag_offsets[4].y,
|
||||
.yoffset_5 = frag_offsets[5].y))
|
||||
.add(RB_BIN_FOVEAT(CHIP,
|
||||
.add(RB_BIN_FOVEAT(CHIP,
|
||||
.binscaleen = bin_scale_en));
|
||||
|
||||
if (CHIP >= A8XX) {
|
||||
for (unsigned i = 0; i < MAX_HW_SCALED_VIEWS; i++) {
|
||||
crb.add(GRAS_BIN_FOVEAT_XY_OFFSET(CHIP, i,
|
||||
.xoffset = frag_offsets[i].x,
|
||||
.yoffset = frag_offsets[i].y,
|
||||
));
|
||||
crb.add(RB_BIN_FOVEAT_XY_OFFSET(CHIP, i,
|
||||
.xoffset = frag_offsets[i].x,
|
||||
.yoffset = frag_offsets[i].y,
|
||||
));
|
||||
crb.add(GRAS_BIN_FOVEAT_XY_FDM_OFFSET(CHIP, i,
|
||||
.xoffset = frag_offsets[i].x,
|
||||
.yoffset = frag_offsets[i].y,
|
||||
));
|
||||
crb.add(RB_BIN_FOVEAT_XY_FDM_OFFSET(CHIP, i,
|
||||
.xoffset = frag_offsets[i].x,
|
||||
.yoffset = frag_offsets[i].y,
|
||||
));
|
||||
}
|
||||
} else {
|
||||
crb.add(GRAS_BIN_FOVEAT_OFFSET_0(CHIP,
|
||||
.xoffset_0 = frag_offsets[0].x,
|
||||
.xoffset_1 = frag_offsets[1].x,
|
||||
.xoffset_2 = frag_offsets[2].x))
|
||||
.add(GRAS_BIN_FOVEAT_OFFSET_1(CHIP,
|
||||
.xoffset_3 = frag_offsets[3].x,
|
||||
.xoffset_4 = frag_offsets[4].x,
|
||||
.xoffset_5 = frag_offsets[5].x))
|
||||
.add(GRAS_BIN_FOVEAT_OFFSET_2(CHIP,
|
||||
.yoffset_0 = frag_offsets[0].y,
|
||||
.yoffset_1 = frag_offsets[1].y,
|
||||
.yoffset_2 = frag_offsets[2].y))
|
||||
.add(GRAS_BIN_FOVEAT_OFFSET_3(CHIP,
|
||||
.yoffset_3 = frag_offsets[3].y,
|
||||
.yoffset_4 = frag_offsets[4].y,
|
||||
.yoffset_5 = frag_offsets[5].y));
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
|
@ -2017,7 +2075,7 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
|
|||
{
|
||||
const struct tu_physical_device *phys_dev = dev->physical_device;
|
||||
|
||||
if (CHIP >= A7XX) {
|
||||
if (CHIP == A7XX) {
|
||||
/* On A7XX, RB_CCU_CNTL was broken into two registers, RB_CCU_CNTL which has
|
||||
* static properties that can be set once, this requires a WFI to take effect.
|
||||
* While the newly introduced register RB_CCU_CACHE_CNTL has properties that may
|
||||
|
|
@ -2071,13 +2129,15 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
|
|||
}
|
||||
|
||||
if (dev->physical_device->info->props.has_attachment_shading_rate) {
|
||||
tu_cs_emit_write_reg(cs, REG_A7XX_GRAS_LRZ_QUALITY_LOOKUP_TABLE(0),
|
||||
fd_gras_shading_rate_lut(0));
|
||||
tu_cs_emit_write_reg(cs, REG_A7XX_GRAS_LRZ_QUALITY_LOOKUP_TABLE(1),
|
||||
fd_gras_shading_rate_lut(1));
|
||||
tu_cs_emit_regs(cs, GRAS_LRZ_QUALITY_LOOKUP_TABLE_REG(CHIP, 0,
|
||||
fd_gras_shading_rate_lut(0)));
|
||||
tu_cs_emit_regs(cs, GRAS_LRZ_QUALITY_LOOKUP_TABLE_REG(CHIP, 1,
|
||||
fd_gras_shading_rate_lut(1)));
|
||||
}
|
||||
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_SP_NC_MODE_CNTL_2, 0);
|
||||
if (CHIP < A8XX) {
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_SP_NC_MODE_CNTL_2, 0);
|
||||
}
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_SHADER_MASK, 0x3f);
|
||||
if (CHIP == A6XX && !cs->device->physical_device->info->props.is_a702)
|
||||
tu_cs_emit_regs(cs, TPL1_UNKNOWN_B605(CHIP, .dword = 0x44));
|
||||
|
|
@ -2113,7 +2173,6 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
|
|||
tu_cs_emit_regs(cs, RB_UNKNOWN_88F0(CHIP));
|
||||
}
|
||||
|
||||
|
||||
tu_cs_emit_regs(cs, VPC_REPLACE_MODE_CNTL(CHIP, false));
|
||||
tu_cs_emit_regs(cs, VPC_ROTATION_CNTL(CHIP));
|
||||
|
||||
|
|
@ -2129,8 +2188,12 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
|
|||
tu_cs_emit_regs(cs, VPC_UNKNOWN_9210(CHIP));
|
||||
tu_cs_emit_regs(cs, VPC_UNKNOWN_9211(CHIP));
|
||||
}
|
||||
tu_cs_emit_regs(cs, VPC_LB_MODE_CNTL(CHIP));
|
||||
tu_cs_emit_regs(cs, PC_CONTEXT_SWITCH_GFX_PREEMPTION_MODE(CHIP));
|
||||
|
||||
if (CHIP < A8XX) {
|
||||
tu_cs_emit_regs(cs, VPC_LB_MODE_CNTL(CHIP));
|
||||
tu_cs_emit_regs(cs, PC_CONTEXT_SWITCH_GFX_PREEMPTION_MODE(CHIP));
|
||||
}
|
||||
|
||||
tu_cs_emit_regs(cs, TPL1_MODE_CNTL(CHIP, .isammode = ISAMMODE_GL,
|
||||
.texcoordroundmode = dev->instance->use_tex_coord_round_nearest_even_mode
|
||||
? COORD_ROUND_NEAREST_EVEN
|
||||
|
|
@ -2143,21 +2206,26 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
|
|||
tu_cs_emit_write_reg(cs, REG_A6XX_VFD_RENDER_MODE, 0x00000000);
|
||||
|
||||
tu_cs_emit_regs(cs, A6XX_RB_ALPHA_TEST_CNTL()); /* always disable alpha test */
|
||||
if (CHIP >= A8XX)
|
||||
tu_cs_emit_regs(cs, SP_ALPHA_TEST_CNTL(CHIP));
|
||||
|
||||
tu_cs_emit_regs(cs, A6XX_TPL1_GFX_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor)));
|
||||
tu_cs_emit_regs(cs, A6XX_TPL1_CS_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor)));
|
||||
|
||||
/* BR-only registers */
|
||||
if (CHIP >= A7XX)
|
||||
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
|
||||
CP_COND_REG_EXEC_0_BR);
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL,
|
||||
phys_dev->info->magic.RB_DBG_ECO_CNTL);
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_RB_RBP_CNTL,
|
||||
phys_dev->info->magic.RB_RBP_CNTL);
|
||||
if (CHIP >= A7XX) {
|
||||
tu_cs_emit_regs(cs, RB_UNKNOWN_8E09(CHIP, 0x7));
|
||||
tu_cond_exec_end(cs);
|
||||
/* non-ctx regs programmed by KMD (and blocked from UMD) on gen8+ */
|
||||
if (CHIP < A8XX) {
|
||||
if (CHIP == A7XX)
|
||||
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
|
||||
CP_COND_REG_EXEC_0_BR);
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL,
|
||||
phys_dev->info->magic.RB_DBG_ECO_CNTL);
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_RB_RBP_CNTL,
|
||||
phys_dev->info->magic.RB_RBP_CNTL);
|
||||
if (CHIP == A7XX) {
|
||||
tu_cs_emit_regs(cs, RB_UNKNOWN_8E09(CHIP, 0x7));
|
||||
tu_cond_exec_end(cs);
|
||||
}
|
||||
}
|
||||
|
||||
if (CHIP == A7XX) {
|
||||
|
|
@ -2170,12 +2238,12 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
|
|||
|
||||
if (CHIP >= A7XX) {
|
||||
/* Blob sets these two per draw. */
|
||||
tu_cs_emit_regs(cs, PC_HS_BUFFER_SIZE(CHIP, TU_TESS_PARAM_SIZE));
|
||||
tu_cs_emit_regs(cs, PC_HS_BUFFER_SIZE(CHIP, TU_TESS<CHIP>::PARAM_SIZE));
|
||||
/* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes)
|
||||
* but the meaning of this additional space is not known,
|
||||
* so we play safe and don't add it.
|
||||
*/
|
||||
tu_cs_emit_regs(cs, PC_TF_BUFFER_SIZE(CHIP, TU_TESS_FACTOR_SIZE));
|
||||
tu_cs_emit_regs(cs, PC_TF_BUFFER_SIZE(CHIP, TU_TESS<CHIP>::FACTOR_SIZE));
|
||||
}
|
||||
|
||||
/* There is an optimization to skip executing draw states for draws with no
|
||||
|
|
@ -2224,7 +2292,7 @@ tu_emit_bin_preamble(struct tu_device *dev, struct tu_cs *cs, bool bv)
|
|||
emit_rb_ccu_cntl<CHIP>(cs, dev, true);
|
||||
emit_vpc_attr_buf<CHIP>(cs, dev, true);
|
||||
|
||||
if (CHIP == A7XX && !bv) {
|
||||
if (CHIP >= A7XX && !bv) {
|
||||
tu7_emit_tile_render_begin_regs<CHIP>(cs);
|
||||
}
|
||||
|
||||
|
|
@ -2742,7 +2810,16 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
|
|||
} else {
|
||||
tu_desc_set_array_slice_offset<CHIP>(dst, 0);
|
||||
}
|
||||
tu_desc_set_addr<CHIP>(dst, cmd->device->physical_device->gmem_base + gmem_offset);
|
||||
|
||||
uint64_t va = gmem_offset;
|
||||
if (CHIP < A8XX) {
|
||||
/* For gen8, address is simply gmem_offset if tile_mode is gmem
|
||||
* tiling (TILE6_2)
|
||||
*/
|
||||
va += cmd->device->physical_device->gmem_base;
|
||||
}
|
||||
|
||||
tu_desc_set_addr<CHIP>(dst, va);
|
||||
|
||||
memcpy(&texture.map[i * FDL6_TEX_CONST_DWORDS], dst, sizeof(dst));
|
||||
}
|
||||
|
|
@ -3070,7 +3147,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
|||
: LRZ_FEEDBACK_NONE,
|
||||
});
|
||||
|
||||
if (CHIP == A7XX) {
|
||||
if (CHIP >= A7XX) {
|
||||
tu7_emit_sysmem_render_begin_regs<CHIP>(cmd, cs);
|
||||
}
|
||||
|
||||
|
|
@ -5233,7 +5310,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
|
|||
|
||||
/* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */
|
||||
tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
|
||||
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH(0)) |
|
||||
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(tu_scratch_reg<CHIP>(0).reg) |
|
||||
COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) |
|
||||
0x40000 | /* ??? */
|
||||
CP_MEM_TO_REG_0_UNK31 |
|
||||
|
|
@ -5242,14 +5319,14 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
|
|||
|
||||
if (offset) {
|
||||
tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
|
||||
tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH(0)) |
|
||||
tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(tu_scratch_reg<CHIP>(0).reg) |
|
||||
CP_REG_RMW_0_SRC1_ADD);
|
||||
tu_cs_emit(cs, 0xffffffff);
|
||||
tu_cs_emit(cs, -offset);
|
||||
}
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
|
||||
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH(0)) |
|
||||
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(tu_scratch_reg<CHIP>(0).reg) |
|
||||
CP_REG_TO_MEM_0_CNT(1));
|
||||
tu_cs_emit_qw(cs, vk_buffer_address(&buf->vk, counter_buffer_offset));
|
||||
}
|
||||
|
|
@ -7471,13 +7548,14 @@ tu6_emit_shared_consts(struct tu_cs *cs,
|
|||
}
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu7_emit_shared_preamble_consts(
|
||||
struct tu_cs *cs,
|
||||
const struct tu_push_constant_range *shared_consts,
|
||||
uint32_t *push_constants)
|
||||
{
|
||||
tu_cs_emit_pkt4(cs, REG_A7XX_SP_SHARED_CONSTANT_GFX(shared_consts->lo_dwords),
|
||||
tu_cs_emit_pkt4(cs, SP_SHARED_CONSTANT_GFX_REG(CHIP, shared_consts->lo_dwords).reg,
|
||||
shared_consts->dwords);
|
||||
tu_cs_emit_array(cs, push_constants + shared_consts->lo_dwords,
|
||||
shared_consts->dwords);
|
||||
|
|
@ -7508,6 +7586,7 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
|
|||
return dwords;
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static struct tu_draw_state
|
||||
tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
|
||||
{
|
||||
|
|
@ -7527,7 +7606,7 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
|
|||
if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
|
||||
tu6_emit_shared_consts(&cs, shared_consts, cmd->push_constants, compute);
|
||||
} else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
|
||||
tu7_emit_shared_preamble_consts(&cs, shared_consts, cmd->push_constants);
|
||||
tu7_emit_shared_preamble_consts<CHIP>(&cs, shared_consts, cmd->push_constants);
|
||||
}
|
||||
|
||||
if (compute) {
|
||||
|
|
@ -8118,7 +8197,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
|
|||
.provoking_vtx_last = provoking_vtx_last)
|
||||
.value;
|
||||
tu_cs_emit_regs(cs, PC_CNTL(CHIP, .dword = primitive_cntl_0));
|
||||
if (CHIP == A7XX) {
|
||||
if (CHIP >= A7XX) {
|
||||
tu_cs_emit_regs(cs, VPC_PC_CNTL(CHIP, .dword = primitive_cntl_0));
|
||||
}
|
||||
}
|
||||
|
|
@ -8156,11 +8235,15 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
|
|||
const struct tu_shader *tcs = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
|
||||
|
||||
/* maximum number of patches that can fit in tess factor/param buffers */
|
||||
uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
|
||||
TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
|
||||
uint32_t subdraw_size = MIN2(TU_TESS<CHIP>::FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
|
||||
TU_TESS<CHIP>::PARAM_SIZE / (tcs->variant->output_size * 4));
|
||||
/* convert from # of patches to draw count */
|
||||
subdraw_size *= cmd->vk.dynamic_graphics_state.ts.patch_control_points;
|
||||
|
||||
/* For gen8 tess_bo is sized for two draws, adjust subdraw size accordingly: */
|
||||
if (CHIP >= A8XX)
|
||||
subdraw_size /= 2;
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
|
||||
tu_cs_emit(cs, subdraw_size);
|
||||
}
|
||||
|
|
@ -8213,7 +8296,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
|
|||
}
|
||||
|
||||
if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
|
||||
cmd->state.shader_const = tu_emit_consts(cmd, false);
|
||||
cmd->state.shader_const = tu_emit_consts<CHIP>(cmd, false);
|
||||
|
||||
if (dirty & TU_CMD_DIRTY_DESC_SETS)
|
||||
tu6_emit_descriptor_sets<CHIP>(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
|
||||
|
|
@ -9120,7 +9203,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
|
|||
tu_emit_cache_flush<CHIP>(cmd);
|
||||
|
||||
/* note: no reason to have this in a separate IB */
|
||||
tu_cs_emit_state_ib(cs, tu_emit_consts(cmd, true));
|
||||
tu_cs_emit_state_ib(cs, tu_emit_consts<CHIP>(cmd, true));
|
||||
|
||||
tu_emit_compute_driver_params<CHIP>(cmd, cs, info);
|
||||
|
||||
|
|
|
|||
|
|
@ -802,4 +802,15 @@ private:
|
|||
#define with_crb(...) \
|
||||
for (tu_crb crb(__VA_ARGS__); crb.first; crb.first = false)
|
||||
|
||||
/**
 * Build the register/value pair for CP scratch register number `idx`,
 * selecting the register macro by chip generation.
 *
 * For CHIP >= A8XX the pair comes from CP_SCRATCH_GLOBAL_REG(); for every
 * earlier generation it comes from CP_SCRATCH_REG().
 *
 * @param idx  index of the scratch register
 * @param val  value to place in the pair; defaults to 0 for callers that
 *             only need the register offset (read via the `.reg` member)
 * @return the fd_reg_pair produced by the selected register macro
 *
 * Callers either pass the result to tu_cs_emit_regs() or read `.reg` to get
 * an offset for packets such as CP_MEM_TO_REG / CP_REG_RMW / CP_REG_TO_MEM.
 *
 * NOTE(review): presumably gen8 moved the scratch registers to a different
 * offset, hence the separate macro; confirm against the register XML.
 */
template <chip CHIP>
|
||||
static inline fd_reg_pair
|
||||
tu_scratch_reg(int idx, uint32_t val = 0)
|
||||
{
|
||||
if (CHIP >= A8XX) {
|
||||
return CP_SCRATCH_GLOBAL_REG(CHIP, idx, val);
|
||||
} else {
|
||||
return CP_SCRATCH_REG(CHIP, idx, val);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* TU_CS_H */
|
||||
|
|
|
|||
|
|
@ -911,12 +911,17 @@ tu_get_physical_device_properties_1_2(struct tu_physical_device *pdevice,
|
|||
p->roundingModeIndependence =
|
||||
VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
|
||||
|
||||
p->shaderDenormFlushToZeroFloat16 = true;
|
||||
p->shaderDenormPreserveFloat16 = false;
|
||||
if (pdevice->info->chip >= A8XX) {
|
||||
p->shaderDenormFlushToZeroFloat16 = false;
|
||||
p->shaderDenormPreserveFloat16 = true;
|
||||
} else {
|
||||
p->shaderDenormFlushToZeroFloat16 = true;
|
||||
p->shaderDenormPreserveFloat16 = false;
|
||||
}
|
||||
|
||||
p->shaderRoundingModeRTEFloat16 = true;
|
||||
p->shaderRoundingModeRTZFloat16 = false;
|
||||
p->shaderSignedZeroInfNanPreserveFloat16 = true;
|
||||
|
||||
p->shaderDenormFlushToZeroFloat32 = true;
|
||||
|
||||
/* FP32 denorm preserve has to be emulated via soft-float. Normal
|
||||
|
|
@ -1579,7 +1584,8 @@ tu_physical_device_init(struct tu_physical_device *device,
|
|||
|
||||
switch (fd_dev_gen(&device->dev_id)) {
|
||||
case 6:
|
||||
case 7: {
|
||||
case 7:
|
||||
case 8: {
|
||||
device->dev_info = info;
|
||||
device->info = &device->dev_info;
|
||||
|
||||
|
|
@ -2046,6 +2052,7 @@ tu_GetPhysicalDeviceFragmentShadingRatesKHR(
|
|||
uint32_t *pFragmentShadingRateCount,
|
||||
VkPhysicalDeviceFragmentShadingRateKHR *pFragmentShadingRates)
|
||||
{
|
||||
VK_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice);
|
||||
VK_OUTARRAY_MAKE_TYPED(VkPhysicalDeviceFragmentShadingRateKHR, out,
|
||||
pFragmentShadingRates, pFragmentShadingRateCount);
|
||||
|
||||
|
|
@ -2063,6 +2070,9 @@ tu_GetPhysicalDeviceFragmentShadingRatesKHR(
|
|||
|
||||
append_rate(4, 4, VK_SAMPLE_COUNT_1_BIT);
|
||||
append_rate(4, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT);
|
||||
/* Apparently hw didn't actually have this rate in a7xx: */
|
||||
if (physical_device->info->chip >= A8XX)
|
||||
append_rate(2, 4, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT);
|
||||
append_rate(2, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
|
||||
append_rate(2, 1, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
|
||||
append_rate(1, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
|
||||
|
|
@ -2686,6 +2696,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
|||
case 7:
|
||||
vk_device_dispatch_table_from_entrypoints(
|
||||
&dispatch_table, &tu_device_entrypoints_a7xx, false);
|
||||
break;
|
||||
case 8:
|
||||
/* gen8 TODO: */
|
||||
tu_env.debug |= TU_DEBUG_NOLRZ; /* WRITE iova faults from UCHE */
|
||||
tu_env.debug |= TU_DEBUG_FLUSHALL; /* dEQP-VK.draw.\*from_compute\* */
|
||||
vk_device_dispatch_table_from_entrypoints(
|
||||
&dispatch_table, &tu_device_entrypoints_a8xx, false);
|
||||
}
|
||||
|
||||
vk_device_dispatch_table_from_entrypoints(
|
||||
|
|
@ -2954,8 +2971,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
|||
goto fail_prepare_perfcntrs_pass_cs;
|
||||
}
|
||||
|
||||
/* TODO: a8xx */
|
||||
tu_cs_emit_regs(&sub_cs, CP_SCRATCH_REG(A6XX, PERF_CNTRS_REG, 1 << i));
|
||||
tu_cs_emit_regs(&sub_cs, TU_CALLX(device, tu_scratch_reg)(PERF_CNTRS_REG, 1 << i));
|
||||
tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0);
|
||||
|
||||
device->perfcntrs_pass_cs_entries[i] =
|
||||
|
|
|
|||
|
|
@ -375,10 +375,6 @@ struct tu_device
|
|||
struct tu_suballocator vis_stream_suballocator;
|
||||
mtx_t vis_stream_suballocator_mtx;
|
||||
|
||||
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
|
||||
#define TU_TESS_FACTOR_SIZE (8 * 1024)
|
||||
#define TU_TESS_PARAM_SIZE (128 * 1024)
|
||||
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
|
||||
/* Lazily allocated, protected by the device mutex. */
|
||||
struct tu_bo *tess_bo;
|
||||
|
||||
|
|
@ -500,6 +496,25 @@ struct tu_device
|
|||
};
|
||||
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
|
||||
|
||||
/**
 * Per-generation sizes of the tessellation buffer object (tess_bo).
 *
 * The primary template is only declared; the two specialisations below
 * supply the constants for CHIP <= A7XX and CHIP >= A8XX respectively.
 * Use as TU_TESS<CHIP>::FACTOR_SIZE / ::PARAM_SIZE / ::BO_SIZE.
 *
 * Layout as used by tu_get_tess_iova(): the BO is allocated with BO_SIZE
 * bytes, the tess factor region starts at the BO's iova and the tess param
 * region starts FACTOR_SIZE bytes after it.
 */
template <chip_range_support>
|
||||
struct TU_TESS;
|
||||

|
||||
template <chip CHIP>
|
||||
struct TU_TESS<chip_range(CHIP <= A7XX)> {
|
||||
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
|
||||
static const size_t FACTOR_SIZE = 8 * 1024;
|
||||
static const size_t PARAM_SIZE = 128 * 1024;
|
||||
/* total allocation: factor region followed by param region */
|
||||
static const size_t BO_SIZE = FACTOR_SIZE + PARAM_SIZE;
|
||||
};
|
||||

|
||||
template <chip CHIP>
|
||||
struct TU_TESS<chip_range(CHIP >= A8XX)> {
|
||||
/* for gen8, buffers are sized for two draws: */
|
||||
/* 0x4040 = 16448 bytes = 2 * 8K + 64.
|
||||
 * NOTE(review): the reason for the extra 64 bytes is not visible here.
|
||||
 */
|
||||
static const size_t FACTOR_SIZE = 0x4040;
|
||||
/* 0x40000 = 256K = 2 * 128K */
|
||||
static const size_t PARAM_SIZE = 0x40000;
|
||||
/* total allocation: factor region followed by param region */
|
||||
static const size_t BO_SIZE = FACTOR_SIZE + PARAM_SIZE;
|
||||
};
|
||||
|
||||
struct tu_device_memory
|
||||
{
|
||||
struct vk_device_memory vk;
|
||||
|
|
|
|||
|
|
@ -109,7 +109,7 @@ template <chip CHIP>
|
|||
static void
|
||||
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
|
||||
{
|
||||
tu_crb crb = cs->crb(7);
|
||||
tu_crb crb = cs->crb(8);
|
||||
|
||||
if (!depth_image) {
|
||||
crb.add(GRAS_LRZ_BUFFER_BASE(CHIP, 0))
|
||||
|
|
@ -121,6 +121,10 @@ tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
|
|||
crb.add(GRAS_LRZ_CB_CNTL(CHIP));
|
||||
}
|
||||
|
||||
if (CHIP >= A8XX) {
|
||||
crb.add(GRAS_LRZ_BUFFER_SLICE_PITCH(CHIP));
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -142,6 +146,12 @@ tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
|
|||
crb.add(GRAS_LRZ_CB_CNTL(CHIP, .double_buffer_pitch =
|
||||
depth_image->lrz_layout.lrz_buffer_size));
|
||||
}
|
||||
|
||||
if (CHIP >= A8XX) {
|
||||
crb.add(GRAS_LRZ_BUFFER_SLICE_PITCH(CHIP,
|
||||
depth_image->lrz_layout.lrz_slice_pitch
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
|||
|
|
@ -1167,6 +1167,7 @@ tu6_emit_vs_params(struct tu_cs *cs,
|
|||
ARRAY_SIZE(vs_params), vs_params);
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu_get_tess_iova(struct tu_device *dev,
|
||||
uint64_t *tess_factor_iova,
|
||||
|
|
@ -1176,14 +1177,14 @@ tu_get_tess_iova(struct tu_device *dev,
|
|||
if (!dev->tess_bo) {
|
||||
mtx_lock(&dev->mutex);
|
||||
if (!dev->tess_bo) {
|
||||
tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
|
||||
tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS<CHIP>::BO_SIZE,
|
||||
TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
|
||||
}
|
||||
mtx_unlock(&dev->mutex);
|
||||
}
|
||||
|
||||
*tess_factor_iova = dev->tess_bo->iova;
|
||||
*tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
|
||||
*tess_param_iova = dev->tess_bo->iova + TU_TESS<CHIP>::FACTOR_SIZE;
|
||||
}
|
||||
|
||||
static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
|
||||
|
|
@ -1235,7 +1236,7 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
|
|||
patch_control_points);
|
||||
|
||||
uint64_t tess_factor_iova, tess_param_iova;
|
||||
tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
|
||||
tu_get_tess_iova<CHIP>(dev, &tess_factor_iova, &tess_param_iova);
|
||||
|
||||
uint32_t hs_params[HS_PARAMS_SIZE] = {
|
||||
vs->variant->output_size * patch_control_points * 4, /* hs primitive stride */
|
||||
|
|
@ -1289,6 +1290,7 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
|
|||
tu_cs_emit(cs, wave_input_size);
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu6_emit_geom_tess_consts(struct tu_cs *cs,
|
||||
const struct ir3_shader_variant *vs,
|
||||
|
|
@ -1305,7 +1307,7 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
|
|||
|
||||
if (hs) {
|
||||
uint64_t tess_factor_iova, tess_param_iova;
|
||||
tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
|
||||
tu_get_tess_iova<CHIP>(dev, &tess_factor_iova, &tess_param_iova);
|
||||
|
||||
uint32_t ds_params[8] = {
|
||||
gs ? ds->output_size * gs->gs.vertices_in * 4 : 0, /* ds primitive stride */
|
||||
|
|
@ -1407,7 +1409,7 @@ tu6_emit_program_config(struct tu_cs *cs,
|
|||
}
|
||||
|
||||
if (gs || hs) {
|
||||
tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
|
||||
tu6_emit_geom_tess_consts<CHIP>(cs, vs, hs, ds, gs);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2531,7 +2533,11 @@ template <chip CHIP>
|
|||
static unsigned
|
||||
tu6_viewport_nregs(const struct vk_viewport_state *vp)
|
||||
{
|
||||
return 10 * vp->viewport_count + 3;
|
||||
if (CHIP >= A8XX) {
|
||||
return 12 * vp->viewport_count + 1;
|
||||
} else {
|
||||
return 10 * vp->viewport_count + 3;
|
||||
}
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
|
|
@ -2636,7 +2642,10 @@ tu6_emit_viewport(struct tu_cs *cs,
|
|||
crb.add(GRAS_CL_VIEWPORT_ZCLAMP_MIN(CHIP, i, zmin));
|
||||
crb.add(GRAS_CL_VIEWPORT_ZCLAMP_MAX(CHIP, i, zmax));
|
||||
|
||||
if (i == 0) {
|
||||
if (CHIP >= A8XX) {
|
||||
crb.add(RB_VIEWPORT_ZCLAMP_MIN_REG(CHIP, i, zmin));
|
||||
crb.add(RB_VIEWPORT_ZCLAMP_MAX_REG(CHIP, i, zmax));
|
||||
} else if (i == 0) {
|
||||
/* TODO: what to do about this and multi viewport ? */
|
||||
crb.add(RB_VIEWPORT_ZCLAMP_MIN(CHIP, zmin));
|
||||
crb.add(RB_VIEWPORT_ZCLAMP_MAX(CHIP, zmax));
|
||||
|
|
@ -3232,7 +3241,7 @@ tu6_blend_size(struct tu_device *dev,
|
|||
{
|
||||
unsigned num_rts = alpha_to_coverage_enable ?
|
||||
MAX2(cb->attachment_count, 1) : cb->attachment_count;
|
||||
return 8 + 3 * num_rts;
|
||||
return 8 + 5 * num_rts;
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
|
|
@ -3281,7 +3290,8 @@ tu6_emit_blend(struct tu_cs *cs,
|
|||
.dual_color_in_enable =
|
||||
dual_src_blend,
|
||||
.alpha_to_coverage =
|
||||
alpha_to_coverage_enable));
|
||||
alpha_to_coverage_enable,
|
||||
.alpha_to_one = alpha_to_one_enable));
|
||||
/* TODO: set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled?
|
||||
*
|
||||
* We could also set blend_reads_dest more conservatively, but it didn't show
|
||||
|
|
@ -3340,10 +3350,19 @@ tu6_emit_blend(struct tu_cs *cs,
|
|||
.alpha_src_factor = src_alpha_factor,
|
||||
.alpha_blend_opcode = alpha_op,
|
||||
.alpha_dest_factor = dst_alpha_factor));
|
||||
if (CHIP >= A8XX) {
|
||||
tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, remapped_idx,
|
||||
.color_blend_en = blend_enable,
|
||||
.alpha_blend_en = blend_enable,
|
||||
.component_write_mask = att->write_mask));
|
||||
}
|
||||
} else {
|
||||
tu_cs_emit_regs(cs,
|
||||
A6XX_RB_MRT_CONTROL(remapped_idx,),
|
||||
A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
|
||||
tu_cs_emit_regs(cs,
|
||||
A6XX_RB_MRT_CONTROL(remapped_idx,),
|
||||
A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
|
||||
if (CHIP >= A8XX) {
|
||||
tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, remapped_idx,));
|
||||
}
|
||||
}
|
||||
}
|
||||
tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_remapped_rts));
|
||||
|
|
@ -3400,7 +3419,7 @@ tu6_rast_size(struct tu_device *dev,
|
|||
} else if (CHIP == A6XX) {
|
||||
return 15 + (dev->physical_device->info->props.has_legacy_pipeline_shading_rate ? 8 : 0);
|
||||
} else {
|
||||
return 27;
|
||||
return 30;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3428,6 +3447,13 @@ tu6_emit_rast(struct tu_cs *cs,
|
|||
.rendertargetindexincr = multiview,
|
||||
.viewportindexincr = multiview && per_view_viewport));
|
||||
|
||||
if (CHIP >= A8XX) {
|
||||
tu_cs_emit_regs(cs, GRAS_SU_STEREO_CNTL(CHIP,
|
||||
.rendertargetindexincr = multiview,
|
||||
.viewportindexincr = multiview && per_view_viewport,
|
||||
));
|
||||
}
|
||||
|
||||
bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
|
||||
|
||||
tu_cs_emit_regs(cs,
|
||||
|
|
@ -3437,7 +3463,7 @@ tu6_emit_rast(struct tu_cs *cs,
|
|||
/* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
|
||||
.z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
|
||||
.zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
|
||||
.vp_clip_code_ignore = 1));;
|
||||
.vp_clip_code_ignore = 1));
|
||||
|
||||
enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
|
||||
|
||||
|
|
@ -3446,7 +3472,10 @@ tu6_emit_rast(struct tu_cs *cs,
|
|||
tu_cs_emit_regs(cs,
|
||||
PC_DGEN_RAST_CNTL(CHIP, polygon_mode));
|
||||
|
||||
if (CHIP == A7XX || cs->device->physical_device->info->props.is_a702) {
|
||||
if (CHIP >= A8XX)
|
||||
tu_cs_emit_regs(cs, GRAS_RAST_CNTL(CHIP, polygon_mode));
|
||||
|
||||
if (CHIP >= A7XX || cs->device->physical_device->info->props.is_a702) {
|
||||
tu_cs_emit_regs(cs, VPC_PS_RAST_CNTL(CHIP, polygon_mode));
|
||||
}
|
||||
|
||||
|
|
@ -3457,9 +3486,11 @@ tu6_emit_rast(struct tu_cs *cs,
|
|||
tu_cs_emit_regs(cs, VPC_UNKNOWN_9107(CHIP,
|
||||
.raster_discard = rs->rasterizer_discard_enable));
|
||||
} else {
|
||||
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP,
|
||||
.stream = rs->rasterization_stream,
|
||||
.discard = rs->rasterizer_discard_enable));
|
||||
if (CHIP == A7XX) {
|
||||
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP,
|
||||
.stream = rs->rasterization_stream,
|
||||
.discard = rs->rasterizer_discard_enable));
|
||||
}
|
||||
|
||||
bool conservative_ras_en =
|
||||
rs->conservative_mode ==
|
||||
|
|
@ -3641,7 +3672,9 @@ tu6_emit_rb_depth_cntl(struct tu_cs *cs,
|
|||
.z_read_enable =
|
||||
(ds->depth.test_enable && (zfunc != FUNC_NEVER && zfunc != FUNC_ALWAYS)) ||
|
||||
ds->depth.bounds_test.enable,
|
||||
.z_bounds_enable = ds->depth.bounds_test.enable));
|
||||
.z_bounds_enable = ds->depth.bounds_test.enable,
|
||||
.o_depth_01_clamp_en = CHIP >= A8XX,
|
||||
));
|
||||
tu_cs_emit_regs(cs, GRAS_SU_DEPTH_CNTL(CHIP, depth_test));
|
||||
} else {
|
||||
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
|
||||
|
|
@ -3775,6 +3808,7 @@ tu6_emit_fragment_shading_rate(struct tu_cs *cs,
|
|||
.frag_size_y = util_logbase2(frag_height),
|
||||
.combiner_op_1 = (a6xx_fsr_combiner) fsr->combiner_ops[0],
|
||||
.combiner_op_2 = (a6xx_fsr_combiner) fsr->combiner_ops[1],
|
||||
.combiner_clamp_mode = (CHIP >= A8XX) ? FSR_COMBINER_CLAMP_16_SAMP : FSR_COMBINER_CLAMP_4x4,
|
||||
.attachment_fsr_enable = enable_att_fsr,
|
||||
.primitive_fsr_enable = enable_prim_fsr));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -541,6 +541,18 @@ is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
|
|||
VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
|
||||
}
|
||||
|
||||
/* Emit the barrier sequence that query code places ahead of counter
 * accesses (for example before the CP_REG_TO_MEM reads of COUNTER_REG(...)
 * in the stat and primitive queries).
 *
 * Always emits a WFI into cs.  On A8XX and newer it additionally emits a
 * CP_BARRIER type-7 packet carrying a single payload dword of value 1.
 *
 * NOTE(review): the meaning of the payload value 1, and the reason gen8
 * needs the extra packet, are not visible here -- confirm against the
 * CP_BARRIER packet definition.
 */
template <chip CHIP>
|
||||
static inline void
|
||||
emit_counter_barrier(struct tu_cs *cs)
|
||||
{
|
||||
tu_cs_emit_wfi(cs);
|
||||
|
||||
/* CHIP is a template parameter, so this test is constant per instantiation. */
if (CHIP >= A8XX) {
|
||||
/* One-dword packet: header announcing a single payload dword, then the payload. */
tu_cs_emit_pkt7(cs, CP_BARRIER, 1);
|
||||
tu_cs_emit(cs, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Wait on the availability status of a query up until a timeout. */
|
||||
static VkResult
|
||||
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
|
||||
|
|
@ -1165,7 +1177,7 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
|
|||
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_COMPUTE_CTRS);
|
||||
}
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
|
||||
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(IAVERTICES)) |
|
||||
|
|
@ -1174,12 +1186,13 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
|
|||
tu_cs_emit_qw(cs, begin_iova);
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
emit_perfcntrs_pass_start(bool has_pred_bit, struct tu_cs *cs, uint32_t pass)
|
||||
{
|
||||
tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
|
||||
tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
|
||||
REG_A6XX_CP_SCRATCH(PERF_CNTRS_REG)) |
|
||||
tu_scratch_reg<CHIP>(PERF_CNTRS_REG).reg) |
|
||||
A6XX_CP_REG_TEST_0_BIT(pass) |
|
||||
(has_pred_bit ?
|
||||
A6XX_CP_REG_TEST_0_PRED_BIT(TU_PREDICATE_PERFCNTRS) : 0) |
|
||||
|
|
@ -1222,7 +1235,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
|
|||
* stream below CP_COND_REG_EXEC.
|
||||
*/
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
/* Keep preemption disabled for the duration of this query. This way
|
||||
* changes in perfcounter values should only apply to work done during
|
||||
|
|
@ -1242,7 +1255,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
|
|||
|
||||
if (data->pass != 0)
|
||||
tu_cond_exec_end(cs);
|
||||
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
|
||||
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
|
||||
}
|
||||
|
||||
const struct fd_perfcntr_counter *counter =
|
||||
|
|
@ -1256,7 +1269,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
|
|||
tu_cond_exec_end(cs);
|
||||
|
||||
last_pass = ~0;
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
|
||||
struct tu_perf_query_raw_data *data = &perf_query->data[i];
|
||||
|
|
@ -1266,7 +1279,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
|
|||
|
||||
if (data->pass != 0)
|
||||
tu_cond_exec_end(cs);
|
||||
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
|
||||
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
|
||||
}
|
||||
|
||||
const struct fd_perfcntr_counter *counter =
|
||||
|
|
@ -1291,7 +1304,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
|
|||
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
|
||||
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
/* Keep preemption disabled for the duration of this query. This way
|
||||
* changes in perfcounter values should only apply to work done during
|
||||
|
|
@ -1311,7 +1324,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
|
|||
tu_cs_emit(cs, countable);
|
||||
}
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
/* Collect the enabled perfcntrs. Emit CP_ALWAYS_COUNT collection last, if necessary. */
|
||||
for (uint32_t i = 0; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
|
||||
|
|
@ -1383,7 +1396,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
|
|||
|
||||
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
|
||||
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(CINVOCATIONS)) |
|
||||
|
|
@ -1539,7 +1552,7 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
|
|||
.write_accum_sample_count_diff = true).value);
|
||||
tu_cs_emit_qw(cs, begin_iova);
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
if (cmdbuf->device->physical_device->info->props.has_generic_clear) {
|
||||
/* If the next renderpass uses the same depth attachment, clears it
|
||||
|
|
@ -1651,7 +1664,7 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
|
|||
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_COMPUTE_CTRS);
|
||||
}
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
|
||||
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(IAVERTICES)) |
|
||||
|
|
@ -1705,7 +1718,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
|
|||
/* Wait for the profiled work to finish so that collected counter values
|
||||
* are as accurate as possible.
|
||||
*/
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
|
||||
struct tu_perf_query_raw_data *data = &perf_query->data[i];
|
||||
|
|
@ -1715,7 +1728,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
|
|||
|
||||
if (data->pass != 0)
|
||||
tu_cond_exec_end(cs);
|
||||
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
|
||||
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
|
||||
}
|
||||
|
||||
const struct fd_perfcntr_counter *counter =
|
||||
|
|
@ -1731,7 +1744,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
|
|||
tu_cond_exec_end(cs);
|
||||
|
||||
last_pass = ~0;
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
|
||||
struct tu_perf_query_raw_data *data = &perf_query->data[i];
|
||||
|
|
@ -1742,7 +1755,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
|
|||
|
||||
if (data->pass != 0)
|
||||
tu_cond_exec_end(cs);
|
||||
emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
|
||||
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
|
||||
}
|
||||
|
||||
result_iova = query_result_iova(pool, query, struct perfcntr_query_slot,
|
||||
|
|
@ -1796,7 +1809,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
|
|||
/* Wait for the profiled work to finish so that collected counter values
|
||||
* are as accurate as possible.
|
||||
*/
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
/* Collect the enabled perfcntrs. Emit CP_ALWAYS_COUNT collection first, if necessary. */
|
||||
if (perf_query->collection->cp_always_count_enabled) {
|
||||
|
|
@ -1822,7 +1835,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
|
|||
tu_cs_emit_qw(cs, end_iova);
|
||||
}
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
for (uint32_t i = 0; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
|
||||
uint64_t result_iova = perf_query_derived_perfcntr_iova(pool, query, result, i);
|
||||
|
|
@ -1884,7 +1897,7 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
|
|||
tu_cs_emit_regs(cs, VPC_SO_QUERY_BASE(CHIP, .qword = end_iova));
|
||||
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_CLEAN);
|
||||
|
||||
/* Set the count of written primitives */
|
||||
|
|
@ -1936,7 +1949,7 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
|
|||
CP_COND_REG_EXEC_0_BINNING);
|
||||
}
|
||||
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
|
||||
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(CINVOCATIONS)) |
|
||||
|
|
@ -2085,7 +2098,7 @@ tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
|
|||
* there's a better solution that allows all 48 bits of precision
|
||||
* because CP_EVENT_WRITE doesn't support 64-bit timestamps.
|
||||
*/
|
||||
tu_cs_emit_wfi(cs);
|
||||
emit_counter_barrier<CHIP>(cs);
|
||||
}
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
|
||||
|
|
|
|||
|
|
@ -1017,7 +1017,7 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
|
|||
/* Disable pushing constants for this stage if none were loaded in the
|
||||
* shader. If all stages don't load their declared push constants, as
|
||||
* is often the case under zink, then we could additionally skip
|
||||
* emitting REG_A7XX_SP_SHARED_CONSTANT_GFX entirely.
|
||||
* emitting SP_SHARED_CONSTANT_GFX entirely.
|
||||
*/
|
||||
if (!shader_uses_push_consts(shader))
|
||||
const_state->push_consts = (struct tu_push_constant_range) {};
|
||||
|
|
@ -1502,6 +1502,7 @@ tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
|
|||
return size;
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
void
|
||||
tu6_emit_xs(struct tu_crb &crb,
|
||||
struct tu_device *device,
|
||||
|
|
@ -1541,7 +1542,7 @@ tu6_emit_xs(struct tu_crb &crb,
|
|||
A6XX_SP_VS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
|
||||
.perwavememlayout = xs->pvtmem_per_wave));
|
||||
crb.add(A6XX_SP_VS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
|
||||
if (device->physical_device->info->chip >= A7XX)
|
||||
if (CHIP >= A7XX)
|
||||
crb.add(SP_VS_VGS_CNTL(A7XX, 0));
|
||||
break;
|
||||
|
||||
|
|
@ -1560,7 +1561,7 @@ tu6_emit_xs(struct tu_crb &crb,
|
|||
A6XX_SP_HS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
|
||||
.perwavememlayout = xs->pvtmem_per_wave));
|
||||
crb.add(A6XX_SP_HS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
|
||||
if (device->physical_device->info->chip >= A7XX)
|
||||
if (CHIP >= A7XX)
|
||||
crb.add(SP_HS_VGS_CNTL(A7XX, 0));
|
||||
|
||||
break;
|
||||
|
|
@ -1580,7 +1581,7 @@ tu6_emit_xs(struct tu_crb &crb,
|
|||
A6XX_SP_DS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
|
||||
.perwavememlayout = xs->pvtmem_per_wave));
|
||||
crb.add(A6XX_SP_DS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
|
||||
if (device->physical_device->info->chip >= A7XX)
|
||||
if (CHIP >= A7XX)
|
||||
crb.add(SP_DS_VGS_CNTL(A7XX, 0));
|
||||
break;
|
||||
|
||||
|
|
@ -1599,7 +1600,7 @@ tu6_emit_xs(struct tu_crb &crb,
|
|||
A6XX_SP_GS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
|
||||
.perwavememlayout = xs->pvtmem_per_wave));
|
||||
crb.add(A6XX_SP_GS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
|
||||
if (device->physical_device->info->chip >= A7XX)
|
||||
if (CHIP >= A7XX)
|
||||
crb.add(SP_GS_VGS_CNTL(A7XX, 0));
|
||||
break;
|
||||
|
||||
|
|
@ -1615,6 +1616,12 @@ tu6_emit_xs(struct tu_crb &crb,
|
|||
.inoutregoverlap = true, .pixlodenable = xs->need_pixlod,
|
||||
.earlypreamble = xs->early_preamble,
|
||||
.mergedregs = xs->mergedregs, ));
|
||||
if (CHIP >= A8XX) {
|
||||
crb.add(RB_PS_CNTL(CHIP,
|
||||
.pixlodenable = xs->need_pixlod,
|
||||
.lodpixmask = xs->need_full_quad,
|
||||
));
|
||||
}
|
||||
crb.add(A6XX_SP_PS_INSTR_SIZE(xs->instrlen));
|
||||
crb.add(A6XX_SP_PS_PROGRAM_COUNTER_OFFSET(0));
|
||||
crb.add(A6XX_SP_PS_BASE(.qword = binary_iova));
|
||||
|
|
@ -1625,7 +1632,7 @@ tu6_emit_xs(struct tu_crb &crb,
|
|||
A6XX_SP_PS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
|
||||
.perwavememlayout = xs->pvtmem_per_wave));
|
||||
crb.add(A6XX_SP_PS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
|
||||
if (device->physical_device->info->chip >= A7XX)
|
||||
if (CHIP >= A7XX)
|
||||
crb.add(SP_PS_VGS_CNTL(A7XX, 0));
|
||||
|
||||
break;
|
||||
|
|
@ -1650,7 +1657,7 @@ tu6_emit_xs(struct tu_crb &crb,
|
|||
A6XX_SP_CS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
|
||||
.perwavememlayout = xs->pvtmem_per_wave));
|
||||
crb.add(A6XX_SP_CS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
|
||||
if (device->physical_device->info->chip >= A7XX)
|
||||
if (CHIP >= A7XX)
|
||||
crb.add(SP_CS_VGS_CNTL(A7XX, 0));
|
||||
break;
|
||||
|
||||
|
|
@ -1658,6 +1665,7 @@ tu6_emit_xs(struct tu_crb &crb,
|
|||
UNREACHABLE("bad shader stage");
|
||||
}
|
||||
}
|
||||
TU_GENX(tu6_emit_xs);
|
||||
|
||||
void
|
||||
tu6_emit_xs_constants(
|
||||
|
|
@ -1782,7 +1790,7 @@ tu6_emit_cs_config(struct tu_cs *cs,
|
|||
crb.add(SP_UPDATE_CNTL(CHIP, .cs_state = true, .cs_uav = true,
|
||||
.cs_shared_const = shared_consts_enable));
|
||||
tu6_emit_xs_config<CHIP>(crb, { .cs = v });
|
||||
tu6_emit_xs(crb, cs->device, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
|
||||
tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
|
||||
}
|
||||
tu6_emit_xs_constants(cs, MESA_SHADER_COMPUTE, v, binary_iova);
|
||||
|
||||
|
|
@ -1863,6 +1871,7 @@ tu6_emit_cs_config(struct tu_cs *cs,
|
|||
|
||||
#define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu6_emit_vfd_dest(struct tu_cs *cs,
|
||||
const struct ir3_shader_variant *vs)
|
||||
|
|
@ -1888,6 +1897,25 @@ tu6_emit_vfd_dest(struct tu_cs *cs,
|
|||
.fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
|
||||
.decode_cnt = attr_count));
|
||||
|
||||
if (CHIP >= A8XX) {
|
||||
const uint32_t vertexid_regid =
|
||||
ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
|
||||
const uint32_t instanceid_regid =
|
||||
ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
|
||||
const uint32_t viewid_regid =
|
||||
ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
|
||||
|
||||
unsigned sideband_count =
|
||||
(vertexid_regid != INVALID_REG) +
|
||||
(instanceid_regid != INVALID_REG) +
|
||||
(viewid_regid != INVALID_REG);
|
||||
|
||||
tu_cs_emit_regs(cs, PC_VS_INPUT_CNTL(CHIP,
|
||||
.instr_cnt = attr_count,
|
||||
.sideband_cnt = sideband_count,
|
||||
));
|
||||
}
|
||||
|
||||
if (attr_count)
|
||||
tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
|
||||
|
||||
|
|
@ -1990,6 +2018,11 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
|
|||
SP_REG_PROG_ID_3(CHIP, .linelengthregid = 0xfc,
|
||||
.foveationqualityregid = shading_rate_regid), );
|
||||
|
||||
if (CHIP >= A8XX) {
|
||||
tu_cs_emit_regs(cs, RB_LB_PARAM_LIMIT(CHIP,
|
||||
cs->device->physical_device->info->props.prim_alloc_threshold));
|
||||
}
|
||||
|
||||
if (CHIP >= A7XX) {
|
||||
uint32_t sysval_regs = 0;
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
|
||||
|
|
@ -2245,7 +2278,7 @@ tu6_emit_vs(struct tu_cs *cs,
|
|||
tu_cs_emit_regs(cs, VPC_STEREO_RENDERING_VIEWMASK(CHIP, view_mask));
|
||||
}
|
||||
|
||||
tu6_emit_vfd_dest(cs, vs);
|
||||
tu6_emit_vfd_dest<CHIP>(cs, vs);
|
||||
|
||||
const uint32_t vertexid_regid =
|
||||
ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
|
||||
|
|
@ -2404,7 +2437,7 @@ tu6_emit_variant(struct tu_cs *cs,
|
|||
}
|
||||
|
||||
with_crb(cs) {
|
||||
tu6_emit_xs(crb, cs->device, stage, xs, pvtmem_config, binary_iova);
|
||||
tu6_emit_xs<CHIP>(crb, cs->device, stage, xs, pvtmem_config, binary_iova);
|
||||
}
|
||||
|
||||
switch (stage) {
|
||||
|
|
|
|||
|
|
@ -151,6 +151,7 @@ tu_spirv_to_nir(struct tu_device *dev,
|
|||
const struct tu_shader_key *key,
|
||||
mesa_shader_stage stage);
|
||||
|
||||
template <chip CHIP>
|
||||
void
|
||||
tu6_emit_xs(struct tu_crb &crb,
|
||||
struct tu_device *device,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue