diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build
index b68006ae571..b529488c09a 100644
--- a/src/freedreno/vulkan/meson.build
+++ b/src/freedreno/vulkan/meson.build
@@ -10,7 +10,7 @@ tu_entrypoints = custom_target(
     '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'tu',
     '--include', 'adreno_common.xml.h',
     '--tmpl-prefix', 'tu', '--tmpl-param', 'chip CHIP',
-    '--tmpl-variants', '<A6XX>', '<A7XX>',
+    '--tmpl-variants', '<A6XX>', '<A7XX>', '<A8XX>',
     '--beta', with_vulkan_beta.to_string(),
     '--device-prefix', 'tu_rmv',
   ],
diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc
index 656729e468d..d99948690b8 100644
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@@ -375,8 +375,12 @@ r2d_src_buffer_unaligned(struct tu_cmd_buffer *cmd,
    enum a6xx_format color_format = fmt.fmt;
    fixup_src_format(&format, dst_format, &color_format);
-   uint32_t offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
-   va &= ~0x3f;
+   uint32_t offset_texels = 0;
+   if (CHIP < A8XX) {
+      offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
+      va &= ~0x3f;
+   }
+
    tu_cs_emit_regs(cs,
                    TPL1_A2D_BLT_CNTL(CHIP,
                                      .raw_copy = false,
                                      .type = A6XX_TEX_IMG_BUFFER,
@@ -877,18 +881,18 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
                                       .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
                                       .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
-   with_crb (cs, 2 * 5 + 2 * 11) {
+   with_crb (cs, 2 * 5 + 2 * 12) {
       tu6_emit_xs_config(crb, { .vs = vs, .fs = fs });
       struct tu_pvtmem_config pvtmem = {};
-      tu6_emit_xs(crb, cs->device, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
-      tu6_emit_xs(crb, cs->device, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
+      tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
+      tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
    }
    tu6_emit_xs_constants(cs, MESA_SHADER_VERTEX, vs, vs_iova);
    tu6_emit_xs_constants(cs, MESA_SHADER_FRAGMENT, fs, fs_iova);
    tu_cs_emit_regs(cs, PC_CNTL(CHIP));
-   if (CHIP == A7XX) {
+   if (CHIP >= A7XX) {
       tu_cs_emit_regs(cs, VPC_PC_CNTL(CHIP));
    }
@@ -916,11 +920,16 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
                                        .persp_division_disable = 1,));
    tu_cs_emit_regs(cs, GRAS_SU_CNTL(CHIP)); // XXX msaa enable?
+   if (CHIP >= A8XX) {
+      tu_cs_emit_regs(cs, GRAS_SU_STEREO_CNTL(CHIP));
+   }
+
    tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL(CHIP));
    if (CHIP == A6XX) {
       tu_cs_emit_regs(cs, VPC_UNKNOWN_9107(CHIP));
    } else {
-      tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP));
+      if (CHIP == A7XX)
+         tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP));
       tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP,
                                          .raster_mode = TYPE_TILED,
@@ -1361,7 +1370,16 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
    tu_desc_set_tex_line_offset(desc, cmd->state.tiling->tile0.width * cpp);
    tu_desc_set_array_slice_offset(desc, 0);
    tu_desc_set_depth(desc, 1);
-   tu_desc_set_addr(desc, cmd->device->physical_device->gmem_base + gmem_offset);
+
+   uint64_t va = gmem_offset;
+   if (CHIP < A8XX) {
+      /* For gen8, address is simply gmem_offset if tile_mode is gmem
+       * tiling (TILE6_2)
+       */
+      va += cmd->device->physical_device->gmem_base;
+   }
+
+   tu_desc_set_addr(desc, va);
    /* patch the format so that depth/stencil get the right format and swizzle */
    tu_desc_set_format(desc, fmt);
@@ -1642,6 +1660,10 @@ r3d_setup(struct tu_cmd_buffer *cmd,
    tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
                         .component_enable = aspect_write_mask(dst_format, aspect_mask)));
+   if (CHIP >= A8XX) {
+      tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, 0,
+                           .component_write_mask = aspect_write_mask(dst_format, aspect_mask)));
+   }
    tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
    tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
@@ -4192,11 +4214,17 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
    tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = mrt_count));
-   tu_cs_emit_regs(cs, SP_BLEND_CNTL(CHIP));
+   tu_cs_emit_regs(cs, SP_BLEND_CNTL(CHIP,
+      .independent_blend_en = true,
+   ));
    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1,
                                           .sample_mask = 0xffff));
    for (uint32_t i = 0; i < mrt_count; i++) {
       tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
            .component_enable = COND(clear_rts & (1 << i), 0xf)));
+      if (CHIP >= A8XX) {
+         tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, i,
+              .component_write_mask = COND(clear_rts & (1 << i), 0xf)));
+      }
    }
    tu_cs_emit_regs(cs, GRAS_LRZ_CNTL(CHIP, 0));
@@ -5247,6 +5275,14 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
       src_height += tiling->tile0.height;
    }
+   uint64_t va = gmem_offset;
+   if (CHIP < A8XX) {
+      /* For gen8, address is simply gmem_offset if tile_mode is gmem
+       * tiling (TILE6_2)
+       */
+      va += cmd->device->physical_device->gmem_base;
+   }
+
    tu_cs_emit_regs(cs,
                    TPL1_A2D_SRC_TEXTURE_INFO(CHIP,
                       .color_format = format,
@@ -5262,7 +5298,7 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
                    TPL1_A2D_SRC_TEXTURE_SIZE(CHIP,
                       .width = src_width,
                       .height = src_height),
-                   TPL1_A2D_SRC_TEXTURE_BASE(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
+                   TPL1_A2D_SRC_TEXTURE_BASE(CHIP, .qword = va),
                    TPL1_A2D_SRC_TEXTURE_PITCH(CHIP,
                       .pitch = cmd->state.tiling->tile0.width * cpp));
    /* sync GMEM writes with CACHE.
     */
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 34733a7f903..ad065a16b73 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -266,9 +266,14 @@ static void
 tu_set_render_mode(struct tu_cs *cs, tu_set_render_mode args)
 {
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
-   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(args.mode) |
-                  COND(args.uses_gmem, A6XX_CP_SET_MARKER_0_USES_GMEM) |
-                  COND(args.shader_uses_rt, A6XX_CP_SET_MARKER_0_SHADER_USES_RT));
+   if (CHIP >= A8XX) {
+      tu_cs_emit(cs, A8XX_CP_SET_MARKER_0_MODE(args.mode) |
+                     COND(args.uses_gmem, A8XX_CP_SET_MARKER_0_USES_GMEM));
+   } else {
+      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(args.mode) |
+                     COND(args.uses_gmem, A6XX_CP_SET_MARKER_0_USES_GMEM) |
+                     COND(args.shader_uses_rt, A6XX_CP_SET_MARKER_0_SHADER_USES_RT));
+   }
 }
 /* This workaround, copied from the blob, seems to ensure that the BVH node
@@ -347,7 +352,7 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
       /* Invalidating UCHE seems to also invalidate CCHE */
       !(flushes & TU_CMD_FLAG_CACHE_INVALIDATE))
       tu_cs_emit_pkt7(cs, CP_CCHE_INVALIDATE, 0);
-   if (CHIP >= A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
+   if (CHIP == A7XX && (flushes & TU_CMD_FLAG_RTU_INVALIDATE) &&
       cmd_buffer->device->physical_device->info->props.has_rt_workaround)
       tu_emit_rt_workaround(cmd_buffer, cs);
    if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
@@ -523,16 +528,24 @@ emit_vpc_attr_buf(struct tu_cs *cs, struct tu_device *dev, bool gmem)
    if (!dev->physical_device->info->props.has_gmem_vpc_attr_buf)
       return;
+   tu_crb crb(cs, 9);
+
    const struct fd6_gmem_config *cfg = gmem ?
      &dev->physical_device->config_gmem : &dev->physical_device->config_sysmem;
-   tu_cs_emit_regs(cs,
-                   VPC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size),
-                   VPC_ATTR_BUF_GMEM_BASE(CHIP, cfg->vpc_attr_buf_offset),
-                   );
+   crb.add(VPC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
+   crb.add(VPC_ATTR_BUF_GMEM_BASE(CHIP, cfg->vpc_attr_buf_offset));
+   crb.add(PC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
-   tu_cs_emit_regs(cs, PC_ATTR_BUF_GMEM_SIZE(CHIP, cfg->vpc_attr_buf_size));
+   if (CHIP >= A8XX) {
+      crb.add(VPC_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_pos_buf_size));
+      crb.add(VPC_POS_BUF_GMEM_BASE(CHIP, cfg->vpc_pos_buf_offset));
+      crb.add(PC_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_pos_buf_size));
+      crb.add(VPC_BV_POS_BUF_GMEM_BASE(CHIP, cfg->vpc_bv_pos_buf_offset));
+      crb.add(VPC_BV_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_bv_pos_buf_size));
+      crb.add(PC_BV_POS_BUF_GMEM_SIZE(CHIP, cfg->vpc_bv_pos_buf_size));
+   }
 }
 template <chip CHIP>
@@ -575,7 +588,14 @@ emit_rb_ccu_cntl(struct tu_cs *cs, struct tu_device *dev, bool gmem)
    enum a6xx_ccu_cache_size color_cache_size = !gmem ?
       CCU_CACHE_SIZE_FULL :
       (a6xx_ccu_cache_size)(dev->physical_device->info->props.gmem_ccu_color_cache_fraction);
-   if (CHIP == A7XX) {
+   if (CHIP == A8XX) {
+      tu_cs_emit_regs(cs, RB_CCU_CACHE_CNTL(CHIP,
+         .depth_cache_size = (enum a6xx_ccu_cache_size)cfg->depth_cache_fraction,
+         .depth_offset = cfg->depth_ccu_offset,
+         .color_cache_size = (enum a6xx_ccu_cache_size)cfg->color_cache_fraction,
+         .color_offset = cfg->color_ccu_offset,
+      ));
+   } else if (CHIP == A7XX) {
       tu_cs_emit_regs(cs, RB_CCU_CACHE_CNTL(CHIP,
          .depth_offset_hi = depth_offset_hi,
          .color_offset_hi = color_offset_hi,
@@ -584,7 +604,7 @@ emit_rb_ccu_cntl(struct tu_cs *cs, struct tu_device *dev, bool gmem)
          .color_cache_size = color_cache_size,
          .color_offset = color_offset
       ));
-   } else {
+   } else if (CHIP == A6XX) {
       tu_cs_emit_regs(cs,
                       RB_CCU_CNTL(CHIP,
                                   .gmem_fast_clear_disable =
                                      !dev->physical_device->info->props.has_gmem_fast_clear,
@@ -865,24 +885,14 @@ tu6_emit_bin_size(struct tu_cs *cs,
                   uint32_t bin_h,
                   struct tu_bin_size_params &&p)
 {
-   if (CHIP == A6XX) {
-      tu_cs_emit_regs(
-         cs, GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
-                              .binh = bin_h,
-                              .render_mode = p.render_mode,
-                              .force_lrz_write_dis = p.force_lrz_write_dis,
-                              .buffers_location = p.buffers_location,
-                              .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, ));
-   } else {
-      tu_cs_emit_regs(cs,
-                      GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
-                                       .binh = bin_h,
-                                       .render_mode = p.render_mode,
-                                       .force_lrz_write_dis = p.force_lrz_write_dis,
-                                       .lrz_feedback_zmode_mask =
-                                          p.lrz_feedback_zmode_mask,
-                                       .force_lrz_dis = p.force_lrz_dis));
-   }
+   tu_cs_emit_regs(
+      cs, GRAS_SC_BIN_CNTL(CHIP, .binw = bin_w,
+                           .binh = bin_h,
+                           .render_mode = p.render_mode,
+                           .force_lrz_write_dis = p.force_lrz_write_dis,
+                           .buffers_location = p.buffers_location,
+                           .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
+                           .force_lrz_dis = p.force_lrz_dis));
   tu_cs_emit_regs(cs,
                   RB_CNTL(CHIP,
                           .binw = bin_w,
@@ -892,6 +902,32 @@ tu6_emit_bin_size(struct tu_cs *cs,
                           .buffers_location = p.buffers_location,
                           .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, ));
+   if (CHIP >= A8XX) {
+      tu_crb crb = cs->crb(13);
+
+      crb.add(TPL1_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
+      crb.add(TPL1_A2D_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
+      crb.add(SP_BIN_SIZE(CHIP, .binw = bin_w, .binh = bin_h));
+
+      for (int i = 0; i < 8; i++) {
+         // gen8 TODO: 0x0 if !cbuf_cpp[i]
+         crb.add(RB_MRT_GMEM_DIMENSION_REG(CHIP, i,
+            .width = bin_w,
+            .height = bin_h,
+         ));
+      }
+      // gen8 TODO: 0x0 if !zsbuf_cpp[0]
+      crb.add(RB_DEPTH_GMEM_DIMENSION(CHIP,
+         .width = bin_w,
+         .height = bin_h,
+      ));
+      // gen8 TODO: 0x0 if !(zsbuf_cpp[0] || zsbuf_cpp[1])
+      crb.add(RB_STENCIL_GMEM_DIMENSION(CHIP,
+         .width = bin_w,
+         .height = bin_h,
+      ));
+   }
+
   /* no flag for RB_RESOLVE_CNTL_3... */
   tu_cs_emit_regs(cs, RB_RESOLVE_CNTL_3(CHIP, .binw = bin_w, .binh = bin_h));
 }
@@ -1682,7 +1718,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
          frag_offsets[i].y = y1 - y1 / tile->frag_areas[view].height;
       }
-      with_crb (cs, 6) {
+      with_crb (cs, 26) {
         crb.add(GRAS_BIN_FOVEAT(CHIP,
               .binscaleen = bin_scale_en,
@@ -1697,24 +1733,46 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
              .yscale_4 = (enum a7xx_bin_scale)util_logbase2(frag_areas[4].height),
              .xscale_5 = (enum a7xx_bin_scale)util_logbase2(frag_areas[5].width),
              .yscale_5 = (enum a7xx_bin_scale)util_logbase2(frag_areas[5].height)))
-           .add(GRAS_BIN_FOVEAT_OFFSET_0(CHIP,
-              .xoffset_0 = frag_offsets[0].x,
-              .xoffset_1 = frag_offsets[1].x,
-              .xoffset_2 = frag_offsets[2].x))
-           .add(GRAS_BIN_FOVEAT_OFFSET_1(CHIP,
-              .xoffset_3 = frag_offsets[3].x,
-              .xoffset_4 = frag_offsets[4].x,
-              .xoffset_5 = frag_offsets[5].x))
-           .add(GRAS_BIN_FOVEAT_OFFSET_2(CHIP,
-              .yoffset_0 = frag_offsets[0].y,
-              .yoffset_1 = frag_offsets[1].y,
-              .yoffset_2 = frag_offsets[2].y))
-           .add(GRAS_BIN_FOVEAT_OFFSET_3(CHIP,
-              .yoffset_3 = frag_offsets[3].y,
-              .yoffset_4 = frag_offsets[4].y,
-              .yoffset_5 = frag_offsets[5].y))
-           .add(RB_BIN_FOVEAT(CHIP,
+           .add(RB_BIN_FOVEAT(CHIP, .binscaleen = bin_scale_en));
+
+        if (CHIP >= A8XX) {
+           for (unsigned i = 0; i < MAX_HW_SCALED_VIEWS; i++) {
+              crb.add(GRAS_BIN_FOVEAT_XY_OFFSET(CHIP, i,
+                 .xoffset = frag_offsets[i].x,
+                 .yoffset = frag_offsets[i].y,
+              ));
+              crb.add(RB_BIN_FOVEAT_XY_OFFSET(CHIP, i,
+                 .xoffset = frag_offsets[i].x,
+                 .yoffset = frag_offsets[i].y,
+              ));
+              crb.add(GRAS_BIN_FOVEAT_XY_FDM_OFFSET(CHIP, i,
+                 .xoffset = frag_offsets[i].x,
+                 .yoffset = frag_offsets[i].y,
+              ));
+              crb.add(RB_BIN_FOVEAT_XY_FDM_OFFSET(CHIP, i,
+                 .xoffset = frag_offsets[i].x,
+                 .yoffset = frag_offsets[i].y,
+              ));
+           }
+        } else {
+           crb.add(GRAS_BIN_FOVEAT_OFFSET_0(CHIP,
+                 .xoffset_0 = frag_offsets[0].x,
+                 .xoffset_1 = frag_offsets[1].x,
+                 .xoffset_2 = frag_offsets[2].x))
+              .add(GRAS_BIN_FOVEAT_OFFSET_1(CHIP,
+                 .xoffset_3 = frag_offsets[3].x,
+                 .xoffset_4 = frag_offsets[4].x,
+                 .xoffset_5 = frag_offsets[5].x))
+              .add(GRAS_BIN_FOVEAT_OFFSET_2(CHIP,
+                 .yoffset_0 = frag_offsets[0].y,
+                 .yoffset_1 = frag_offsets[1].y,
+                 .yoffset_2 = frag_offsets[2].y))
+              .add(GRAS_BIN_FOVEAT_OFFSET_3(CHIP,
+                 .yoffset_3 = frag_offsets[3].y,
+                 .yoffset_4 = frag_offsets[4].y,
+                 .yoffset_5 = frag_offsets[5].y));
+        }
      }
   } else {
@@ -2017,7 +2075,7 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
 {
    const struct tu_physical_device *phys_dev = dev->physical_device;
-   if (CHIP >= A7XX) {
+   if (CHIP == A7XX) {
      /* On A7XX, RB_CCU_CNTL was broken into two registers, RB_CCU_CNTL which has
      * static properties that can be set once, this requires a WFI to take effect.
      * While the newly introduced register RB_CCU_CACHE_CNTL has properties that may
@@ -2071,13 +2129,15 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
    }
    if (dev->physical_device->info->props.has_attachment_shading_rate) {
-      tu_cs_emit_write_reg(cs, REG_A7XX_GRAS_LRZ_QUALITY_LOOKUP_TABLE(0),
-                           fd_gras_shading_rate_lut(0));
-      tu_cs_emit_write_reg(cs, REG_A7XX_GRAS_LRZ_QUALITY_LOOKUP_TABLE(1),
-                           fd_gras_shading_rate_lut(1));
+      tu_cs_emit_regs(cs, GRAS_LRZ_QUALITY_LOOKUP_TABLE_REG(CHIP, 0,
+                              fd_gras_shading_rate_lut(0)));
+      tu_cs_emit_regs(cs, GRAS_LRZ_QUALITY_LOOKUP_TABLE_REG(CHIP, 1,
+                              fd_gras_shading_rate_lut(1)));
    }
-   tu_cs_emit_write_reg(cs, REG_A6XX_SP_NC_MODE_CNTL_2, 0);
+   if (CHIP < A8XX) {
+      tu_cs_emit_write_reg(cs, REG_A6XX_SP_NC_MODE_CNTL_2, 0);
+   }
    tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_SHADER_MASK, 0x3f);
    if (CHIP == A6XX && !cs->device->physical_device->info->props.is_a702)
       tu_cs_emit_regs(cs, TPL1_UNKNOWN_B605(CHIP, .dword = 0x44));
@@ -2113,7 +2173,6 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
       tu_cs_emit_regs(cs, RB_UNKNOWN_88F0(CHIP));
    }
-   tu_cs_emit_regs(cs, VPC_REPLACE_MODE_CNTL(CHIP, false));
    tu_cs_emit_regs(cs, VPC_ROTATION_CNTL(CHIP));
@@ -2129,8 +2188,12 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
       tu_cs_emit_regs(cs, VPC_UNKNOWN_9210(CHIP));
       tu_cs_emit_regs(cs, VPC_UNKNOWN_9211(CHIP));
    }
-   tu_cs_emit_regs(cs, VPC_LB_MODE_CNTL(CHIP));
-   tu_cs_emit_regs(cs, PC_CONTEXT_SWITCH_GFX_PREEMPTION_MODE(CHIP));
+
+   if (CHIP < A8XX) {
+      tu_cs_emit_regs(cs, VPC_LB_MODE_CNTL(CHIP));
+      tu_cs_emit_regs(cs, PC_CONTEXT_SWITCH_GFX_PREEMPTION_MODE(CHIP));
+   }
+
    tu_cs_emit_regs(cs, TPL1_MODE_CNTL(CHIP, .isammode = ISAMMODE_GL,
                                       .texcoordroundmode =
                                          dev->instance->use_tex_coord_round_nearest_even_mode
                                             ? COORD_ROUND_NEAREST_EVEN
@@ -2143,21 +2206,26 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
    tu_cs_emit_write_reg(cs, REG_A6XX_VFD_RENDER_MODE, 0x00000000);
    tu_cs_emit_regs(cs, A6XX_RB_ALPHA_TEST_CNTL()); /* always disable alpha test */
+   if (CHIP >= A8XX)
+      tu_cs_emit_regs(cs, SP_ALPHA_TEST_CNTL(CHIP));
    tu_cs_emit_regs(cs,
                    A6XX_TPL1_GFX_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor)));
    tu_cs_emit_regs(cs,
                    A6XX_TPL1_CS_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor)));
    /* BR-only registers */
-   if (CHIP >= A7XX)
-      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
-                             CP_COND_REG_EXEC_0_BR);
-   tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL,
-                        phys_dev->info->magic.RB_DBG_ECO_CNTL);
-   tu_cs_emit_write_reg(cs, REG_A6XX_RB_RBP_CNTL,
-                        phys_dev->info->magic.RB_RBP_CNTL);
-   if (CHIP >= A7XX) {
-      tu_cs_emit_regs(cs, RB_UNKNOWN_8E09(CHIP, 0x7));
-      tu_cond_exec_end(cs);
+   /* non-ctx regs programmed by KMD (and blocked from UMD) on gen8+ */
+   if (CHIP < A8XX) {
+      if (CHIP == A7XX)
+         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
+                                CP_COND_REG_EXEC_0_BR);
+      tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL,
+                           phys_dev->info->magic.RB_DBG_ECO_CNTL);
+      tu_cs_emit_write_reg(cs, REG_A6XX_RB_RBP_CNTL,
+                           phys_dev->info->magic.RB_RBP_CNTL);
+      if (CHIP == A7XX) {
+         tu_cs_emit_regs(cs, RB_UNKNOWN_8E09(CHIP, 0x7));
+         tu_cond_exec_end(cs);
+      }
    }
    if (CHIP == A7XX) {
@@ -2170,12 +2238,12 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
    if (CHIP >= A7XX) {
       /* Blob sets these two per draw.
       */
-      tu_cs_emit_regs(cs, PC_HS_BUFFER_SIZE(CHIP, TU_TESS_PARAM_SIZE));
+      tu_cs_emit_regs(cs, PC_HS_BUFFER_SIZE(CHIP, TU_TESS<CHIP>::PARAM_SIZE));
      /* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes)
       * but the meaning of this additional space is not known,
       * so we play safe and don't add it.
       */
-      tu_cs_emit_regs(cs, PC_TF_BUFFER_SIZE(CHIP, TU_TESS_FACTOR_SIZE));
+      tu_cs_emit_regs(cs, PC_TF_BUFFER_SIZE(CHIP, TU_TESS<CHIP>::FACTOR_SIZE));
    }
    /* There is an optimization to skip executing draw states for draws with no
@@ -2224,7 +2292,7 @@ tu_emit_bin_preamble(struct tu_device *dev, struct tu_cs *cs, bool bv)
    emit_rb_ccu_cntl(cs, dev, true);
    emit_vpc_attr_buf(cs, dev, true);
-   if (CHIP == A7XX && !bv) {
+   if (CHIP >= A7XX && !bv) {
       tu7_emit_tile_render_begin_regs(cs);
    }
@@ -2742,7 +2810,16 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
      } else {
         tu_desc_set_array_slice_offset(dst, 0);
      }
-      tu_desc_set_addr(dst, cmd->device->physical_device->gmem_base + gmem_offset);
+
+      uint64_t va = gmem_offset;
+      if (CHIP < A8XX) {
+         /* For gen8, address is simply gmem_offset if tile_mode is gmem
+          * tiling (TILE6_2)
+          */
+         va += cmd->device->physical_device->gmem_base;
+      }
+
+      tu_desc_set_addr(dst, va);
      memcpy(&texture.map[i * FDL6_TEX_CONST_DWORDS], dst, sizeof(dst));
   }
@@ -3070,7 +3147,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                         : LRZ_FEEDBACK_NONE,
   });
-   if (CHIP == A7XX) {
+   if (CHIP >= A7XX) {
      tu7_emit_sysmem_render_begin_regs(cmd, cs);
   }
@@ -5233,7 +5310,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
      /* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */
      tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
-      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH(0)) |
+      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(tu_scratch_reg<CHIP>(0).reg) |
                     COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) |
                     0x40000 | /* ??? */
                     CP_MEM_TO_REG_0_UNK31 |
@@ -5242,14 +5319,14 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
      if (offset) {
         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
-         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH(0)) |
+         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(tu_scratch_reg<CHIP>(0).reg) |
                        CP_REG_RMW_0_SRC1_ADD);
         tu_cs_emit(cs, 0xffffffff);
         tu_cs_emit(cs, -offset);
      }
      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
-      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH(0)) |
+      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(tu_scratch_reg<CHIP>(0).reg) |
                     CP_REG_TO_MEM_0_CNT(1));
      tu_cs_emit_qw(cs, vk_buffer_address(&buf->vk, counter_buffer_offset));
   }
@@ -7471,13 +7548,14 @@ tu6_emit_shared_consts(struct tu_cs *cs,
    }
 }
+template <chip CHIP>
 static void
 tu7_emit_shared_preamble_consts(
    struct tu_cs *cs,
    const struct tu_push_constant_range *shared_consts,
    uint32_t *push_constants)
 {
-   tu_cs_emit_pkt4(cs, REG_A7XX_SP_SHARED_CONSTANT_GFX(shared_consts->lo_dwords),
+   tu_cs_emit_pkt4(cs, SP_SHARED_CONSTANT_GFX_REG(CHIP, shared_consts->lo_dwords).reg,
                    shared_consts->dwords);
    tu_cs_emit_array(cs, push_constants + shared_consts->lo_dwords,
                     shared_consts->dwords);
@@ -7508,6 +7586,7 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
    return dwords;
 }
+template <chip CHIP>
 static struct tu_draw_state
 tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
 {
@@ -7527,7 +7606,7 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
      if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
         tu6_emit_shared_consts(&cs, shared_consts, cmd->push_constants, compute);
      } else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
-        tu7_emit_shared_preamble_consts(&cs, shared_consts, cmd->push_constants);
+        tu7_emit_shared_preamble_consts<CHIP>(&cs, shared_consts, cmd->push_constants);
      }
      if (compute) {
@@ -8118,7 +8197,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
            .provoking_vtx_last = provoking_vtx_last)
            .value;
      tu_cs_emit_regs(cs, PC_CNTL(CHIP, .dword = primitive_cntl_0));
-      if (CHIP == A7XX) {
+      if (CHIP >= A7XX) {
        tu_cs_emit_regs(cs, VPC_PC_CNTL(CHIP, .dword = primitive_cntl_0));
      }
   }
@@ -8156,11 +8235,15 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
      const struct tu_shader *tcs = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
      /* maximum number of patches that can fit in tess factor/param buffers */
-      uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
-                                   TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
+      uint32_t subdraw_size = MIN2(TU_TESS<CHIP>::FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
+                                   TU_TESS<CHIP>::PARAM_SIZE / (tcs->variant->output_size * 4));
      /* convert from # of patches to draw count */
      subdraw_size *= cmd->vk.dynamic_graphics_state.ts.patch_control_points;
+      /* For gen8 tess_bo is sized for two draws, adjust subdraw size accordingly: */
+      if (CHIP >= A8XX)
+         subdraw_size /= 2;
+
      tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
      tu_cs_emit(cs, subdraw_size);
   }
@@ -8213,7 +8296,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
   }
   if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
-      cmd->state.shader_const = tu_emit_consts(cmd, false);
+      cmd->state.shader_const = tu_emit_consts<CHIP>(cmd, false);
   if (dirty & TU_CMD_DIRTY_DESC_SETS)
      tu6_emit_descriptor_sets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
@@ -9120,7 +9203,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
   tu_emit_cache_flush(cmd);
   /* note: no reason to have this in a separate IB */
-   tu_cs_emit_state_ib(cs, tu_emit_consts(cmd, true));
+   tu_cs_emit_state_ib(cs, tu_emit_consts<CHIP>(cmd, true));
   tu_emit_compute_driver_params(cmd, cs, info);
diff --git a/src/freedreno/vulkan/tu_cs.h b/src/freedreno/vulkan/tu_cs.h
index e574271508e..9ad80cc4b73 100644
--- a/src/freedreno/vulkan/tu_cs.h
+++ b/src/freedreno/vulkan/tu_cs.h
@@ -802,4 +802,15 @@ private:
 #define with_crb(...) \
    for (tu_crb crb(__VA_ARGS__); crb.first; crb.first = false)
+template <chip CHIP>
+static inline fd_reg_pair
+tu_scratch_reg(int idx, uint32_t val = 0)
+{
+   if (CHIP >= A8XX) {
+      return CP_SCRATCH_GLOBAL_REG(CHIP, idx, val);
+   } else {
+      return CP_SCRATCH_REG(CHIP, idx, val);
+   }
+}
+
 #endif /* TU_CS_H */
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index bb398213ba8..a57dfe5b6ce 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -911,12 +911,17 @@ tu_get_physical_device_properties_1_2(struct tu_physical_device *pdevice,
    p->roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
-   p->shaderDenormFlushToZeroFloat16 = true;
-   p->shaderDenormPreserveFloat16 = false;
+   if (pdevice->info->chip >= A8XX) {
+      p->shaderDenormFlushToZeroFloat16 = false;
+      p->shaderDenormPreserveFloat16 = true;
+   } else {
+      p->shaderDenormFlushToZeroFloat16 = true;
+      p->shaderDenormPreserveFloat16 = false;
+   }
+
    p->shaderRoundingModeRTEFloat16 = true;
    p->shaderRoundingModeRTZFloat16 = false;
    p->shaderSignedZeroInfNanPreserveFloat16 = true;
-   p->shaderDenormFlushToZeroFloat32 = true;
    /* FP32 denorm preserve has to be emulated via soft-float. Normal
@@ -1579,7 +1584,8 @@ tu_physical_device_init(struct tu_physical_device *device,
    switch (fd_dev_gen(&device->dev_id)) {
    case 6:
-   case 7: {
+   case 7:
+   case 8: {
      device->dev_info = info;
      device->info = &device->dev_info;
@@ -2046,6 +2052,7 @@ tu_GetPhysicalDeviceFragmentShadingRatesKHR(
    uint32_t *pFragmentShadingRateCount,
    VkPhysicalDeviceFragmentShadingRateKHR *pFragmentShadingRates)
 {
+   VK_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice);
    VK_OUTARRAY_MAKE_TYPED(VkPhysicalDeviceFragmentShadingRateKHR, out,
                           pFragmentShadingRates, pFragmentShadingRateCount);
@@ -2063,6 +2070,9 @@ tu_GetPhysicalDeviceFragmentShadingRatesKHR(
    append_rate(4, 4, VK_SAMPLE_COUNT_1_BIT);
    append_rate(4, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT);
+   /* Apparently hw didn't actually have this rate in a7xx: */
+   if (physical_device->info->chip >= A8XX)
+      append_rate(2, 4, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT);
    append_rate(2, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
    append_rate(2, 1, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
    append_rate(1, 2, VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT);
@@ -2686,6 +2696,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    case 7:
       vk_device_dispatch_table_from_entrypoints(
          &dispatch_table, &tu_device_entrypoints_a7xx, false);
+      break;
+   case 8:
+      /* gen8 TODO: */
+      tu_env.debug |= TU_DEBUG_NOLRZ;    /* WRITE iova faults from UCHE */
+      tu_env.debug |= TU_DEBUG_FLUSHALL; /* dEQP-VK.draw.\*from_compute\* */
+      vk_device_dispatch_table_from_entrypoints(
+         &dispatch_table, &tu_device_entrypoints_a8xx, false);
    }
    vk_device_dispatch_table_from_entrypoints(
@@ -2954,8 +2971,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
         goto fail_prepare_perfcntrs_pass_cs;
      }
-      /* TODO: a8xx */
-      tu_cs_emit_regs(&sub_cs, CP_SCRATCH_REG(A6XX, PERF_CNTRS_REG, 1 << i));
+      tu_cs_emit_regs(&sub_cs, TU_CALLX(device, tu_scratch_reg)(PERF_CNTRS_REG, 1 << i));
      tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0);
      device->perfcntrs_pass_cs_entries[i] =
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index f0f5dc65bf7..66d28742f2a 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -375,10 +375,6 @@ struct tu_device
    struct tu_suballocator vis_stream_suballocator;
    mtx_t vis_stream_suballocator_mtx;
-   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
-#define TU_TESS_FACTOR_SIZE (8 * 1024)
-#define TU_TESS_PARAM_SIZE (128 * 1024)
-#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
    /* Lazily allocated, protected by the device mutex. */
    struct tu_bo *tess_bo;
@@ -500,6 +496,25 @@ struct tu_device
 };
 VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
+template <chip CHIP, typename Enable = void>
+struct TU_TESS;
+
+template <chip CHIP>
+struct TU_TESS<CHIP, std::enable_if_t<(CHIP < A8XX)>> {
+   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
+   static const size_t FACTOR_SIZE = 8 * 1024;
+   static const size_t PARAM_SIZE = 128 * 1024;
+   static const size_t BO_SIZE = FACTOR_SIZE + PARAM_SIZE;
+};
+
+template <chip CHIP>
+struct TU_TESS<CHIP, std::enable_if_t<(CHIP >= A8XX)>> {
+   /* for gen8, buffers are sized for two draws: */
+   static const size_t FACTOR_SIZE = 0x4040;
+   static const size_t PARAM_SIZE = 0x40000;
+   static const size_t BO_SIZE = FACTOR_SIZE + PARAM_SIZE;
+};
+
 struct tu_device_memory
 {
    struct vk_device_memory vk;
diff --git a/src/freedreno/vulkan/tu_lrz.cc b/src/freedreno/vulkan/tu_lrz.cc
index 852554c488f..bc55cdeedb6 100644
--- a/src/freedreno/vulkan/tu_lrz.cc
+++ b/src/freedreno/vulkan/tu_lrz.cc
@@ -109,7 +109,7 @@ template <chip CHIP>
 static void
 tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
 {
-   tu_crb crb = cs->crb(7);
+   tu_crb crb = cs->crb(8);
    if (!depth_image) {
       crb.add(GRAS_LRZ_BUFFER_BASE(CHIP, 0))
@@ -121,6 +121,10 @@ tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
         crb.add(GRAS_LRZ_CB_CNTL(CHIP));
      }
+      if (CHIP >= A8XX) {
+         crb.add(GRAS_LRZ_BUFFER_SLICE_PITCH(CHIP));
+      }
+
      return;
   }
@@ -142,6 +146,12 @@ tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
      crb.add(GRAS_LRZ_CB_CNTL(CHIP,
         .double_buffer_pitch = depth_image->lrz_layout.lrz_buffer_size));
   }
+
+   if (CHIP >= A8XX) {
+      crb.add(GRAS_LRZ_BUFFER_SLICE_PITCH(CHIP,
+         depth_image->lrz_layout.lrz_slice_pitch
+      ));
+   }
 }
 static void
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index 32f5d597b39..8cd2e5ad58c 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -1167,6 +1167,7 @@ tu6_emit_vs_params(struct tu_cs *cs,
                      ARRAY_SIZE(vs_params), vs_params);
 }
+template <chip CHIP>
 static void
 tu_get_tess_iova(struct tu_device *dev,
                  uint64_t *tess_factor_iova,
@@ -1176,14 +1177,14 @@ tu_get_tess_iova(struct tu_device *dev,
    if (!dev->tess_bo) {
       mtx_lock(&dev->mutex);
      if (!dev->tess_bo) {
-         tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
+         tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS<CHIP>::BO_SIZE,
                        TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
      }
      mtx_unlock(&dev->mutex);
   }
   *tess_factor_iova = dev->tess_bo->iova;
-   *tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
+   *tess_param_iova = dev->tess_bo->iova + TU_TESS<CHIP>::FACTOR_SIZE;
 }
 static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
@@ -1235,7 +1236,7 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
                       patch_control_points);
    uint64_t tess_factor_iova, tess_param_iova;
-   tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
+   tu_get_tess_iova<CHIP>(dev, &tess_factor_iova, &tess_param_iova);
    uint32_t hs_params[HS_PARAMS_SIZE] = {
       vs->variant->output_size * patch_control_points * 4, /* hs primitive stride */
@@ -1289,6 +1290,7 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
    tu_cs_emit(cs, wave_input_size);
 }
+template <chip CHIP>
 static void
 tu6_emit_geom_tess_consts(struct tu_cs *cs,
                           const struct ir3_shader_variant *vs,
@@ -1305,7 +1307,7 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
    if (hs) {
       uint64_t tess_factor_iova, tess_param_iova;
-      tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
+      tu_get_tess_iova<CHIP>(dev, &tess_factor_iova, &tess_param_iova);
      uint32_t ds_params[8] = {
         gs ? ds->output_size * gs->gs.vertices_in * 4 : 0, /* ds primitive stride */
@@ -1407,7 +1409,7 @@ tu6_emit_program_config(struct tu_cs *cs,
    }
    if (gs || hs) {
-      tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
+      tu6_emit_geom_tess_consts<CHIP>(cs, vs, hs, ds, gs);
    }
 }
@@ -2531,7 +2533,11 @@ template <chip CHIP>
 static unsigned
 tu6_viewport_nregs(const struct vk_viewport_state *vp)
 {
-   return 10 * vp->viewport_count + 3;
+   if (CHIP >= A8XX) {
+      return 12 * vp->viewport_count + 1;
+   } else {
+      return 10 * vp->viewport_count + 3;
+   }
 }
 template <chip CHIP>
@@ -2636,7 +2642,10 @@ tu6_emit_viewport(struct tu_cs *cs,
      crb.add(GRAS_CL_VIEWPORT_ZCLAMP_MIN(CHIP, i, zmin));
      crb.add(GRAS_CL_VIEWPORT_ZCLAMP_MAX(CHIP, i, zmax));
-      if (i == 0) {
+      if (CHIP >= A8XX) {
+         crb.add(RB_VIEWPORT_ZCLAMP_MIN_REG(CHIP, i, zmin));
+         crb.add(RB_VIEWPORT_ZCLAMP_MAX_REG(CHIP, i, zmax));
+      } else if (i == 0) {
         /* TODO: what to do about this and multi viewport ? */
         crb.add(RB_VIEWPORT_ZCLAMP_MIN(CHIP, zmin));
         crb.add(RB_VIEWPORT_ZCLAMP_MAX(CHIP, zmax));
@@ -3232,7 +3241,7 @@ tu6_blend_size(struct tu_device *dev,
 {
    unsigned num_rts = alpha_to_coverage_enable ?
       MAX2(cb->attachment_count, 1) : cb->attachment_count;
-   return 8 + 3 * num_rts;
+   return 8 + 5 * num_rts;
 }
 template <chip CHIP>
@@ -3281,7 +3290,8 @@ tu6_emit_blend(struct tu_cs *cs,
                                           .dual_color_in_enable = dual_src_blend,
                                           .alpha_to_coverage =
-                                             alpha_to_coverage_enable));
+                                             alpha_to_coverage_enable,
+                                          .alpha_to_one = alpha_to_one_enable));
    /* TODO: set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled?
     *
     * We could also set blend_reads_dest more conservatively, but it didn't show
@@ -3340,10 +3350,19 @@ tu6_emit_blend(struct tu_cs *cs,
                 .alpha_src_factor = src_alpha_factor,
                 .alpha_blend_opcode = alpha_op,
                 .alpha_dest_factor = dst_alpha_factor));
+         if (CHIP >= A8XX) {
+            tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, remapped_idx,
+                 .color_blend_en = blend_enable,
+                 .alpha_blend_en = blend_enable,
+                 .component_write_mask = att->write_mask));
+         }
      } else {
-            tu_cs_emit_regs(cs,
-                            A6XX_RB_MRT_CONTROL(remapped_idx,),
-                            A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
+         tu_cs_emit_regs(cs,
+                         A6XX_RB_MRT_CONTROL(remapped_idx,),
+                         A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
+         if (CHIP >= A8XX) {
+            tu_cs_emit_regs(cs, SP_MRT_BLEND_CNTL_REG(CHIP, remapped_idx,));
+         }
      }
   }
   tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_remapped_rts));
@@ -3400,7 +3419,7 @@ tu6_rast_size(struct tu_device *dev,
   } else if (CHIP == A6XX) {
      return 15 + (dev->physical_device->info->props.has_legacy_pipeline_shading_rate ? 8 : 0);
   } else {
-      return 27;
+      return 30;
   }
 }
@@ -3428,6 +3447,13 @@ tu6_emit_rast(struct tu_cs *cs,
          .rendertargetindexincr = multiview,
          .viewportindexincr = multiview && per_view_viewport));
+   if (CHIP >= A8XX) {
+      tu_cs_emit_regs(cs, GRAS_SU_STEREO_CNTL(CHIP,
+         .rendertargetindexincr = multiview,
+         .viewportindexincr = multiview && per_view_viewport,
+      ));
+   }
+
    bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
    tu_cs_emit_regs(cs,
@@ -3437,7 +3463,7 @@ tu6_emit_rast(struct tu_cs *cs,
                    /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
                    .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
                    .zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
-                   .vp_clip_code_ignore = 1));;
+                   .vp_clip_code_ignore = 1));
    enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
@@ -3446,7 +3472,10 @@ tu6_emit_rast(struct tu_cs *cs,
    tu_cs_emit_regs(cs, PC_DGEN_RAST_CNTL(CHIP, polygon_mode));
-   if (CHIP == A7XX || cs->device->physical_device->info->props.is_a702) {
+   if (CHIP >= A8XX)
+      tu_cs_emit_regs(cs, GRAS_RAST_CNTL(CHIP, polygon_mode));
+
+   if (CHIP >= A7XX || cs->device->physical_device->info->props.is_a702) {
       tu_cs_emit_regs(cs, VPC_PS_RAST_CNTL(CHIP, polygon_mode));
    }
@@ -3457,9 +3486,11 @@ tu6_emit_rast(struct tu_cs *cs,
      tu_cs_emit_regs(cs, VPC_UNKNOWN_9107(CHIP,
           .raster_discard = rs->rasterizer_discard_enable));
   } else {
-      tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP,
-           .stream = rs->rasterization_stream,
-           .discard = rs->rasterizer_discard_enable));
+      if (CHIP == A7XX) {
+         tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP,
+              .stream = rs->rasterization_stream,
+              .discard = rs->rasterizer_discard_enable));
+      }
      bool conservative_ras_en =
         rs->conservative_mode ==
@@ -3641,7 +3672,9 @@ tu6_emit_rb_depth_cntl(struct tu_cs *cs,
           .z_read_enable = (ds->depth.test_enable && (zfunc != FUNC_NEVER && zfunc != FUNC_ALWAYS)) ||
                            ds->depth.bounds_test.enable,
-           .z_bounds_enable = ds->depth.bounds_test.enable));
+           .z_bounds_enable = ds->depth.bounds_test.enable,
+           .o_depth_01_clamp_en = CHIP >= A8XX,
+           ));
      tu_cs_emit_regs(cs, GRAS_SU_DEPTH_CNTL(CHIP, depth_test));
   } else {
      tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
@@ -3775,6 +3808,7 @@ tu6_emit_fragment_shading_rate(struct tu_cs *cs,
      .frag_size_y = util_logbase2(frag_height),
      .combiner_op_1 = (a6xx_fsr_combiner) fsr->combiner_ops[0],
      .combiner_op_2 = (a6xx_fsr_combiner) fsr->combiner_ops[1],
+      .combiner_clamp_mode = (CHIP >= A8XX) ? FSR_COMBINER_CLAMP_16_SAMP : FSR_COMBINER_CLAMP_4x4,
      .attachment_fsr_enable = enable_att_fsr,
      .primitive_fsr_enable = enable_prim_fsr));
 }
diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc
index b99fb88c135..962c7028fe5 100644
--- a/src/freedreno/vulkan/tu_query_pool.cc
+++ b/src/freedreno/vulkan/tu_query_pool.cc
@@ -541,6 +541,18 @@ is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
           VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
 }
+template <chip CHIP>
+static inline void
+emit_counter_barrier(struct tu_cs *cs)
+{
+   tu_cs_emit_wfi(cs);
+
+   if (CHIP >= A8XX) {
+      tu_cs_emit_pkt7(cs, CP_BARRIER, 1);
+      tu_cs_emit(cs, 1);
+   }
+}
+
 /* Wait on the the availability status of a query up until a timeout. */
 static VkResult
 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
@@ -1165,7 +1177,7 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
       tu_emit_event_write(cmdbuf, cs, FD_START_COMPUTE_CTRS);
    }
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(IAVERTICES)) |
@@ -1174,12 +1186,13 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
    tu_cs_emit_qw(cs, begin_iova);
 }
+template <chip CHIP>
 static void
 emit_perfcntrs_pass_start(bool has_pred_bit, struct tu_cs *cs, uint32_t pass)
 {
    tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
    tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
-                     REG_A6XX_CP_SCRATCH(PERF_CNTRS_REG)) |
+                     tu_scratch_reg<CHIP>(PERF_CNTRS_REG).reg) |
                   A6XX_CP_REG_TEST_0_BIT(pass) |
                   (has_pred_bit ?
                    A6XX_CP_REG_TEST_0_PRED_BIT(TU_PREDICATE_PERFCNTRS) : 0) |
@@ -1222,7 +1235,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
    * stream below CP_COND_REG_EXEC.
    */
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
   /* Keep preemption disabled for the duration of this query. This way
    * changes in perfcounter values should only apply to work done during
@@ -1242,7 +1255,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
        if (data->pass != 0)
           tu_cond_exec_end(cs);
-        emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
+        emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
     }
     const struct fd_perfcntr_counter *counter =
@@ -1256,7 +1269,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
     tu_cond_exec_end(cs);
   last_pass = ~0;
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
   for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
      struct tu_perf_query_raw_data *data = &perf_query->data[i];
@@ -1266,7 +1279,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
        if (data->pass != 0)
           tu_cond_exec_end(cs);
-        emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
+        emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
     }
     const struct fd_perfcntr_counter *counter =
@@ -1291,7 +1304,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
   /* Keep preemption disabled for the duration of this query. This way
    * changes in perfcounter values should only apply to work done during
@@ -1311,7 +1324,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
      tu_cs_emit(cs, countable);
   }
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
   /* Collect the enabled perfcntrs. Emit CP_ALWAYS_COUNT collection last, if necessary. */
   for (uint32_t i = 0; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
@@ -1383,7 +1396,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
    tu_emit_event_write(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(CINVOCATIONS)) |
@@ -1539,7 +1552,7 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                     .write_accum_sample_count_diff = true).value);
      tu_cs_emit_qw(cs, begin_iova);
-      tu_cs_emit_wfi(cs);
+      emit_counter_barrier<CHIP>(cs);
      if (cmdbuf->device->physical_device->info->props.has_generic_clear) {
         /* If the next renderpass uses the same depth attachment, clears it
@@ -1651,7 +1664,7 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
      tu_emit_event_write(cmdbuf, cs, FD_STOP_COMPUTE_CTRS);
   }
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(IAVERTICES)) |
@@ -1705,7 +1718,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
   /* Wait for the profiled work to finish so that collected counter values
    * are as accurate as possible.
    */
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
   for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
      struct tu_perf_query_raw_data *data = &perf_query->data[i];
@@ -1715,7 +1728,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
        if (data->pass != 0)
           tu_cond_exec_end(cs);
-        emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
+        emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
     }
     const struct fd_perfcntr_counter *counter =
@@ -1731,7 +1744,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
     tu_cond_exec_end(cs);
   last_pass = ~0;
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
   for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
      struct tu_perf_query_raw_data *data = &perf_query->data[i];
@@ -1742,7 +1755,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
        if (data->pass != 0)
           tu_cond_exec_end(cs);
-        emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
+        emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
     }
     result_iova = query_result_iova(pool, query, struct perfcntr_query_slot,
@@ -1796,7 +1809,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
   /* Wait for the profiled work to finish so that collected counter values
    * are as accurate as possible.
    */
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
   /* Collect the enabled perfcntrs. Emit CP_ALWAYS_COUNT collection first, if necessary. */
   if (perf_query->collection->cp_always_count_enabled) {
@@ -1822,7 +1835,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
       tu_cs_emit_qw(cs, end_iova);
    }
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
    for (uint32_t i = 0; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
       uint64_t result_iova = perf_query_derived_perfcntr_iova(pool, query, result, i);
@@ -1884,7 +1897,7 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
    tu_cs_emit_regs(cs, VPC_SO_QUERY_BASE(CHIP, .qword = end_iova));
    tu_emit_event_write(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
    tu_emit_event_write(cmdbuf, cs, FD_CACHE_CLEAN);
    /* Set the count of written primitives */
@@ -1936,7 +1949,7 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
                          CP_COND_REG_EXEC_0_BINNING);
    }
-   tu_cs_emit_wfi(cs);
+   emit_counter_barrier<CHIP>(cs);
    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(CINVOCATIONS)) |
@@ -2085,7 +2098,7 @@ tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
-      tu_cs_emit_wfi(cs);
+      emit_counter_barrier<CHIP>(cs);
   }
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc
index 638a6929ba4..cf1af9f4b74 100644
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@@ -1017,7 +1017,7 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
    /* Disable pushing constants for this stage if none were loaded in the
     * shader. If all stages don't load their declared push constants, as
     * is often the case under zink, then we could additionally skip
-    * emitting REG_A7XX_SP_SHARED_CONSTANT_GFX entirely.
+    * emitting SP_SHARED_CONSTANT_GFX entirely.
     */
    if (!shader_uses_push_consts(shader))
       const_state->push_consts = (struct tu_push_constant_range) {};
@@ -1502,6 +1502,7 @@ tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
    return size;
 }
+template <chip CHIP>
 void
 tu6_emit_xs(struct tu_crb &crb,
             struct tu_device *device,
@@ -1541,7 +1542,7 @@ tu6_emit_xs(struct tu_crb &crb,
               A6XX_SP_VS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
                                       .perwavememlayout = xs->pvtmem_per_wave));
      crb.add(A6XX_SP_VS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
-      if (device->physical_device->info->chip >= A7XX)
+      if (CHIP >= A7XX)
        crb.add(SP_VS_VGS_CNTL(A7XX, 0));
      break;
@@ -1560,7 +1561,7 @@ tu6_emit_xs(struct tu_crb &crb,
               A6XX_SP_HS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
                                       .perwavememlayout = xs->pvtmem_per_wave));
      crb.add(A6XX_SP_HS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
-      if (device->physical_device->info->chip >= A7XX)
+      if (CHIP >= A7XX)
        crb.add(SP_HS_VGS_CNTL(A7XX, 0));
      break;
@@ -1580,7 +1581,7 @@ tu6_emit_xs(struct tu_crb &crb,
               A6XX_SP_DS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
                                       .perwavememlayout = xs->pvtmem_per_wave));
      crb.add(A6XX_SP_DS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
-      if (device->physical_device->info->chip >= A7XX)
+      if (CHIP >= A7XX)
        crb.add(SP_DS_VGS_CNTL(A7XX, 0));
      break;
@@ -1599,7 +1600,7 @@ tu6_emit_xs(struct tu_crb &crb,
               A6XX_SP_GS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
                                       .perwavememlayout = xs->pvtmem_per_wave));
      crb.add(A6XX_SP_GS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
-      if (device->physical_device->info->chip >= A7XX)
+      if (CHIP >= A7XX)
        crb.add(SP_GS_VGS_CNTL(A7XX, 0));
      break;
@@ -1615,6 +1616,12 @@ tu6_emit_xs(struct tu_crb &crb,
               .inoutregoverlap = true, .pixlodenable = xs->need_pixlod,
               .earlypreamble = xs->early_preamble, .mergedregs = xs->mergedregs, ));
+      if (CHIP >= A8XX) {
+         crb.add(RB_PS_CNTL(CHIP,
+            .pixlodenable = xs->need_pixlod,
+            .lodpixmask = xs->need_full_quad,
+         ));
+      }
      crb.add(A6XX_SP_PS_INSTR_SIZE(xs->instrlen));
      crb.add(A6XX_SP_PS_PROGRAM_COUNTER_OFFSET(0));
      crb.add(A6XX_SP_PS_BASE(.qword = binary_iova));
@@ -1625,7 +1632,7 @@ tu6_emit_xs(struct tu_crb &crb,
               A6XX_SP_PS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
                                       .perwavememlayout = xs->pvtmem_per_wave));
      crb.add(A6XX_SP_PS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
-      if (device->physical_device->info->chip >= A7XX)
+      if (CHIP >= A7XX)
        crb.add(SP_PS_VGS_CNTL(A7XX, 0));
      break;
@@ -1650,7 +1657,7 @@ tu6_emit_xs(struct tu_crb &crb,
               A6XX_SP_CS_PVT_MEM_SIZE(.totalpvtmemsize = pvtmem->per_sp_size,
                                       .perwavememlayout = xs->pvtmem_per_wave));
      crb.add(A6XX_SP_CS_PVT_MEM_STACK_OFFSET(.offset = pvtmem->per_sp_size));
-      if (device->physical_device->info->chip >= A7XX)
+      if (CHIP >= A7XX)
        crb.add(SP_CS_VGS_CNTL(A7XX, 0));
      break;
@@ -1658,6 +1665,7 @@ tu6_emit_xs(struct tu_crb &crb,
      UNREACHABLE("bad shader stage");
   }
 }
+TU_GENX(tu6_emit_xs);
 void
 tu6_emit_xs_constants(
@@ -1782,7 +1790,7 @@ tu6_emit_cs_config(struct tu_cs *cs,
      crb.add(SP_UPDATE_CNTL(CHIP, .cs_state = true, .cs_uav = true,
                             .cs_shared_const = shared_consts_enable));
      tu6_emit_xs_config(crb, { .cs = v });
-      tu6_emit_xs(crb, cs->device, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
+      tu6_emit_xs<CHIP>(crb, cs->device, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
   }
   tu6_emit_xs_constants(cs, MESA_SHADER_COMPUTE, v, binary_iova);
@@ -1863,6 +1871,7 @@ tu6_emit_cs_config(struct tu_cs *cs,
 #define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
+template <chip CHIP>
 static void
 tu6_emit_vfd_dest(struct tu_cs *cs,
                   const struct ir3_shader_variant *vs)
@@ -1888,6 +1897,25 @@ tu6_emit_vfd_dest(struct tu_cs *cs,
                      .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
                      .decode_cnt = attr_count));
+   if (CHIP >= A8XX) {
+      const uint32_t vertexid_regid =
+            ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
+      const uint32_t instanceid_regid =
+            ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
+      const uint32_t viewid_regid =
+            ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
+
+      unsigned sideband_count =
+         (vertexid_regid != INVALID_REG) +
+         (instanceid_regid != INVALID_REG) +
+         (viewid_regid != INVALID_REG);
+
+      tu_cs_emit_regs(cs, PC_VS_INPUT_CNTL(CHIP,
+         .instr_cnt = attr_count,
+         .sideband_cnt = sideband_count,
+      ));
+   }
+
    if (attr_count)
       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
@@ -1990,6 +2018,11 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
      SP_REG_PROG_ID_3(CHIP,
                       .linelengthregid = 0xfc,
                       .foveationqualityregid = shading_rate_regid), );
+   if (CHIP >= A8XX) {
+      tu_cs_emit_regs(cs, RB_LB_PARAM_LIMIT(CHIP,
+         cs->device->physical_device->info->props.prim_alloc_threshold));
+   }
+
    if (CHIP >= A7XX) {
      uint32_t sysval_regs = 0;
      for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
@@ -2245,7 +2278,7 @@ tu6_emit_vs(struct tu_cs *cs,
      tu_cs_emit_regs(cs, VPC_STEREO_RENDERING_VIEWMASK(CHIP, view_mask));
   }
-   tu6_emit_vfd_dest(cs, vs);
+   tu6_emit_vfd_dest<CHIP>(cs, vs);
   const uint32_t vertexid_regid =
         ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
@@ -2404,7 +2437,7 @@ tu6_emit_variant(struct tu_cs *cs,
   }
   with_crb(cs) {
-      tu6_emit_xs(crb, cs->device, stage, xs, pvtmem_config, binary_iova);
+      tu6_emit_xs<CHIP>(crb, cs->device, stage, xs, pvtmem_config, binary_iova);
   }
   switch (stage) {
diff --git a/src/freedreno/vulkan/tu_shader.h b/src/freedreno/vulkan/tu_shader.h
index e42dc18b927..23b653c4e9c 100644
--- a/src/freedreno/vulkan/tu_shader.h
+++ b/src/freedreno/vulkan/tu_shader.h
@@ -151,6 +151,7 @@ tu_spirv_to_nir(struct tu_device *dev,
                 const struct tu_shader_key *key,
                 mesa_shader_stage stage);
+template <chip CHIP>
 void
 tu6_emit_xs(struct tu_crb &crb,
             struct tu_device *device,