diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index c2df8011527..2c1cd5e8152 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -555,6 +555,7 @@ static void handle_env_var_force_family(struct radeon_info *info) get_radeon_info(info, &ac_fake_hw_db[i]); info->name = "NOOP"; info->family_overridden = true; + info->chip_rev = 1; return; } } diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c index 83b74ea17d9..8a779de901d 100644 --- a/src/amd/common/ac_surface.c +++ b/src/amd/common/ac_surface.c @@ -3035,7 +3035,7 @@ static bool gfx12_compute_hiz_his_info(struct ac_addrlib *addrlib, const struct { assert(surf_in->flags.depth != surf_in->flags.stencil); - if (surf->flags & RADEON_SURF_NO_HTILE || (info->gfx_level == GFX12 && info->chip_rev <= 1)) + if (surf->flags & RADEON_SURF_NO_HTILE || (info->gfx_level == GFX12 && info->chip_rev == 0)) return true; ADDR3_COMPUTE_SURFACE_INFO_OUTPUT out = {0}; @@ -3109,7 +3109,11 @@ static bool gfx12_compute_miptree(struct ac_addrlib *addrlib, const struct radeo surf->surf_alignment_log2 = MAX2(surf->surf_alignment_log2, util_logbase2(out.baseAlign)); surf->surf_size = surf->u.gfx9.zs.stencil_offset + out.surfSize; - return gfx12_compute_hiz_his_info(addrlib, info, surf, &surf->u.gfx9.zs.his, in); + if (info->chip_rev >= 2 && + !gfx12_compute_hiz_his_info(addrlib, info, surf, &surf->u.gfx9.zs.his, in)) + return false; + + return true; } surf->u.gfx9.surf_slice_size = out.sliceSize; diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 630b5885ff3..c792d0fd2f8 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -514,6 +514,20 @@ __event_type == V_028A90_PIXEL_PIPE_STAT_CONTROL ? 1 : 0)); \ } while (0) +#define radeon_emit_alt_hiz_logic() do { \ + static_assert(GFX_VERSION == GFX12 || !ALT_HIZ_LOGIC, ""); \ + if (GFX_VERSION == GFX12 && ALT_HIZ_LOGIC) { \ + radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0)); \ + radeon_emit(S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | S_490_EVENT_INDEX(5)); \ + radeon_emit(0); /* DST_SEL, INT_SEL = no write confirm, DATA_SEL = no data */ \ + radeon_emit(0); /* ADDRESS_LO */ \ + radeon_emit(0); /* ADDRESS_HI */ \ + radeon_emit(0); /* DATA_LO */ \ + radeon_emit(0); /* DATA_HI */ \ + radeon_emit(0); /* INT_CTXID */ \ + } \ +} while (0) + /* This should be evaluated at compile time if all parameters are constants. */ static ALWAYS_INLINE unsigned si_get_user_data_base(enum amd_gfx_level gfx_level, enum si_has_tess has_tess, diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h index f645d34030c..b025293b771 100644 --- a/src/gallium/drivers/radeonsi/si_debug_options.h +++ b/src/gallium/drivers/radeonsi/si_debug_options.h @@ -22,6 +22,7 @@ OPT_BOOL(dcc_msaa, true, "Enable DCC for MSAA for GFX10-10.3") OPT_BOOL(zerovram, false, "Zero all VRAM allocations") OPT_BOOL(clear_lds, false, "Clear LDS at the end of shaders. Might decrease performance.") OPT_BOOL(cache_rb_gl2, false, "Enable GL2 caching for CB and DB.") +OPT_BOOL(alt_hiz_logic, false, "Enable alternative HiZ logic") #undef OPT_BOOL #undef OPT_INT diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index f6abbe41b79..9084d128a7e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -382,6 +382,7 @@ struct si_texture { bool can_sample_z : 1; bool can_sample_s : 1; bool need_flush_after_depth_decompression: 1; + bool force_disable_hiz_his : 1; /* We need to track DCC dirtiness, because st/dri usually calls * flush_resource twice per frame (not a bug) and we don't wanna diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index d96695faf91..97712bbb840 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1694,6 +1694,20 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } + struct pipe_surface *zssurf = sctx->framebuffer.state.zsbuf; + struct si_texture *zstex = (struct si_texture*)(zssurf ? zssurf->texture : NULL); + + if (sctx->gfx_level == GFX12 && !sctx->screen->options.alt_hiz_logic && + sctx->framebuffer.has_stencil && dsa->stencil_enabled && !zstex->force_disable_hiz_his) { + zstex->force_disable_hiz_his = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + + if (sctx->framebuffer.has_hiz_his) { + sctx->framebuffer.has_hiz_his = false; + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + } + } + if (old_dsa->alpha_func != dsa->alpha_func) { si_ps_key_update_dsa(sctx); si_update_ps_inputs_read_or_disabled(sctx); @@ -2690,9 +2704,14 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (util_format_has_stencil(util_format_description(zstex->buffer.b.b.format))) sctx->framebuffer.has_stencil = true; + if (sctx->gfx_level == GFX12 && !sctx->screen->options.alt_hiz_logic && + sctx->framebuffer.has_stencil && sctx->queued.named.dsa->stencil_enabled) + zstex->force_disable_hiz_his = true; + if (sctx->gfx_level >= GFX12) { - sctx->framebuffer.has_hiz_his = zstex->surface.u.gfx9.zs.hiz.offset || - zstex->surface.u.gfx9.zs.his.offset; + sctx->framebuffer.has_hiz_his = (zstex->surface.u.gfx9.zs.hiz.offset || + zstex->surface.u.gfx9.zs.his.offset) && + !zstex->force_disable_hiz_his; } } @@ -3319,18 +3338,24 @@ static void gfx12_emit_framebuffer_state(struct si_context *sctx, unsigned index gfx12_set_context_reg(R_028034_DB_STENCIL_READ_BASE_HI, zb->ds.db_stencil_base >> 32); gfx12_set_context_reg(R_028038_DB_STENCIL_WRITE_BASE, zb->ds.db_stencil_base); gfx12_set_context_reg(R_02803C_DB_STENCIL_WRITE_BASE_HI, zb->ds.db_stencil_base >> 32); - gfx12_set_context_reg(R_028B94_PA_SC_HIZ_INFO, zb->ds.u.gfx12.hiz_info); - gfx12_set_context_reg(R_028B98_PA_SC_HIS_INFO, zb->ds.u.gfx12.his_info); - if (zb->ds.u.gfx12.hiz_info) { - gfx12_set_context_reg(R_028B9C_PA_SC_HIZ_BASE, zb->ds.u.gfx12.hiz_base); - gfx12_set_context_reg(R_028BA0_PA_SC_HIZ_BASE_EXT, zb->ds.u.gfx12.hiz_base >> 32); - gfx12_set_context_reg(R_028BA4_PA_SC_HIZ_SIZE_XY, zb->ds.u.gfx12.hiz_size_xy); - } - if (zb->ds.u.gfx12.his_info) { - gfx12_set_context_reg(R_028BA8_PA_SC_HIS_BASE, zb->ds.u.gfx12.his_base); - gfx12_set_context_reg(R_028BAC_PA_SC_HIS_BASE_EXT, zb->ds.u.gfx12.his_base >> 32); - gfx12_set_context_reg(R_028BB0_PA_SC_HIS_SIZE_XY, zb->ds.u.gfx12.his_size_xy); + if (tex->force_disable_hiz_his) { + gfx12_set_context_reg(R_028B94_PA_SC_HIZ_INFO, S_028B94_SURFACE_ENABLE(0)); + gfx12_set_context_reg(R_028B98_PA_SC_HIS_INFO, S_028B98_SURFACE_ENABLE(0)); + } else { + gfx12_set_context_reg(R_028B94_PA_SC_HIZ_INFO, zb->ds.u.gfx12.hiz_info); + gfx12_set_context_reg(R_028B98_PA_SC_HIS_INFO, zb->ds.u.gfx12.his_info); + + if (zb->ds.u.gfx12.hiz_info) { + gfx12_set_context_reg(R_028B9C_PA_SC_HIZ_BASE, zb->ds.u.gfx12.hiz_base); + gfx12_set_context_reg(R_028BA0_PA_SC_HIZ_BASE_EXT, zb->ds.u.gfx12.hiz_base >> 32); + gfx12_set_context_reg(R_028BA4_PA_SC_HIZ_SIZE_XY, zb->ds.u.gfx12.hiz_size_xy); + } + if (zb->ds.u.gfx12.his_info) { + gfx12_set_context_reg(R_028BA8_PA_SC_HIS_BASE, zb->ds.u.gfx12.his_base); + gfx12_set_context_reg(R_028BAC_PA_SC_HIS_BASE_EXT, zb->ds.u.gfx12.his_base >> 32); + gfx12_set_context_reg(R_028BB0_PA_SC_HIS_SIZE_XY, zb->ds.u.gfx12.his_size_xy); + } } } else if (sctx->framebuffer.dirty_zsbuf) { gfx12_set_context_reg(R_028018_DB_Z_INFO, diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 9e14749565b..f75089555ba 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -837,6 +837,11 @@ enum si_has_sh_pairs_packed { HAS_SH_PAIRS_PACKED_ON, }; +enum si_alt_hiz_logic { + ALT_HIZ_LOGIC_OFF, + ALT_HIZ_LOGIC_ON, +}; + template ALWAYS_INLINE static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *indirect, enum mesa_prim prim, @@ -1231,7 +1236,8 @@ void si_emit_buffered_compute_sh_regs(struct si_context *sctx) #endif template ALWAYS_INLINE + si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED, + si_alt_hiz_logic ALT_HIZ_LOGIC> ALWAYS_INLINE static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, unsigned drawid_base, const struct pipe_draw_indirect_info *indirect, @@ -1423,6 +1429,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit((sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); radeon_emit((sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); radeon_emit(di_src_sel); + + radeon_emit_alt_hiz_logic(); } else { uint64_t count_va = 0; @@ -1448,6 +1456,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(count_va >> 32); radeon_emit(indirect->stride); radeon_emit(di_src_sel); + + radeon_emit_alt_hiz_logic(); } } else { if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN || @@ -1567,6 +1577,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(va >> 32); radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ + + radeon_emit_alt_hiz_logic(); } if (num_draws > 1) { BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ @@ -1586,6 +1598,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(va >> 32); radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ + + radeon_emit_alt_hiz_logic(); } if (num_draws > 1) { BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg + 1); /* DrawID */ @@ -1606,6 +1620,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(va >> 32); radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ + + radeon_emit_alt_hiz_logic(); } if (num_draws > 1) { BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ @@ -1633,6 +1649,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && GFX_VERSION < GFX12 && i < num_draws - 1)); + + radeon_emit_alt_hiz_logic(); } } } @@ -1646,6 +1664,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(0); radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); + radeon_emit_alt_hiz_logic(); for (unsigned i = 0; i < 3; i++) radeon_event_write(V_028A90_SQ_NON_EVENT); } else if (increment_draw_id) { @@ -1661,6 +1680,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); + + radeon_emit_alt_hiz_logic(); } if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) { BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ @@ -1674,6 +1695,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); + + radeon_emit_alt_hiz_logic(); } if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) { BITSET_CLEAR(sctx->tracked_regs.reg_saved_mask, tracked_base_vertex_reg); /* BaseVertex */ @@ -2030,7 +2053,7 @@ static void si_emit_all_states(struct si_context *sctx, uint64_t skip_atom_mask) template ALWAYS_INLINE + util_popcnt POPCNT, si_alt_hiz_logic ALT_HIZ_LOGIC> ALWAYS_INLINE static void si_draw(struct pipe_context *ctx, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -2348,7 +2371,8 @@ static void si_draw(struct pipe_context *ctx, return; } - si_emit_draw_packets + si_emit_draw_packets (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf, index_size, index_offset, instance_count); /* <-- CUs start to get busy here if we waited. */ @@ -2400,7 +2424,7 @@ static void si_draw(struct pipe_context *ctx, } template + si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED, si_alt_hiz_logic ALT_HIZ_LOGIC> static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -2408,12 +2432,14 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_start_count_bias *draws, unsigned num_draws) { - si_draw + si_draw (ctx, info, drawid_offset, indirect, draws, num_draws, NULL, 0); } template + si_has_sh_pairs_packed HAS_SH_PAIRS_PACKED, util_popcnt POPCNT, + si_alt_hiz_logic ALT_HIZ_LOGIC> static void si_draw_vertex_state(struct pipe_context *ctx, struct pipe_vertex_state *vstate, uint32_t partial_velem_mask, @@ -2429,7 +2455,8 @@ static void si_draw_vertex_state(struct pipe_context *ctx, dinfo.instance_count = 1; dinfo.index.resource = state->b.input.indexbuf; - si_draw + si_draw (ctx, &dinfo, 0, NULL, draws, num_draws, vstate, partial_velem_mask); if (info.take_vertex_state_ownership) @@ -2491,18 +2518,28 @@ static void si_init_draw_vbo(struct si_context *sctx) if (!NGG && GFX_VERSION >= GFX11) return; - if (GFX_VERSION >= GFX11 && GFX_VERSION < GFX12 && sctx->screen->info.has_set_sh_pairs_packed) { + if (GFX_VERSION == GFX12 && sctx->screen->options.alt_hiz_logic) { sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] = - si_draw_vbo; + si_draw_vbo; sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] = - si_draw_vertex_state; + si_draw_vertex_state; + } else if (GFX_VERSION >= GFX11 && GFX_VERSION < GFX12 && + sctx->screen->info.has_set_sh_pairs_packed) { + sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] = + si_draw_vbo; + + sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] = + si_draw_vertex_state; } else { sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] = - si_draw_vbo; + si_draw_vbo; sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] = - si_draw_vertex_state; + si_draw_vertex_state; } }