radeonsi: use a C++ template to decrease draw_vbo overhead by 13%

Measured with GALLIUM_THREAD=0 to disable draw merging.

Before:
   1, DrawElements ( 1 VBO| 0 UBO|  0    ) w/ no state change,                 8736

After:
   1, DrawElements ( 1 VBO| 0 UBO|  0    ) w/ no state change,                10059
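
The speedup comes from hoisting per-draw state checks to bind time: the chip
generation and the shader-stage combination become template parameters of
si_draw_vbo, one instantiation is compiled per combination, and the matching
instantiation is fetched from a table whenever the relevant state changes.
Below is a minimal standalone sketch of that pattern, not the driver's code:
it uses a three-dimensional key instead of the driver's five and hypothetical
names (context, draw, select_draw), with puts() standing in for real packet
emission.

   #include <cassert>
   #include <cstdio>

   enum gfx_version { GFX6, GFX7, GFX8, GFX9, GFX10, GFX10_3, NUM_GFX_VERSIONS };
   enum has_tess { TESS_OFF, TESS_ON };
   enum has_gs { GS_OFF, GS_ON };

   struct context;
   typedef void (*draw_func)(context *ctx);

   struct context {
      gfx_version chip;
      bool tess_enabled;
      bool gs_enabled;
      draw_func draw;                          /* currently selected fast path */
      draw_func table[NUM_GFX_VERSIONS][2][2]; /* one entry per instantiation */
   };

   /* Every check below is a template parameter, i.e. a compile-time constant,
    * so the compiler deletes the untaken branches in each instantiation. */
   template <gfx_version CHIP, has_tess TESS, has_gs GS>
   static void draw(context *)
   {
      if (TESS)
         puts("emit tessellation state");
      if (GS)
         puts("emit GS state");
      if (CHIP >= GFX10)
         puts("emit GE_CNTL");
      else
         puts("emit IA_MULTI_VGT_PARAM");
   }

   template <gfx_version CHIP, has_tess TESS, has_gs GS>
   static void init_draw(context *ctx)
   {
      ctx->table[CHIP][TESS][GS] = draw<CHIP, TESS, GS>;
   }

   template <gfx_version CHIP>
   static void init_draw_all_options(context *ctx)
   {
      init_draw<CHIP, TESS_OFF, GS_OFF>(ctx);
      init_draw<CHIP, TESS_OFF, GS_ON>(ctx);
      init_draw<CHIP, TESS_ON, GS_OFF>(ctx);
      init_draw<CHIP, TESS_ON, GS_ON>(ctx);
   }

   /* Called when shaders are bound; the draw path itself stays branch-free. */
   static void select_draw(context *ctx)
   {
      ctx->draw = ctx->table[ctx->chip][ctx->tess_enabled][ctx->gs_enabled];
      assert(ctx->draw);
   }

   int main()
   {
      context ctx = {};
      init_draw_all_options<GFX9>(&ctx);  /* a real table covers every chip */
      init_draw_all_options<GFX10>(&ctx);

      ctx.chip = GFX10;
      ctx.gs_enabled = true;
      select_draw(&ctx);
      ctx.draw(&ctx); /* runs draw<GFX10, TESS_OFF, GS_ON> */
      return 0;
   }

In the driver, the key also includes NGG and the primitive discard compute
shader, si_init_draw_vbo_all_families() fills the table at context creation,
and si_select_draw_vbo() re-selects the entry from the shader-bind functions,
as the diff below shows.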

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7807>
Marek Olšák 2020-08-26 00:16:23 -04:00
parent 6347b0b5c4
commit 2b09bde1f5
4 changed files with 262 additions and 140 deletions


@@ -126,6 +126,8 @@ enum chip_class
GFX9,
GFX10,
GFX10_3,
NUM_GFX_VERSIONS,
};
enum ring_type


@@ -131,6 +131,26 @@ extern "C" {
#define SI_RESOURCE_FLAG_UNCACHED (PIPE_RESOURCE_FLAG_DRV_PRIV << 12)
#define SI_RESOURCE_FLAG_DRIVER_INTERNAL (PIPE_RESOURCE_FLAG_DRV_PRIV << 13)
enum si_has_gs {
GS_OFF,
GS_ON,
};
enum si_has_tess {
TESS_OFF,
TESS_ON,
};
enum si_has_ngg {
NGG_OFF,
NGG_ON,
};
enum si_has_prim_discard_cs {
PRIM_DISCARD_CS_OFF,
PRIM_DISCARD_CS_ON,
};
enum si_clear_code
{
DCC_CLEAR_COLOR_0000 = 0x00000000,
@@ -884,6 +904,12 @@ struct si_small_prim_cull_info {
float scale[2], translate[2];
};
typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count *draws,
unsigned num_draws);
struct si_context {
struct pipe_context b; /* base class */
@@ -1268,6 +1294,8 @@ struct si_context {
* a context flush.
*/
struct hash_table *dirty_implicit_resources;
pipe_draw_vbo_func draw_vbo[NUM_GFX_VERSIONS - GFX6][2][2][2][2];
};
/* si_blit.c */
@@ -1912,6 +1940,16 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
shader->key.opt.vs_as_prim_discard_cs);
}
static inline void si_select_draw_vbo(struct si_context *sctx)
{
sctx->b.draw_vbo = sctx->draw_vbo[sctx->chip_class - GFX6]
[!!sctx->tes_shader.cso]
[!!sctx->gs_shader.cso]
[sctx->ngg]
[si_compute_prim_discard_enabled(sctx)];
assert(sctx->b.draw_vbo);
}
#define PRINT_ERR(fmt, args...) \
fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)


@@ -521,7 +521,7 @@ static bool num_instanced_prims_less_than(const struct pipe_draw_info *info,
}
}
ALWAYS_INLINE
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> ALWAYS_INLINE
static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
@@ -533,9 +533,9 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
unsigned primgroup_size;
unsigned ia_multi_vgt_param;
if (sctx->tes_shader.cso) {
if (HAS_TESS) {
primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
} else if (sctx->gs_shader.cso) {
} else if (HAS_GS) {
primgroup_size = 64; /* recommended with a GS */
} else {
primgroup_size = 128; /* recommended without a GS and tess */
@@ -552,9 +552,9 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
ia_multi_vgt_param =
sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
if (sctx->gs_shader.cso) {
if (HAS_GS) {
/* GS requirement. */
if (sctx->chip_class <= GFX8 &&
if (GFX_VERSION <= GFX8 &&
SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
@@ -562,7 +562,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
* The hw doc says all multi-SE chips are affected, but Vulkan
* only applies it to Hawaii. Do what Vulkan does.
*/
if (sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
if (GFX_VERSION == GFX7 &&
sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
num_instanced_prims_less_than(info, indirect, prim, min_vertex_count, instance_count, 2))
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
}
@@ -597,7 +598,7 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode)
}
/* rast_prim is the primitive type after GS. */
ALWAYS_INLINE
template<si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
static void si_emit_rasterizer_prim_state(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
@@ -620,7 +621,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
}
unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);
if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (sctx->ngg || sctx->gs_shader.cso))) {
if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (NGG || HAS_GS))) {
radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
sctx->last_gs_out_prim = gs_out_prim;
}
@@ -628,7 +629,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
if (initial_cdw != cs->current.cdw)
sctx->context_roll = true;
if (sctx->ngg) {
if (NGG) {
struct si_shader *hw_vs = si_get_vs_state(sctx);
if (hw_vs->uses_vs_state_provoking_vertex) {
@@ -696,7 +697,7 @@ static bool si_prim_restart_index_changed(struct si_context *sctx, bool primitiv
sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
}
ALWAYS_INLINE
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> ALWAYS_INLINE
static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
enum pipe_prim_type prim, unsigned num_patches,
@@ -707,15 +708,16 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pip
unsigned ia_multi_vgt_param;
ia_multi_vgt_param =
si_get_ia_multi_vgt_param(sctx, info, indirect, prim, num_patches, instance_count,
primitive_restart, min_vertex_count);
si_get_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS>
(sctx, info, indirect, prim, num_patches, instance_count, primitive_restart,
min_vertex_count);
/* Draw state. */
if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
if (sctx->chip_class == GFX9)
if (GFX_VERSION == GFX9)
radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030960_IA_MULTI_VGT_PARAM, 4,
ia_multi_vgt_param);
else if (sctx->chip_class >= GFX7)
else if (GFX_VERSION >= GFX7)
radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
else
radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
@@ -727,14 +729,14 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pip
/* GFX10 removed IA_MULTI_VGT_PARAM in exchange for GE_CNTL.
* We overload last_multi_vgt_param.
*/
ALWAYS_INLINE
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
{
union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
unsigned ge_cntl;
if (sctx->ngg) {
if (sctx->tes_shader.cso) {
if (NGG) {
if (HAS_TESS) {
ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) |
S_03096C_VERT_GRP_SIZE(0) |
S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id);
@@ -745,10 +747,10 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
unsigned primgroup_size;
unsigned vertgroup_size;
if (sctx->tes_shader.cso) {
if (HAS_TESS) {
primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
vertgroup_size = 0;
} else if (sctx->gs_shader.cso) {
} else if (HAS_GS) {
unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl;
primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
vertgroup_size = G_028A44_ES_VERTS_PER_SUBGRP(vgt_gs_onchip_cntl);
@@ -769,7 +771,7 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
}
}
ALWAYS_INLINE
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
enum pipe_prim_type prim, unsigned num_patches,
@@ -779,16 +781,17 @@ static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_dr
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
unsigned vgt_prim = si_conv_pipe_prim(prim);
if (sctx->chip_class >= GFX10)
gfx10_emit_ge_cntl(sctx, num_patches);
if (GFX_VERSION >= GFX10)
gfx10_emit_ge_cntl<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx, num_patches);
else
si_emit_ia_multi_vgt_param(sctx, info, indirect, prim, num_patches, instance_count,
primitive_restart, min_vertex_count);
si_emit_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS>
(sctx, info, indirect, prim, num_patches, instance_count, primitive_restart,
min_vertex_count);
if (vgt_prim != sctx->last_prim) {
if (sctx->chip_class >= GFX10)
if (GFX_VERSION >= GFX10)
radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
else if (sctx->chip_class >= GFX7)
else if (GFX_VERSION >= GFX7)
radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
else
radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);
@@ -798,7 +801,7 @@ static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_dr
/* Primitive restart. */
if (primitive_restart != sctx->last_primitive_restart_en) {
if (sctx->chip_class >= GFX9)
if (GFX_VERSION >= GFX9)
radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
else
radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
@@ -812,6 +815,7 @@ static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_dr
}
}
template <chip_class GFX_VERSION, si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count *draws,
@@ -852,19 +856,19 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
case 2:
index_type =
V_028A7C_VGT_INDEX_16 |
(SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
(SI_BIG_ENDIAN && GFX_VERSION <= GFX7 ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
break;
case 4:
index_type =
V_028A7C_VGT_INDEX_32 |
(SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
(SI_BIG_ENDIAN && GFX_VERSION <= GFX7 ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
break;
default:
assert(!"unreachable");
return;
}
if (sctx->chip_class >= GFX9) {
if (GFX_VERSION >= GFX9) {
radeon_set_uconfig_reg_idx(cs, sctx->screen, R_03090C_VGT_INDEX_TYPE, 2, index_type);
} else {
radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
@@ -874,7 +878,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
sctx->last_index_size = index_size;
}
if (original_index_size) {
/* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. */
if (!ALLOW_PRIM_DISCARD_CS || original_index_size) {
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
/* Skip draw calls with 0-sized index buffers.
* They cause a hang on some chips, like Navi10-14.
@@ -891,7 +896,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
* so the state must be re-emitted before the next indexed draw.
*/
if (sctx->chip_class >= GFX7)
if (GFX_VERSION >= GFX7)
sctx->last_index_size = -1;
}
@@ -1018,7 +1023,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
set_draw_id &= info->increment_draw_id;
if (index_size) {
if (dispatch_prim_discard_cs) {
if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {
for (unsigned i = 0; i < num_draws; i++) {
uint64_t va = index_va + draws[0].start * original_index_size;
@@ -1049,13 +1054,13 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
* can be changed between draws and GS fast launch must be disabled.
* NOT_EOP doesn't work on gfx9 and older.
*/
S_0287F0_NOT_EOP(sctx->chip_class >= GFX10 &&
S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 &&
!set_draw_id &&
i < num_draws - 1));
}
} else {
/* Set the index buffer for fast launch. The VS prolog will load the indices. */
if (sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {
if (NGG && sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf),
@@ -1587,7 +1592,7 @@ void si_emit_cache_flush(struct si_context *sctx)
sctx->flags = 0;
}
ALWAYS_INLINE
template <chip_class GFX_VERSION> ALWAYS_INLINE
static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
unsigned i, count = sctx->num_vertex_elements;
@@ -1651,7 +1656,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
uint64_t va = buf->gpu_address + offset;
int64_t num_records = (int64_t)buf->b.b.width0 - offset;
if (sctx->chip_class != GFX8 && vb->stride) {
if (GFX_VERSION != GFX8 && vb->stride) {
/* Round up by rounding down and adding 1 */
num_records = (num_records - velems->format_size[i]) / vb->stride + 1;
}
@@ -1663,7 +1668,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
* - 1: index >= NUM_RECORDS (Structured)
* - 3: offset >= NUM_RECORDS (Raw)
*/
if (sctx->chip_class >= GFX10)
if (GFX_VERSION >= GFX10)
rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED
: V_008F0C_OOB_SELECT_RAW);
@@ -1760,6 +1765,7 @@ static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_d
}
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
enum pipe_prim_type prim, unsigned instance_count,
@@ -1768,8 +1774,8 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
{
unsigned num_patches = 0;
si_emit_rasterizer_prim_state(sctx);
if (sctx->tes_shader.cso)
si_emit_rasterizer_prim_state<HAS_GS, NGG>(sctx);
if (HAS_TESS)
si_emit_derived_tess_state(sctx, info, &num_patches);
/* Emit state atoms. */
@@ -1795,8 +1801,9 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
/* Emit draw states. */
si_emit_vs_state(sctx, info);
si_emit_draw_registers(sctx, info, indirect, prim, num_patches, instance_count,
primitive_restart, min_vertex_count);
si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
(sctx, info, indirect, prim, num_patches, instance_count, primitive_restart,
min_vertex_count);
}
static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf)
@@ -1891,6 +1898,8 @@ static ALWAYS_INLINE bool pd_msg(const char *s)
pipe_resource_reference(&indexbuf, NULL); \
} while (0)
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
static void si_draw_vbo(struct pipe_context *ctx,
const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
@@ -1920,7 +1929,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
struct si_shader_selector *vs = sctx->vs_shader.cso;
if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs ||
(!sctx->ps_shader.cso && !rs->rasterizer_discard) ||
(!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) {
(HAS_TESS != (prim == PIPE_PRIM_PATCHES)))) {
assert(0);
return;
}
@@ -1949,10 +1958,10 @@ static void si_draw_vbo(struct pipe_context *ctx,
* This must be done after si_decompress_textures, which can call
* draw_vbo recursively, and before si_update_shaders, which uses
* current_rast_prim for this draw_vbo call. */
if (sctx->gs_shader.cso) {
if (HAS_GS) {
/* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
rast_prim = sctx->gs_shader.cso->rast_prim;
} else if (sctx->tes_shader.cso) {
} else if (HAS_TESS) {
/* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
rast_prim = sctx->tes_shader.cso->rast_prim;
} else if (util_rast_prim_is_triangles(prim)) {
@@ -1971,12 +1980,12 @@ static void si_draw_vbo(struct pipe_context *ctx,
sctx->do_update_shaders = true;
}
if (sctx->tes_shader.cso) {
if (HAS_TESS) {
struct si_shader_selector *tcs = sctx->tcs_shader.cso;
/* The rarely occurring tcs == NULL case is not optimized. */
bool same_patch_vertices =
sctx->chip_class >= GFX9 &&
GFX_VERSION >= GFX9 &&
tcs && info->vertices_per_patch == tcs->info.base.tess.tcs_vertices_out;
if (sctx->same_patch_vertices != same_patch_vertices) {
@@ -1984,7 +1993,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
sctx->do_update_shaders = true;
}
if (sctx->screen->info.has_ls_vgpr_init_bug) {
if (GFX_VERSION == GFX9 && sctx->screen->info.has_ls_vgpr_init_bug) {
/* Determine whether the LS VGPR fix should be applied.
*
* It is only required when num input CPs > num output CPs,
@ -2002,7 +2011,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
}
if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) {
if (GFX_VERSION <= GFX9 && HAS_GS) {
/* Determine whether the GS triangle strip adjacency fix should
* be applied. Rotate every other triangle if
* - triangle strips with adjacency are fed to the GS and
@@ -2010,7 +2019,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
* when the restart occurs after an odd number of triangles).
*/
bool gs_tri_strip_adj_fix =
!sctx->tes_shader.cso && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && !primitive_restart;
!HAS_TESS && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && !primitive_restart;
if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
@@ -2021,7 +2030,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
if (index_size) {
/* Translate or upload, if needed. */
/* 8-bit indices are supported on GFX8. */
if (sctx->chip_class <= GFX7 && index_size == 1) {
if (GFX_VERSION <= GFX7 && index_size == 1) {
unsigned start, count, start_offset, size, offset;
void *ptr;
@@ -2056,7 +2065,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
/* info->start will be added by the drawing code */
index_offset -= start_offset;
} else if (sctx->chip_class <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {
} else if (GFX_VERSION <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {
/* GFX8 reads index buffers through TC L2, so it doesn't
* need this. */
sctx->flags |= SI_CONTEXT_WB_L2;
@@ -2077,7 +2086,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
si_context_add_resource_size(sctx, indirect->buffer);
/* Indirect buffers use TC L2 on GFX9, but not older hw. */
if (sctx->chip_class <= GFX8) {
if (GFX_VERSION <= GFX8) {
if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
sctx->flags |= SI_CONTEXT_WB_L2;
si_resource(indirect->buffer)->TC_L2_dirty = false;
@@ -2101,7 +2110,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
/* Determine if we can use the primitive discard compute shader. */
if (si_compute_prim_discard_enabled(sctx) &&
if (ALLOW_PRIM_DISCARD_CS &&
(avg_direct_count > sctx->prim_discard_vertex_count_threshold
? (sctx->compute_num_verts_rejected += total_direct_count, true)
: /* Add, then return true. */
@@ -2130,8 +2139,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
(!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
pd_msg("pipestat or primgen query")) &&
(!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
(!sctx->tes_shader.cso || pd_msg("uses tess")) &&
(!sctx->gs_shader.cso || pd_msg("uses GS")) &&
(!HAS_TESS || pd_msg("uses tess")) &&
(!HAS_GS || pd_msg("uses GS")) &&
(!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
!rs->polygon_mode_enabled &&
#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
@@ -2176,64 +2185,66 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
}
if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
if (ALLOW_PRIM_DISCARD_CS &&
prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
sctx->do_update_shaders = true;
}
/* Update NGG culling settings. */
uint8_t old_ngg_culling = sctx->ngg_culling;
struct si_shader_selector *hw_vs;
if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
(hw_vs = si_get_vs(sctx)->cso) &&
(avg_direct_count > hw_vs->ngg_cull_vert_threshold ||
(!index_size &&
avg_direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
prim & ((1 << PIPE_PRIM_TRIANGLES) |
(1 << PIPE_PRIM_TRIANGLE_STRIP))))) {
uint8_t ngg_culling = 0;
if (GFX_VERSION >= GFX10) {
struct si_shader_selector *hw_vs;
if (NGG && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
(hw_vs = si_get_vs(sctx)->cso) &&
(avg_direct_count > hw_vs->ngg_cull_vert_threshold ||
(!index_size &&
avg_direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
prim & ((1 << PIPE_PRIM_TRIANGLES) |
(1 << PIPE_PRIM_TRIANGLE_STRIP))))) {
uint8_t ngg_culling = 0;
if (rs->rasterizer_discard) {
ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE;
} else {
/* Polygon mode can't use view and small primitive culling,
* because it draws points or lines where the culling depends
* on the point or line width.
*/
if (!rs->polygon_mode_enabled)
ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS;
if (rs->rasterizer_discard) {
ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE;
} else {
/* Polygon mode can't use view and small primitive culling,
* because it draws points or lines where the culling depends
* on the point or line width.
*/
if (!rs->polygon_mode_enabled)
ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS;
if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front)
ngg_culling |= SI_NGG_CULL_FRONT_FACE;
if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
ngg_culling |= SI_NGG_CULL_BACK_FACE;
}
/* Use NGG fast launch for certain primitive types.
* A draw must have at least 1 full primitive.
*/
if (ngg_culling &&
hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold < UINT32_MAX &&
min_direct_count >= 3 && !sctx->tes_shader.cso &&
!sctx->gs_shader.cso) {
if (prim == PIPE_PRIM_TRIANGLES && !index_size) {
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
} else if (prim == PIPE_PRIM_TRIANGLE_STRIP && !primitive_restart) {
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP |
SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3));
/* The index buffer will be emulated. */
index_size = 0;
if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front)
ngg_culling |= SI_NGG_CULL_FRONT_FACE;
if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
ngg_culling |= SI_NGG_CULL_BACK_FACE;
}
}
if (ngg_culling != old_ngg_culling) {
/* If shader compilation is not ready, this setting will be rejected. */
sctx->ngg_culling = ngg_culling;
/* Use NGG fast launch for certain primitive types.
* A draw must have at least 1 full primitive.
*/
if (ngg_culling &&
hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold < UINT32_MAX &&
min_direct_count >= 3 && !HAS_TESS && !HAS_GS) {
if (prim == PIPE_PRIM_TRIANGLES && !index_size) {
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
} else if (prim == PIPE_PRIM_TRIANGLE_STRIP && !primitive_restart) {
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP |
SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3));
/* The index buffer will be emulated. */
index_size = 0;
}
}
if (ngg_culling != old_ngg_culling) {
/* If shader compilation is not ready, this setting will be rejected. */
sctx->ngg_culling = ngg_culling;
sctx->do_update_shaders = true;
}
} else if (old_ngg_culling) {
sctx->ngg_culling = 0;
sctx->do_update_shaders = true;
}
} else if (old_ngg_culling) {
sctx->ngg_culling = false;
sctx->do_update_shaders = true;
}
if (sctx->shader_has_inlinable_uniforms_mask &
@@ -2255,29 +2266,31 @@ static void si_draw_vbo(struct pipe_context *ctx,
*
* This is the setting that is used by the draw.
*/
uint8_t ngg_culling = si_get_vs(sctx)->current->key.opt.ngg_culling;
if (sctx->chip_class == GFX10 &&
!(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) &&
ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
if (GFX_VERSION >= GFX10) {
uint8_t ngg_culling = si_get_vs(sctx)->current->key.opt.ngg_culling;
if (GFX_VERSION == GFX10 &&
!(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) &&
ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&
!(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) {
/* Need to re-set these, because we have bound an index buffer there. */
sctx->shader_pointers_dirty |=
(1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) |
(1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY));
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&
!(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) {
/* Need to re-set these, because we have bound an index buffer there. */
sctx->shader_pointers_dirty |=
(1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) |
(1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY));
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
}
/* Set this to the correct value determined by si_update_shaders. */
sctx->ngg_culling = ngg_culling;
}
/* Set this to the correct value determined by si_update_shaders. */
sctx->ngg_culling = ngg_culling;
}
si_need_gfx_cs_space(sctx, num_draws);
/* If we're using a secure context, determine if cs must be secure or not */
if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
if (GFX_VERSION >= GFX9 && unlikely(radeon_uses_secure_bos(sctx->ws))) {
bool secure = si_gfx_resources_check_encrypted(sctx);
if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
@@ -2295,7 +2308,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
if (unlikely(!si_upload_graphics_shader_descriptors(sctx) ||
(sctx->vertex_buffers_dirty &&
sctx->num_vertex_elements &&
!si_upload_vertex_buffer_descriptors(sctx)))) {
!si_upload_vertex_buffer_descriptors<GFX_VERSION>(sctx)))) {
DRAW_CLEANUP;
return;
}
@@ -2307,7 +2320,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
unsigned masked_atoms = 0;
bool gfx9_scissor_bug = false;
if (sctx->screen->info.has_gfx9_scissor_bug) {
if (GFX_VERSION == GFX9 && sctx->screen->info.has_gfx9_scissor_bug) {
masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
gfx9_scissor_bug = true;
@@ -2329,8 +2342,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
/* Emit all states except possibly render condition. */
si_emit_all_states(sctx, info, indirect, prim, instance_count, min_direct_count,
primitive_restart, masked_atoms);
si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
(sctx, info, indirect, prim, instance_count, min_direct_count,
primitive_restart, masked_atoms);
sctx->emit_cache_flush(sctx);
/* <-- CUs are idle here. */
@@ -2339,22 +2353,23 @@ static void si_draw_vbo(struct pipe_context *ctx,
sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
}
if (gfx9_scissor_bug &&
if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
(sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
sctx->atoms.s.scissors.emit(sctx);
sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
}
assert(sctx->dirty_atoms == 0);
si_emit_draw_packets(sctx, info, indirect, draws, num_draws,
indexbuf, index_size, index_offset, instance_count,
dispatch_prim_discard_cs, original_index_size);
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
(sctx, info, indirect, draws, num_draws, indexbuf, index_size,
index_offset, instance_count, dispatch_prim_discard_cs,
original_index_size);
/* <-- CUs are busy here. */
/* Start prefetches after the draw has been started. Both will run
* in parallel, but starting the draw first is more important.
*/
if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
if (GFX_VERSION >= GFX7 && sctx->prefetch_L2_mask)
cik_emit_prefetch_L2(sctx, false);
} else {
/* If we don't wait for idle, start prefetches first, then set
@@ -2364,31 +2379,36 @@ static void si_draw_vbo(struct pipe_context *ctx,
sctx->emit_cache_flush(sctx);
/* Only prefetch the API VS and VBO descriptors. */
if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
if (GFX_VERSION >= GFX7 && sctx->prefetch_L2_mask)
cik_emit_prefetch_L2(sctx, true);
si_emit_all_states(sctx, info, indirect, prim, instance_count, min_direct_count,
primitive_restart, masked_atoms);
si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
(sctx, info, indirect, prim, instance_count, min_direct_count,
primitive_restart, masked_atoms);
if (gfx9_scissor_bug &&
if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
(sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
sctx->atoms.s.scissors.emit(sctx);
sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
}
assert(sctx->dirty_atoms == 0);
si_emit_draw_packets(sctx, info, indirect, draws, num_draws,
indexbuf, index_size, index_offset, instance_count,
dispatch_prim_discard_cs, original_index_size);
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
(sctx, info, indirect, draws, num_draws, indexbuf, index_size,
index_offset, instance_count,
dispatch_prim_discard_cs, original_index_size);
/* Prefetch the remaining shaders after the draw has been
* started. */
if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
if (GFX_VERSION >= GFX7 && sctx->prefetch_L2_mask)
cik_emit_prefetch_L2(sctx, false);
}
/* Clear the context roll flag after the draw call. */
sctx->context_roll = false;
/* Clear the context roll flag after the draw call.
* Only used by the gfx9 scissor bug.
*/
if (GFX_VERSION == GFX9)
sctx->context_roll = false;
if (unlikely(sctx->current_saved_cs)) {
si_trace_emit(sctx);
@@ -2397,7 +2417,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
/* Workaround for a VGT hang when streamout is enabled.
* It must be done after drawing. */
if ((sctx->family == CHIP_HAWAII || sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI) &&
if ((GFX_VERSION == GFX7 || GFX_VERSION == GFX8) &&
(sctx->family == CHIP_HAWAII || sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI) &&
si_get_strmout_en(sctx)) {
sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
}
@@ -2457,7 +2478,7 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
sctx->vertex_buffer_pointer_dirty = false;
sctx->vertex_buffer_user_sgprs_dirty = false;
si_draw_vbo(pipe, &info, NULL, &draw, 1);
pipe->draw_vbo(pipe, &info, NULL, &draw, 1);
}
extern "C"
@@ -2475,10 +2496,67 @@ void si_trace_emit(struct si_context *sctx)
u_log_flush(sctx->log);
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
static void si_init_draw_vbo(struct si_context *sctx)
{
/* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. */
if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX7)
return;
if (NGG && GFX_VERSION < GFX10)
return;
sctx->draw_vbo[GFX_VERSION - GFX6][HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] =
si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, ALLOW_PRIM_DISCARD_CS>;
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS>
static void si_init_draw_vbo_all_internal_options(struct si_context *sctx)
{
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_ON>(sctx);
}
template <chip_class GFX_VERSION>
static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
{
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_OFF>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_ON>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_OFF>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_ON>(sctx);
}
static void si_init_draw_vbo_all_families(struct si_context *sctx)
{
si_init_draw_vbo_all_pipeline_options<GFX6>(sctx);
si_init_draw_vbo_all_pipeline_options<GFX7>(sctx);
si_init_draw_vbo_all_pipeline_options<GFX8>(sctx);
si_init_draw_vbo_all_pipeline_options<GFX9>(sctx);
si_init_draw_vbo_all_pipeline_options<GFX10>(sctx);
si_init_draw_vbo_all_pipeline_options<GFX10_3>(sctx);
}
static void si_invalid_draw_vbo(struct pipe_context *pipe,
const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count *draws,
unsigned num_draws)
{
unreachable("vertex shader not bound");
}
extern "C"
void si_init_draw_functions(struct si_context *sctx)
{
sctx->b.draw_vbo = si_draw_vbo;
si_init_draw_vbo_all_families(sctx);
/* Bind a fake draw_vbo, so that draw_vbo isn't NULL, which would skip
* initialization of callbacks in upper layers (such as u_threaded_context).
*/
sctx->b.draw_vbo = si_invalid_draw_vbo;
sctx->blitter->draw_rectangle = si_draw_rectangle;
si_init_ia_multi_vgt_param_table(sctx);


@@ -3011,6 +3011,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
si_shader_change_notify(sctx);
si_update_common_shader_state(sctx, sel, PIPE_SHADER_VERTEX);
si_select_draw_vbo(sctx);
si_update_vs_viewport_state(sctx);
si_update_streamout_state(sctx);
si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
@@ -3059,6 +3060,7 @@ bool si_update_ngg(struct si_context *sctx)
sctx->ngg = new_ngg;
sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
si_select_draw_vbo(sctx);
return true;
}
return false;
@@ -3081,6 +3083,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
si_update_common_shader_state(sctx, sel, PIPE_SHADER_GEOMETRY);
si_select_draw_vbo(sctx);
sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
ngg_changed = si_update_ngg(sctx);
@@ -3132,6 +3135,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
si_update_tess_uses_prim_id(sctx);
si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL);
si_select_draw_vbo(sctx);
sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
bool ngg_changed = si_update_ngg(sctx);