radeonsi: merge pm4 state and atom emit loops into one

This merges both loops in si_draw by tracking which pm4 states are dirty
using the state atom mechanism used for other states. pm4 states now have
to set their own emit function.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
This commit is contained in:
Author: Marek Olšák, 2023-07-16 08:38:17 -04:00 (committed by Marge Bot)
parent c21ce04014
commit 3986f27396
5 changed files with 47 additions and 45 deletions

View file

@ -1033,11 +1033,9 @@ struct si_context {
unsigned last_num_draw_calls;
unsigned flags; /* flush flags */
/* Atoms (direct states). */
/* Atoms (state emit functions). */
union si_state_atoms atoms;
unsigned dirty_atoms; /* mask */
/* PM4 states (precomputed immutable states) */
unsigned dirty_states;
uint64_t dirty_atoms; /* mask */
union si_state queued;
union si_state emitted;
/* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_PACKED*. */
@ -1759,14 +1757,14 @@ static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx,
return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 10;
}
static inline unsigned si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
static inline uint64_t si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
{
return 1 << (atom - sctx->atoms.array);
return 1ull << (atom - sctx->atoms.array);
}
static inline void si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty)
{
unsigned bit = si_get_atom_bit(sctx, atom);
uint64_t bit = si_get_atom_bit(sctx, atom);
if (dirty)
sctx->dirty_atoms |= bit;

View file

@ -316,7 +316,7 @@ void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsi
if (sctx->queued.array[idx] == state) {
sctx->queued.array[idx] = NULL;
sctx->dirty_states &= ~BITFIELD_BIT(idx);
sctx->dirty_atoms &= ~BITFIELD64_BIT(idx);
}
}
@ -361,7 +361,7 @@ void si_pm4_reset_emitted(struct si_context *sctx)
for (unsigned i = 0; i < SI_NUM_STATES; i++) {
if (sctx->queued.array[i])
sctx->dirty_states |= BITFIELD_BIT(i);
sctx->dirty_atoms |= BITFIELD64_BIT(i);
}
}

View file

@ -5413,6 +5413,9 @@ void si_init_state_compute_functions(struct si_context *sctx)
void si_init_state_functions(struct si_context *sctx)
{
for (unsigned i = 0; i < ARRAY_SIZE(sctx->atoms.s.pm4_states); i++)
sctx->atoms.s.pm4_states[i].emit = si_pm4_emit_state;
sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;

View file

@ -176,18 +176,13 @@ union si_state {
};
#define SI_STATE_IDX(name) (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *))
#define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name))
#define SI_STATE_BIT(name) (1ull << SI_STATE_IDX(name))
#define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *))
static inline unsigned si_states_that_always_roll_context(void)
{
return (SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) |
SI_STATE_BIT(poly_offset));
}
union si_state_atoms {
struct si_atoms_s {
/* The order matters. */
/* This must be first. */
struct si_atom pm4_states[SI_NUM_STATES];
struct si_atom render_cond;
struct si_atom streamout_begin;
struct si_atom streamout_enable; /* must be after streamout_begin */
@ -217,15 +212,17 @@ union si_state_atoms {
struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
};
#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom)))
#define SI_ATOM_BIT(name) (1ull << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom)))
#define SI_NUM_ATOMS (sizeof(union si_state_atoms) / sizeof(struct si_atom))
static inline unsigned si_atoms_that_always_roll_context(void)
static inline uint64_t si_atoms_that_always_roll_context(void)
{
return (SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) |
SI_ATOM_BIT(sample_locations) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color) |
SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) |
SI_ATOM_BIT(stencil_ref) | SI_ATOM_BIT(scratch_state) | SI_ATOM_BIT(window_rectangles));
return SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) |
SI_STATE_BIT(poly_offset) |
SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) |
SI_ATOM_BIT(sample_locations) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color) |
SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) |
SI_ATOM_BIT(stencil_ref) | SI_ATOM_BIT(scratch_state) | SI_ATOM_BIT(window_rectangles);
}
struct si_shader_data {
@ -516,9 +513,9 @@ struct si_buffer_resources {
do { \
(sctx)->queued.named.member = (value); \
if (value && value != (sctx)->emitted.named.member) \
(sctx)->dirty_states |= SI_STATE_BIT(member); \
(sctx)->dirty_atoms |= SI_STATE_BIT(member); \
else \
(sctx)->dirty_states &= ~SI_STATE_BIT(member); \
(sctx)->dirty_atoms &= ~SI_STATE_BIT(member); \
} while (0)
/* si_descriptors.c */

View file

@ -1936,28 +1936,33 @@ static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_d
}
ALWAYS_INLINE
static void si_emit_all_states(struct si_context *sctx, unsigned skip_atom_mask)
static void si_emit_all_states(struct si_context *sctx, uint64_t skip_atom_mask)
{
/* Emit state atoms. */
unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
if (mask) {
do {
unsigned i = u_bit_scan(&mask);
sctx->atoms.array[i].emit(sctx, i);
} while (mask);
/* Emit states by calling their emit functions. */
uint64_t dirty = sctx->dirty_atoms & ~skip_atom_mask;
if (dirty) {
sctx->dirty_atoms &= skip_atom_mask;
}
/* Emit states. */
mask = sctx->dirty_states;
if (mask) {
do {
unsigned i = u_bit_scan(&mask);
si_pm4_emit_state(sctx, i);
} while (mask);
/* u_bit_scan64 is too slow on i386. */
if (sizeof(void*) == 8) {
do {
unsigned i = u_bit_scan64(&dirty);
sctx->atoms.array[i].emit(sctx, i);
} while (dirty);
} else {
unsigned dirty_lo = dirty;
unsigned dirty_hi = dirty >> 32;
sctx->dirty_states = 0;
while (dirty_lo) {
unsigned i = u_bit_scan(&dirty_lo);
sctx->atoms.array[i].emit(sctx, i);
}
while (dirty_hi) {
unsigned i = 32 + u_bit_scan(&dirty_hi);
sctx->atoms.array[i].emit(sctx, i);
}
}
}
}
@ -2230,7 +2235,7 @@ static void si_draw(struct pipe_context *ctx,
* It's better to draw before prefetches because we want to start fetching indices before
* shaders. The idea is to minimize the time when the CUs are idle.
*/
unsigned masked_atoms = 0;
uint64_t masked_atoms = 0;
if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) {
/* The render condition state should be emitted after cache flushes. */
masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
@ -2247,8 +2252,7 @@ static void si_draw(struct pipe_context *ctx,
gfx9_scissor_bug = true;
if ((!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) ||
sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
sctx->dirty_states & si_states_that_always_roll_context())
sctx->dirty_atoms & si_atoms_that_always_roll_context())
sctx->context_roll = true;
}