mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 19:40:10 +01:00
radeonsi: fix small primitive culling with MSAA force-disabled and smoothing
The problem was that the shader constants were based only on the framebuffer
sample count. They ignored the multisample enable state and the line/polygon
smoothing state. Smoothing uses MSAA rasterization solely to obtain coverage
in SampleMaskIn for alpha-blended smoothing (the PS epilog computes the alpha
channel from SampleMaskIn, and blending generates the AA results).
- This is a complete rework that adds a new state for NGG cull constants.
- It fixes the same thing for the prim discard compute shader.
- It documents how VS_STATE.SMALL_PRIM_PRECISION is encoded.
It fixes blue corruption in Unigine Heaven with MSAA and Medium details
or better.
Fixes: 7648060dc0 - radeonsi: enable NGG culling by default on gfx10.3 dGPUs
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8022>
This commit is contained in:
parent
836b9e1d88
commit
dffc27e5e1
7 changed files with 98 additions and 80 deletions
|
|
@ -1313,19 +1313,6 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
desc[10] = fui(cull_info.translate[0]);
|
||||
desc[11] = fui(cull_info.translate[1]);
|
||||
|
||||
/* Better subpixel precision increases the efficiency of small
|
||||
* primitive culling. */
|
||||
unsigned num_samples = sctx->framebuffer.nr_samples;
|
||||
unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
|
||||
float small_prim_cull_precision;
|
||||
|
||||
if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
|
||||
small_prim_cull_precision = num_samples / 4096.0;
|
||||
else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
|
||||
small_prim_cull_precision = num_samples / 1024.0;
|
||||
else
|
||||
small_prim_cull_precision = num_samples / 256.0;
|
||||
|
||||
/* Set user data SGPRs. */
|
||||
/* This can't be greater than 14 if we want the fastest launch rate. */
|
||||
unsigned user_sgprs = 13;
|
||||
|
|
@ -1489,7 +1476,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
|
|||
radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
|
||||
radeon_emit(cs, info->restart_index);
|
||||
/* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
|
||||
radeon_emit(cs, fui(small_prim_cull_precision));
|
||||
radeon_emit(cs, fui(cull_info.small_prim_precision));
|
||||
} else {
|
||||
assert(VERTEX_COUNTER_GDS_MODE == 2);
|
||||
/* Only update the SGPRs that changed. */
|
||||
|
|
|
|||
|
|
@ -452,15 +452,14 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
|
|||
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
|
||||
ctx->framebuffer.dirty_zsbuf = true;
|
||||
}
|
||||
/* This should always be marked as dirty to set the framebuffer scissor
|
||||
* at least.
|
||||
*
|
||||
* Even with shadowed registers, we have to add buffers to the buffer list.
|
||||
* All of these do that.
|
||||
|
||||
/* Even with shadowed registers, we have to add buffers to the buffer list.
|
||||
* These atoms are the only ones that add buffers.
|
||||
*/
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
|
||||
if (ctx->screen->use_ngg_culling)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
|
||||
|
||||
if (first_cs || !ctx->shadowed_regs) {
|
||||
/* These don't add any buffers, so skip them with shadowing. */
|
||||
|
|
@ -490,6 +489,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
|
|||
si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
|
||||
|
||||
/* Invalidate various draw states so that they are emitted before
|
||||
* the first draw call. */
|
||||
|
|
@ -534,7 +534,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
|
|||
|
||||
assert(!ctx->gfx_cs.prev_dw);
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
|
||||
ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
|
||||
ctx->prim_discard_compute_ib_initialized = false;
|
||||
|
||||
/* Compute-based primitive discard:
|
||||
|
|
|
|||
|
|
@ -902,6 +902,7 @@ struct si_saved_cs {
|
|||
|
||||
struct si_small_prim_cull_info {
|
||||
float scale[2], translate[2];
|
||||
float small_prim_precision;
|
||||
};
|
||||
|
||||
typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
|
||||
|
|
@ -1151,7 +1152,6 @@ struct si_context {
|
|||
struct si_small_prim_cull_info last_small_prim_cull_info;
|
||||
struct si_resource *small_prim_cull_info_buf;
|
||||
uint64_t small_prim_cull_info_address;
|
||||
bool small_prim_cull_info_dirty;
|
||||
|
||||
/* Scratch buffer */
|
||||
struct si_resource *scratch_buffer;
|
||||
|
|
@ -1525,7 +1525,6 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
|
|||
const struct pipe_video_buffer *tmpl);
|
||||
|
||||
/* si_viewport.c */
|
||||
void si_update_ngg_small_prim_precision(struct si_context *ctx);
|
||||
void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out);
|
||||
void si_update_vs_viewport_state(struct si_context *ctx);
|
||||
void si_init_viewport_functions(struct si_context *ctx);
|
||||
|
|
@ -1950,6 +1949,20 @@ static inline void si_select_draw_vbo(struct si_context *sctx)
|
|||
assert(sctx->b.draw_vbo);
|
||||
}
|
||||
|
||||
/* Return the number of samples that the rasterizer uses. */
|
||||
static inline unsigned si_get_num_coverage_samples(struct si_context *sctx)
|
||||
{
|
||||
if (sctx->framebuffer.nr_samples > 1 &&
|
||||
sctx->queued.named.rasterizer->multisample_enable)
|
||||
return sctx->framebuffer.nr_samples;
|
||||
|
||||
/* Note that smoothing_enabled is set by si_update_shaders. */
|
||||
if (sctx->smoothing_enabled)
|
||||
return SI_NUM_SMOOTH_AA_SAMPLES;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
#define PRINT_ERR(fmt, args...) \
|
||||
fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
|
||||
|
||||
|
|
|
|||
|
|
@ -989,6 +989,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
|
|||
/* Update the small primitive filter workaround if necessary. */
|
||||
if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
|
||||
|
||||
/* NGG cull state uses multisample_enable. */
|
||||
if (sctx->screen->use_ngg_culling)
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
|
||||
}
|
||||
|
||||
sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
|
||||
|
|
@ -2827,10 +2831,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
|
|||
|
||||
si_update_ps_colorbuf0_slot(sctx);
|
||||
si_update_poly_offset_state(sctx);
|
||||
si_update_ngg_small_prim_precision(sctx);
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
|
||||
|
||||
/* NGG cull state uses the sample count. */
|
||||
if (sctx->screen->use_ngg_culling)
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
|
||||
|
||||
if (sctx->screen->dpbb_allowed)
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
|
||||
|
||||
|
|
@ -3432,8 +3439,9 @@ static void si_emit_msaa_config(struct si_context *sctx)
|
|||
* EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry
|
||||
* EQAA 2s 2z 2f = 2x MSAA
|
||||
*/
|
||||
coverage_samples = color_samples = z_samples = si_get_num_coverage_samples(sctx);
|
||||
|
||||
if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
|
||||
coverage_samples = sctx->framebuffer.nr_samples;
|
||||
color_samples = sctx->framebuffer.nr_color_samples;
|
||||
|
||||
if (sctx->framebuffer.state.zsbuf) {
|
||||
|
|
@ -3442,10 +3450,6 @@ static void si_emit_msaa_config(struct si_context *sctx)
|
|||
} else {
|
||||
z_samples = coverage_samples;
|
||||
}
|
||||
} else if (sctx->smoothing_enabled) {
|
||||
coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
|
||||
} else {
|
||||
coverage_samples = color_samples = z_samples = 1;
|
||||
}
|
||||
|
||||
/* Required by OpenGL line rasterization.
|
||||
|
|
|
|||
|
|
@ -232,6 +232,7 @@ union si_state_atoms {
|
|||
struct si_atom scratch_state;
|
||||
struct si_atom window_rectangles;
|
||||
struct si_atom shader_query;
|
||||
struct si_atom ngg_cull_state;
|
||||
} s;
|
||||
struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
|
||||
};
|
||||
|
|
|
|||
|
|
@ -4090,6 +4090,10 @@ bool si_update_shaders(struct si_context *sctx)
|
|||
sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
|
||||
|
||||
/* NGG cull state uses smoothing_enabled. */
|
||||
if (sctx->screen->use_ngg_culling)
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
|
||||
|
||||
if (sctx->chip_class == GFX6)
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
|
||||
|
||||
|
|
|
|||
|
|
@ -28,34 +28,13 @@
|
|||
|
||||
#define SI_MAX_SCISSOR 16384
|
||||
|
||||
void si_update_ngg_small_prim_precision(struct si_context *ctx)
|
||||
{
|
||||
if (!ctx->screen->use_ngg_culling)
|
||||
return;
|
||||
|
||||
/* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
|
||||
unsigned num_samples = ctx->framebuffer.nr_samples;
|
||||
unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
|
||||
float precision;
|
||||
|
||||
if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
|
||||
precision = num_samples / 4096.0;
|
||||
else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
|
||||
precision = num_samples / 1024.0;
|
||||
else
|
||||
precision = num_samples / 256.0;
|
||||
|
||||
ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
|
||||
ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
|
||||
}
|
||||
|
||||
void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out)
|
||||
{
|
||||
/* This is needed by the small primitive culling, because it's done
|
||||
* in screen space.
|
||||
*/
|
||||
struct si_small_prim_cull_info info;
|
||||
unsigned num_samples = sctx->framebuffer.nr_samples;
|
||||
unsigned num_samples = si_get_num_coverage_samples(sctx);
|
||||
assert(num_samples >= 1);
|
||||
|
||||
info.scale[0] = sctx->viewports.states[0].scale[0];
|
||||
|
|
@ -85,9 +64,64 @@ void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_c
|
|||
info.scale[i] *= num_samples;
|
||||
info.translate[i] *= num_samples;
|
||||
}
|
||||
|
||||
/* Better subpixel precision increases the efficiency of small
|
||||
* primitive culling. (more precision means a tighter bounding box
|
||||
* around primitives and more accurate elimination)
|
||||
*/
|
||||
unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
|
||||
|
||||
if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
|
||||
info.small_prim_precision = num_samples / 4096.0;
|
||||
else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
|
||||
info.small_prim_precision = num_samples / 1024.0;
|
||||
else
|
||||
info.small_prim_precision = num_samples / 256.0;
|
||||
|
||||
*out = info;
|
||||
}
|
||||
|
||||
static void si_emit_cull_state(struct si_context *sctx)
|
||||
{
|
||||
assert(sctx->screen->use_ngg_culling);
|
||||
|
||||
struct si_small_prim_cull_info info;
|
||||
si_get_small_prim_cull_info(sctx, &info);
|
||||
|
||||
if (!sctx->small_prim_cull_info_buf ||
|
||||
memcmp(&info, &sctx->last_small_prim_cull_info, sizeof(info))) {
|
||||
unsigned offset = 0;
|
||||
|
||||
/* Align to 256, because the address is shifted by 8 bits. */
|
||||
u_upload_data(sctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
|
||||
(struct pipe_resource **)&sctx->small_prim_cull_info_buf);
|
||||
|
||||
sctx->small_prim_cull_info_address = sctx->small_prim_cull_info_buf->gpu_address + offset;
|
||||
sctx->last_small_prim_cull_info = info;
|
||||
}
|
||||
|
||||
/* This will end up in SGPR6 as (value << 8), shifted by the hw. */
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf,
|
||||
RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
|
||||
radeon_set_sh_reg(&sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
|
||||
sctx->small_prim_cull_info_address >> 8);
|
||||
|
||||
/* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling.
|
||||
*
|
||||
* small_prim_precision is 1 / 2^n. We only need n between 5 (1/32) and 12 (1/4096).
|
||||
* Such a floating point value can be packed into 4 bits as follows:
|
||||
* If we pass the first 4 bits of the exponent to the shader and set the next 3 bits
|
||||
* to 1, we'll get the number exactly because all other bits are always 0. See:
|
||||
* value = (0x70 | value.exponent[0:3]) << 23 = 1 / 2 ^ (15 - value.exponent[0:3])
|
||||
*
|
||||
* So pass only the first 4 bits of the float exponent to the shader.
|
||||
*/
|
||||
sctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
|
||||
sctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(info.small_prim_precision) >> 23);
|
||||
}
|
||||
|
||||
static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
|
||||
unsigned num_scissors, const struct pipe_scissor_state *state)
|
||||
{
|
||||
|
|
@ -330,8 +364,6 @@ static void si_emit_guardband(struct si_context *ctx)
|
|||
S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode));
|
||||
if (initial_cdw != ctx->gfx_cs.current.cdw)
|
||||
ctx->context_roll = true;
|
||||
|
||||
si_update_ngg_small_prim_precision(ctx);
|
||||
}
|
||||
|
||||
static void si_emit_scissors(struct si_context *ctx)
|
||||
|
|
@ -430,6 +462,10 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo
|
|||
if (start_slot == 0) {
|
||||
ctx->viewports.y_inverted =
|
||||
-state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1];
|
||||
|
||||
/* NGG cull state uses the viewport and quant mode. */
|
||||
if (ctx->screen->use_ngg_culling)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
|
||||
}
|
||||
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
|
||||
|
|
@ -454,33 +490,6 @@ static void si_emit_viewports(struct si_context *ctx)
|
|||
struct radeon_cmdbuf *cs = &ctx->gfx_cs;
|
||||
struct pipe_viewport_state *states = ctx->viewports.states;
|
||||
|
||||
if (ctx->screen->use_ngg_culling) {
|
||||
/* Set the viewport info for small primitive culling. */
|
||||
struct si_small_prim_cull_info info;
|
||||
si_get_small_prim_cull_info(ctx, &info);
|
||||
|
||||
if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
|
||||
unsigned offset = 0;
|
||||
|
||||
/* Align to 256, because the address is shifted by 8 bits. */
|
||||
u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
|
||||
(struct pipe_resource **)&ctx->small_prim_cull_info_buf);
|
||||
|
||||
ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset;
|
||||
ctx->last_small_prim_cull_info = info;
|
||||
ctx->small_prim_cull_info_dirty = true;
|
||||
}
|
||||
|
||||
if (ctx->small_prim_cull_info_dirty) {
|
||||
/* This will end up in SGPR6 as (value << 8), shifted by the hw. */
|
||||
radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->small_prim_cull_info_buf,
|
||||
RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
|
||||
radeon_set_sh_reg(&ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
|
||||
ctx->small_prim_cull_info_address >> 8);
|
||||
ctx->small_prim_cull_info_dirty = false;
|
||||
}
|
||||
}
|
||||
|
||||
/* The simple case: Only 1 viewport is active. */
|
||||
if (!ctx->vs_writes_viewport_index) {
|
||||
radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
|
||||
|
|
@ -655,6 +664,7 @@ void si_init_viewport_functions(struct si_context *ctx)
|
|||
ctx->atoms.s.scissors.emit = si_emit_scissors;
|
||||
ctx->atoms.s.viewports.emit = si_emit_viewport_states;
|
||||
ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
|
||||
ctx->atoms.s.ngg_cull_state.emit = si_emit_cull_state;
|
||||
|
||||
ctx->b.set_scissor_states = si_set_scissor_states;
|
||||
ctx->b.set_viewport_states = si_set_viewport_states;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue