radeonsi: fix small primitive culling with MSAA force-disabled and smoothing

The problem was that the shader constants were based only on the framebuffer
sample count. They ignored the multisample enable state and the line/polygon
smoothing state. Smoothing uses MSAA rasterization that only sets SampleMaskIn
to get the coverage for alpha-blended smoothing (the PS epilog computes
the alpha channel from SampleMaskIn, and blending generates the AA results).

- This is a complete rework that adds a new state for NGG cull constants.
- It fixes the same thing for the prim discard compute shader.
- It documents how VS_STATE.SMALL_PRIM_PRECISION is encoded.

It fixes blue corruption in Unigine Heaven with MSAA and Medium details
or better.

Fixes: 7648060dc0 - radeonsi: enable NGG culling by default on gfx10.3 dGPUs

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8022>
This commit is contained in:
Marek Olšák 2020-12-09 19:18:37 -05:00
parent 836b9e1d88
commit dffc27e5e1
7 changed files with 98 additions and 80 deletions

View file

@ -1313,19 +1313,6 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
desc[10] = fui(cull_info.translate[0]);
desc[11] = fui(cull_info.translate[1]);
/* Better subpixel precision increases the efficiency of small
* primitive culling. */
unsigned num_samples = sctx->framebuffer.nr_samples;
unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
float small_prim_cull_precision;
if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
small_prim_cull_precision = num_samples / 4096.0;
else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
small_prim_cull_precision = num_samples / 1024.0;
else
small_prim_cull_precision = num_samples / 256.0;
/* Set user data SGPRs. */
/* This can't be greater than 14 if we want the fastest launch rate. */
unsigned user_sgprs = 13;
@ -1489,7 +1476,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
radeon_emit(cs, info->restart_index);
/* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
radeon_emit(cs, fui(small_prim_cull_precision));
radeon_emit(cs, fui(cull_info.small_prim_precision));
} else {
assert(VERTEX_COUNTER_GDS_MODE == 2);
/* Only update the SGPRs that changed. */

View file

@ -452,15 +452,14 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
ctx->framebuffer.dirty_zsbuf = true;
}
/* This should always be marked as dirty to set the framebuffer scissor
* at least.
*
* Even with shadowed registers, we have to add buffers to the buffer list.
* All of these do that.
/* Even with shadowed registers, we have to add buffers to the buffer list.
* These atoms are the only ones that add buffers.
*/
si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
if (ctx->screen->use_ngg_culling)
si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
if (first_cs || !ctx->shadowed_regs) {
/* These don't add any buffers, so skip them with shadowing. */
@ -490,6 +489,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
/* Invalidate various draw states so that they are emitted before
* the first draw call. */
@ -534,7 +534,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
assert(!ctx->gfx_cs.prev_dw);
ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
ctx->prim_discard_compute_ib_initialized = false;
/* Compute-based primitive discard:

View file

@ -902,6 +902,7 @@ struct si_saved_cs {
/* Constants consumed by small primitive culling.
 *
 * si_get_small_prim_cull_info fills this in. The whole struct is compared
 * with memcmp and uploaded with u_upload_data using sizeof(info), so the
 * field order and the absence of padding are part of its contract.
 */
struct si_small_prim_cull_info {
   /* Viewport 0 scale (X, Y) as adjusted by si_get_small_prim_cull_info. */
   float scale[2];
   /* Viewport 0 translate (X, Y) as adjusted by si_get_small_prim_cull_info. */
   float translate[2];
   /* Number of coverage samples divided by the subpixel resolution of the
    * quant mode (num_samples / 4096.0, / 1024.0 or / 256.0).
    */
   float small_prim_precision;
};
typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
@ -1151,7 +1152,6 @@ struct si_context {
struct si_small_prim_cull_info last_small_prim_cull_info;
struct si_resource *small_prim_cull_info_buf;
uint64_t small_prim_cull_info_address;
bool small_prim_cull_info_dirty;
/* Scratch buffer */
struct si_resource *scratch_buffer;
@ -1525,7 +1525,6 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
const struct pipe_video_buffer *tmpl);
/* si_viewport.c */
void si_update_ngg_small_prim_precision(struct si_context *ctx);
void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out);
void si_update_vs_viewport_state(struct si_context *ctx);
void si_init_viewport_functions(struct si_context *ctx);
@ -1950,6 +1949,20 @@ static inline void si_select_draw_vbo(struct si_context *sctx)
assert(sctx->b.draw_vbo);
}
/* Return the number of samples that the rasterizer uses.
 *
 * - The framebuffer sample count, when the framebuffer has more than one
 *   sample and the queued rasterizer state has multisample_enable set.
 * - SI_NUM_SMOOTH_AA_SAMPLES, when smoothing is enabled instead.
 * - 1 otherwise.
 */
static inline unsigned si_get_num_coverage_samples(struct si_context *sctx)
{
   const unsigned fb_samples = sctx->framebuffer.nr_samples;

   if (fb_samples > 1 && sctx->queued.named.rasterizer->multisample_enable)
      return fb_samples;

   /* Note that smoothing_enabled is set by si_update_shaders. */
   return sctx->smoothing_enabled ? SI_NUM_SMOOTH_AA_SAMPLES : 1;
}
/* Print an error to stderr, prefixed with "EE", the source file, the line
 * number and the enclosing function name. fmt must be a string literal
 * because it is concatenated with the prefix. Expands to the fprintf call,
 * so it evaluates to fprintf's return value.
 */
#define PRINT_ERR(fmt, ...)                                                                        \
   fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__)

View file

@ -989,6 +989,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
/* Update the small primitive filter workaround if necessary. */
if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
/* NGG cull state uses multisample_enable. */
if (sctx->screen->use_ngg_culling)
si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
}
sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
@ -2827,10 +2831,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
si_update_ps_colorbuf0_slot(sctx);
si_update_poly_offset_state(sctx);
si_update_ngg_small_prim_precision(sctx);
si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
/* NGG cull state uses the sample count. */
if (sctx->screen->use_ngg_culling)
si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
if (sctx->screen->dpbb_allowed)
si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
@ -3432,8 +3439,9 @@ static void si_emit_msaa_config(struct si_context *sctx)
* EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry
* EQAA 2s 2z 2f = 2x MSAA
*/
coverage_samples = color_samples = z_samples = si_get_num_coverage_samples(sctx);
if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
coverage_samples = sctx->framebuffer.nr_samples;
color_samples = sctx->framebuffer.nr_color_samples;
if (sctx->framebuffer.state.zsbuf) {
@ -3442,10 +3450,6 @@ static void si_emit_msaa_config(struct si_context *sctx)
} else {
z_samples = coverage_samples;
}
} else if (sctx->smoothing_enabled) {
coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
} else {
coverage_samples = color_samples = z_samples = 1;
}
/* Required by OpenGL line rasterization.

View file

@ -232,6 +232,7 @@ union si_state_atoms {
struct si_atom scratch_state;
struct si_atom window_rectangles;
struct si_atom shader_query;
struct si_atom ngg_cull_state;
} s;
struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
};

View file

@ -4090,6 +4090,10 @@ bool si_update_shaders(struct si_context *sctx)
sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
/* NGG cull state uses smoothing_enabled. */
if (sctx->screen->use_ngg_culling)
si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
if (sctx->chip_class == GFX6)
si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);

View file

@ -28,34 +28,13 @@
#define SI_MAX_SCISSOR 16384
/* Update VS_STATE.SMALL_PRIM_PRECISION for NGG culling.
 *
 * The precision is the framebuffer sample count divided by the subpixel
 * resolution selected by the quant mode of viewport 0 (4096, 1024, or 256
 * for any other mode). Only fui(precision) >> 23 is stored in the VS state
 * field (presumably the sign/exponent bits of the float; fui is defined
 * elsewhere).
 *
 * Does nothing when the screen doesn't use NGG culling.
 */
void si_update_ngg_small_prim_precision(struct si_context *ctx)
{
   if (ctx->screen->use_ngg_culling) {
      const unsigned sample_count = ctx->framebuffer.nr_samples;
      const unsigned mode = ctx->viewports.as_scissor[0].quant_mode;

      /* Number of subpixel steps per pixel for the current quant mode. */
      const double subpixel_steps =
         mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH ? 4096.0 :
         mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH ? 1024.0 : 256.0;

      const float small_prim_precision = sample_count / subpixel_steps;

      /* Clear the old field value, then insert the new one. */
      ctx->current_vs_state =
         (ctx->current_vs_state & C_VS_STATE_SMALL_PRIM_PRECISION) |
         S_VS_STATE_SMALL_PRIM_PRECISION(fui(small_prim_precision) >> 23);
   }
}
void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out)
{
/* This is needed by the small primitive culling, because it's done
* in screen space.
*/
struct si_small_prim_cull_info info;
unsigned num_samples = sctx->framebuffer.nr_samples;
unsigned num_samples = si_get_num_coverage_samples(sctx);
assert(num_samples >= 1);
info.scale[0] = sctx->viewports.states[0].scale[0];
@ -85,9 +64,64 @@ void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_c
info.scale[i] *= num_samples;
info.translate[i] *= num_samples;
}
/* Better subpixel precision increases the efficiency of small
* primitive culling. (more precision means a tighter bounding box
* around primitives and more accurate elimination)
*/
unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
info.small_prim_precision = num_samples / 4096.0;
else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
info.small_prim_precision = num_samples / 1024.0;
else
info.small_prim_precision = num_samples / 256.0;
*out = info;
}
/* Emit the NGG cull state.
 *
 * Steps, in order:
 * 1. Compute the current small primitive cull constants with
 *    si_get_small_prim_cull_info.
 * 2. Upload them to a constant buffer if no buffer exists yet or if they
 *    differ from the last uploaded copy.
 * 3. Add the buffer to the gfx CS buffer list and write its address
 *    (shifted right by 8) to SPI_SHADER_PGM_LO_GS.
 * 4. Update the SMALL_PRIM_PRECISION field of current_vs_state.
 *
 * Must only be called when the screen uses NGG culling (asserted).
 */
static void si_emit_cull_state(struct si_context *sctx)
{
assert(sctx->screen->use_ngg_culling);
struct si_small_prim_cull_info info;
si_get_small_prim_cull_info(sctx, &info);
/* Re-upload only when there is no buffer yet or the constants changed
 * since the last upload.
 */
if (!sctx->small_prim_cull_info_buf ||
memcmp(&info, &sctx->last_small_prim_cull_info, sizeof(info))) {
unsigned offset = 0;
/* NOTE(review): the result of u_upload_data is not checked before
 * small_prim_cull_info_buf is dereferenced below - confirm whether the
 * buffer can be NULL after a failed upload.
 */
/* Align to 256, because the address is shifted by 8 bits. */
u_upload_data(sctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
(struct pipe_resource **)&sctx->small_prim_cull_info_buf);
sctx->small_prim_cull_info_address = sctx->small_prim_cull_info_buf->gpu_address + offset;
sctx->last_small_prim_cull_info = info;
}
/* The buffer is added to the buffer list and the register is written on
 * every emit, not only after a new upload.
 */
/* This will end up in SGPR6 as (value << 8), shifted by the hw. */
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf,
RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
radeon_set_sh_reg(&sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
sctx->small_prim_cull_info_address >> 8);
/* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling.
*
* small_prim_precision is 1 / 2^n. We only need n between 5 (1/32) and 12 (1/4096).
* Such a floating point value can be packed into 4 bits as follows:
* If we pass the first 4 bits of the exponent to the shader and set the next 3 bits
* to 1, we'll get the number exactly because all other bits are always 0. See:
* 1
* value = (0x70 | value.exponent[0:3]) << 23 = ------------------------------
* 2 ^ (15 - value.exponent[0:3])
*
* So pass only the first 4 bits of the float exponent to the shader.
*/
/* Clear the old field value first, then insert the new one. */
sctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
sctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(info.small_prim_precision) >> 23);
}
static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
unsigned num_scissors, const struct pipe_scissor_state *state)
{
@ -330,8 +364,6 @@ static void si_emit_guardband(struct si_context *ctx)
S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode));
if (initial_cdw != ctx->gfx_cs.current.cdw)
ctx->context_roll = true;
si_update_ngg_small_prim_precision(ctx);
}
static void si_emit_scissors(struct si_context *ctx)
@ -430,6 +462,10 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo
if (start_slot == 0) {
ctx->viewports.y_inverted =
-state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1];
/* NGG cull state uses the viewport and quant mode. */
if (ctx->screen->use_ngg_culling)
si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
}
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
@ -454,33 +490,6 @@ static void si_emit_viewports(struct si_context *ctx)
struct radeon_cmdbuf *cs = &ctx->gfx_cs;
struct pipe_viewport_state *states = ctx->viewports.states;
if (ctx->screen->use_ngg_culling) {
/* Set the viewport info for small primitive culling. */
struct si_small_prim_cull_info info;
si_get_small_prim_cull_info(ctx, &info);
if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
unsigned offset = 0;
/* Align to 256, because the address is shifted by 8 bits. */
u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
(struct pipe_resource **)&ctx->small_prim_cull_info_buf);
ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset;
ctx->last_small_prim_cull_info = info;
ctx->small_prim_cull_info_dirty = true;
}
if (ctx->small_prim_cull_info_dirty) {
/* This will end up in SGPR6 as (value << 8), shifted by the hw. */
radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->small_prim_cull_info_buf,
RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
radeon_set_sh_reg(&ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
ctx->small_prim_cull_info_address >> 8);
ctx->small_prim_cull_info_dirty = false;
}
}
/* The simple case: Only 1 viewport is active. */
if (!ctx->vs_writes_viewport_index) {
radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
@ -655,6 +664,7 @@ void si_init_viewport_functions(struct si_context *ctx)
ctx->atoms.s.scissors.emit = si_emit_scissors;
ctx->atoms.s.viewports.emit = si_emit_viewport_states;
ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
ctx->atoms.s.ngg_cull_state.emit = si_emit_cull_state;
ctx->b.set_scissor_states = si_set_scissor_states;
ctx->b.set_viewport_states = si_set_viewport_states;