radeonsi: fix small primitive culling with MSAA force-disabled and smoothing

The problem was that the shader constants were based on the framebuffer sample count and ignored the multisample enable state and the line/polygon smoothing state, which uses MSAA rasterization that only sets SampleMaskIn to get the coverage for alpha-blended smoothing (the PS epilog computes the alpha channel from SampleMaskIn and blending generates the AA results). - This is a complete rework that adds a new state for NGG cull constants. - It fixes the same thing for the prim discard compute shader. - It documents how VS_STATE.SMALL_PRIM_PRECISION is encoded. It fixes blue corruption in Unigine Heaven with MSAA and Medium details or better. Fixes: 7648060dc0 - radeonsi: enable NGG culling by default on gfx10.3 dGPUs Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8022>
2025-12-24 19:40:10 +01:00 · 2020-12-09 19:18:37 -05:00 · 2020-12-09 19:18:37 -05:00 · dffc27e5e1
commit dffc27e5e1
parent 836b9e1d88
7 changed files with 98 additions and 80 deletions
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@ -1313,19 +1313,6 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
   desc[10] = fui(cull_info.translate[0]);
   desc[11] = fui(cull_info.translate[1]);

-   /* Better subpixel precision increases the efficiency of small
-    * primitive culling. */
-   unsigned num_samples = sctx->framebuffer.nr_samples;
-   unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
-   float small_prim_cull_precision;
-
-   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
-      small_prim_cull_precision = num_samples / 4096.0;
-   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
-      small_prim_cull_precision = num_samples / 1024.0;
-   else
-      small_prim_cull_precision = num_samples / 256.0;
-
   /* Set user data SGPRs. */
   /* This can't be greater than 14 if we want the fastest launch rate. */
   unsigned user_sgprs = 13;
@ -1489,7 +1476,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
         radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
         radeon_emit(cs, info->restart_index);
         /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
-         radeon_emit(cs, fui(small_prim_cull_precision));
+         radeon_emit(cs, fui(cull_info.small_prim_precision));
      } else {
         assert(VERTEX_COUNTER_GDS_MODE == 2);
         /* Only update the SGPRs that changed. */
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@ -452,15 +452,14 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
      ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
      ctx->framebuffer.dirty_zsbuf = true;
   }
-   /* This should always be marked as dirty to set the framebuffer scissor
-    * at least.
-    *
-    * Even with shadowed registers, we have to add buffers to the buffer list.
-    * All of these do that.
+
+   /* Even with shadowed registers, we have to add buffers to the buffer list.
+    * These atoms are the only ones that add buffers.
    */
   si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
   si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
-   si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+   if (ctx->screen->use_ngg_culling)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);

   if (first_cs || !ctx->shadowed_regs) {
      /* These don't add any buffers, so skip them with shadowing. */
@ -490,6 +489,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);

      /* Invalidate various draw states so that they are emitted before
       * the first draw call. */
@ -534,7 +534,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)

   assert(!ctx->gfx_cs.prev_dw);
   ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
-   ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
   ctx->prim_discard_compute_ib_initialized = false;

   /* Compute-based primitive discard:
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@ -902,6 +902,7 @@ struct si_saved_cs {

 struct si_small_prim_cull_info {
   float scale[2], translate[2];
+   float small_prim_precision;
 };

 typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
@ -1151,7 +1152,6 @@ struct si_context {
   struct si_small_prim_cull_info last_small_prim_cull_info;
   struct si_resource *small_prim_cull_info_buf;
   uint64_t small_prim_cull_info_address;
-   bool small_prim_cull_info_dirty;

   /* Scratch buffer */
   struct si_resource *scratch_buffer;
@ -1525,7 +1525,6 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
                                                 const struct pipe_video_buffer *tmpl);

 /* si_viewport.c */
-void si_update_ngg_small_prim_precision(struct si_context *ctx);
 void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out);
 void si_update_vs_viewport_state(struct si_context *ctx);
 void si_init_viewport_functions(struct si_context *ctx);
@ -1950,6 +1949,20 @@ static inline void si_select_draw_vbo(struct si_context *sctx)
   assert(sctx->b.draw_vbo);
 }

+/* Return the number of samples that the rasterizer uses. */
+static inline unsigned si_get_num_coverage_samples(struct si_context *sctx)
+{
+   if (sctx->framebuffer.nr_samples > 1 &&
+       sctx->queued.named.rasterizer->multisample_enable)
+      return sctx->framebuffer.nr_samples;
+
+   /* Note that smoothing_enabled is set by si_update_shaders. */
+   if (sctx->smoothing_enabled)
+      return SI_NUM_SMOOTH_AA_SAMPLES;
+
+   return 1;
+}
+
 #define PRINT_ERR(fmt, args...)                                                                    \
   fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)

--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@ -989,6 +989,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
      /* Update the small primitive filter workaround if necessary. */
      if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+
+      /* NGG cull state uses multisample_enable. */
+      if (sctx->screen->use_ngg_culling)
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
   }

   sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
@ -2827,10 +2831,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,

   si_update_ps_colorbuf0_slot(sctx);
   si_update_poly_offset_state(sctx);
-   si_update_ngg_small_prim_precision(sctx);
   si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
   si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);

+   /* NGG cull state uses the sample count. */
+   if (sctx->screen->use_ngg_culling)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
+
   if (sctx->screen->dpbb_allowed)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);

@ -3432,8 +3439,9 @@ static void si_emit_msaa_config(struct si_context *sctx)
    *   EQAA  4s 4z 2f - might look the same as 4x MSAA with low-density geometry
    *   EQAA  2s 2z 2f = 2x MSAA
    */
+   coverage_samples = color_samples = z_samples = si_get_num_coverage_samples(sctx);
+
   if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
-      coverage_samples = sctx->framebuffer.nr_samples;
      color_samples = sctx->framebuffer.nr_color_samples;

      if (sctx->framebuffer.state.zsbuf) {
@ -3442,10 +3450,6 @@ static void si_emit_msaa_config(struct si_context *sctx)
      } else {
         z_samples = coverage_samples;
      }
-   } else if (sctx->smoothing_enabled) {
-      coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
-   } else {
-      coverage_samples = color_samples = z_samples = 1;
   }

   /* Required by OpenGL line rasterization.
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@ -232,6 +232,7 @@ union si_state_atoms {
      struct si_atom scratch_state;
      struct si_atom window_rectangles;
      struct si_atom shader_query;
+      struct si_atom ngg_cull_state;
   } s;
   struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
 };
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@ -4090,6 +4090,10 @@ bool si_update_shaders(struct si_context *sctx)
         sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);

+         /* NGG cull state uses smoothing_enabled. */
+         if (sctx->screen->use_ngg_culling)
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
+
         if (sctx->chip_class == GFX6)
            si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);

--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@ -28,34 +28,13 @@

 #define SI_MAX_SCISSOR 16384

-void si_update_ngg_small_prim_precision(struct si_context *ctx)
-{
-   if (!ctx->screen->use_ngg_culling)
-      return;
-
-   /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
-   unsigned num_samples = ctx->framebuffer.nr_samples;
-   unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
-   float precision;
-
-   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
-      precision = num_samples / 4096.0;
-   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
-      precision = num_samples / 1024.0;
-   else
-      precision = num_samples / 256.0;
-
-   ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
-   ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
-}
-
 void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out)
 {
   /* This is needed by the small primitive culling, because it's done
    * in screen space.
    */
   struct si_small_prim_cull_info info;
-   unsigned num_samples = sctx->framebuffer.nr_samples;
+   unsigned num_samples = si_get_num_coverage_samples(sctx);
   assert(num_samples >= 1);

   info.scale[0] = sctx->viewports.states[0].scale[0];
@ -85,9 +64,64 @@ void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_c
      info.scale[i] *= num_samples;
      info.translate[i] *= num_samples;
   }
+
+   /* Better subpixel precision increases the efficiency of small
+    * primitive culling. (more precision means a tighter bounding box
+    * around primitives and more accurate elimination)
+    */
+   unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
+
+   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+      info.small_prim_precision = num_samples / 4096.0;
+   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+      info.small_prim_precision = num_samples / 1024.0;
+   else
+      info.small_prim_precision = num_samples / 256.0;
+
   *out = info;
 }

+static void si_emit_cull_state(struct si_context *sctx)
+{
+   assert(sctx->screen->use_ngg_culling);
+
+   struct si_small_prim_cull_info info;
+   si_get_small_prim_cull_info(sctx, &info);
+
+   if (!sctx->small_prim_cull_info_buf ||
+       memcmp(&info, &sctx->last_small_prim_cull_info, sizeof(info))) {
+      unsigned offset = 0;
+
+      /* Align to 256, because the address is shifted by 8 bits. */
+      u_upload_data(sctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
+                    (struct pipe_resource **)&sctx->small_prim_cull_info_buf);
+
+      sctx->small_prim_cull_info_address = sctx->small_prim_cull_info_buf->gpu_address + offset;
+      sctx->last_small_prim_cull_info = info;
+   }
+
+   /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
+   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf,
+                             RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+   radeon_set_sh_reg(&sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
+                     sctx->small_prim_cull_info_address >> 8);
+
+   /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling.
+    *
+    * small_prim_precision is 1 / 2^n. We only need n between 5 (1/32) and 12 (1/4096).
+    * Such a floating point value can be packed into 4 bits as follows:
+    * If we pass the first 4 bits of the exponent to the shader and set the next 3 bits
+    * to 1, we'll get the number exactly because all other bits are always 0. See:
+    *                                                               1
+    * value  =  (0x70 | value.exponent[0:3]) << 23  =  ------------------------------
+    *                                                  2 ^ (15 - value.exponent[0:3])
+    *
+    * So pass only the first 4 bits of the float exponent to the shader.
+    */
+   sctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
+   sctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(info.small_prim_precision) >> 23);
+}
+
 static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
                                  unsigned num_scissors, const struct pipe_scissor_state *state)
 {
@ -330,8 +364,6 @@ static void si_emit_guardband(struct si_context *ctx)
         S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode));
   if (initial_cdw != ctx->gfx_cs.current.cdw)
      ctx->context_roll = true;
-
-   si_update_ngg_small_prim_precision(ctx);
 }

 static void si_emit_scissors(struct si_context *ctx)
@ -430,6 +462,10 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo
   if (start_slot == 0) {
      ctx->viewports.y_inverted =
         -state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1];
+
+      /* NGG cull state uses the viewport and quant mode. */
+      if (ctx->screen->use_ngg_culling)
+         si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
   }

   si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
@ -454,33 +490,6 @@ static void si_emit_viewports(struct si_context *ctx)
   struct radeon_cmdbuf *cs = &ctx->gfx_cs;
   struct pipe_viewport_state *states = ctx->viewports.states;

-   if (ctx->screen->use_ngg_culling) {
-      /* Set the viewport info for small primitive culling. */
-      struct si_small_prim_cull_info info;
-      si_get_small_prim_cull_info(ctx, &info);
-
-      if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
-         unsigned offset = 0;
-
-         /* Align to 256, because the address is shifted by 8 bits. */
-         u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
-                       (struct pipe_resource **)&ctx->small_prim_cull_info_buf);
-
-         ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset;
-         ctx->last_small_prim_cull_info = info;
-         ctx->small_prim_cull_info_dirty = true;
-      }
-
-      if (ctx->small_prim_cull_info_dirty) {
-         /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
-         radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->small_prim_cull_info_buf,
-                                   RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
-         radeon_set_sh_reg(&ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
-                           ctx->small_prim_cull_info_address >> 8);
-         ctx->small_prim_cull_info_dirty = false;
-      }
-   }
-
   /* The simple case: Only 1 viewport is active. */
   if (!ctx->vs_writes_viewport_index) {
      radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
@ -655,6 +664,7 @@ void si_init_viewport_functions(struct si_context *ctx)
   ctx->atoms.s.scissors.emit = si_emit_scissors;
   ctx->atoms.s.viewports.emit = si_emit_viewport_states;
   ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
+   ctx->atoms.s.ngg_cull_state.emit = si_emit_cull_state;

   ctx->b.set_scissor_states = si_set_scissor_states;
   ctx->b.set_viewport_states = si_set_viewport_states;