radeonsi: cull against clip planes, clipvertex, clip/cull distances in shader

The downside is that this duplicates shader code for clip/cull distances
in both the position and parameter portions of the shader.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13811>
This commit is contained in:
Marek Olšák 2021-11-07 16:43:13 -05:00 committed by Marge Bot
parent 881c459191
commit 513bd6acca
6 changed files with 109 additions and 14 deletions

View file

@ -554,12 +554,14 @@ enum
/* Byte 0: Boolean ES thread accepted (unculled) flag.
* Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
* Byte 2: TES rel patch ID
* Byte 3: Unused
* Byte 3: 8-bit clip distance mask: 1 means the clip distance is negative.
* The mask from all vertices is AND'ed. If the result is non-zero,
* the primitive is culled.
*/
lds_byte0_accept_flag = 0,
lds_byte1_new_thread_id,
lds_byte2_tes_rel_patch_id,
lds_byte3_unused,
lds_byte3_clipdist_neg_mask,
lds_packed_data = 0, /* lds_byteN_... */
lds_pos_cull_x_div_w,
@ -804,6 +806,37 @@ static void gfx10_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValue
ac_build_endif(&ctx->ac, 0);
}
/**
 * Fold the "distance is negative" flag of one clip/cull distance into the
 * packed per-vertex value.
 *
 * The generated IR compares \p distance against ctx->ac.f32_0 using an
 * ordered less-than, widens the 1-bit result to i32, shifts it to bit
 * (24 + i) and ORs it into \p packed_data. Bit 24 is the lowest bit of
 * byte 3, i.e. the byte referred to as lds_byte3_clipdist_neg_mask.
 *
 * \param ctx          shader context supplying the LLVM builder and constants
 * \param distance     distance value to test
 * \param i            index of the distance; selects the bit inside byte 3
 * \param packed_data  in/out: accumulated packed value
 */
static void add_clipdist_bit(struct si_shader_context *ctx, LLVMValueRef distance, unsigned i,
                             LLVMValueRef *packed_data)
{
   /* Byte 3 of the packed dword begins at bit 24. */
   const unsigned bit_position = 24 + i;

   LLVMValueRef is_negative =
      LLVMBuildFCmp(ctx->ac.builder, LLVMRealOLT, distance, ctx->ac.f32_0, "");
   LLVMValueRef flag = LLVMBuildZExt(ctx->ac.builder, is_negative, ctx->ac.i32, "");
   LLVMValueRef shifted_flag =
      LLVMBuildShl(ctx->ac.builder, flag, LLVMConstInt(ctx->ac.i32, bit_position, 0), "");

   *packed_data = LLVMBuildOr(ctx->ac.builder, *packed_data, shifted_flag, "");
}
/**
 * Derive clip distances from a clip vertex and record their negative flags.
 *
 * The clip vertex is converted into two groups of four distances with
 * si_llvm_clipvertex_to_clipdist(). For every distance index 0..7 whose bit
 * is set in \p clipdist_enable, add_clipdist_bit() is invoked to merge the
 * corresponding flag into \p packed_data.
 *
 * \param ctx              shader context
 * \param clipdist_enable  bitmask of distance indices (bits 0..7) to process
 * \param clipvertex       the four components used as the clip vertex
 * \param packed_data      in/out: accumulated packed value
 * \return true if at least one distance flag was added, false otherwise
 */
static bool add_clipdist_bits_for_clipvertex(struct si_shader_context *ctx,
                                             unsigned clipdist_enable,
                                             LLVMValueRef clipvertex[4],
                                             LLVMValueRef *packed_data)
{
   struct ac_export_args clipdist[2];
   bool any_bit_added = false;

   si_llvm_clipvertex_to_clipdist(ctx, clipdist, clipvertex);

   /* Walk the distances in index order: clipdist[0].out[0..3], then clipdist[1].out[0..3]. */
   for (unsigned reg = 0; reg < 2; reg++) {
      for (unsigned chan = 0; chan < 4; chan++) {
         const unsigned index = reg * 4 + chan;

         if (clipdist_enable & BITFIELD_BIT(index)) {
            add_clipdist_bit(ctx, clipdist[reg].out[chan], index, packed_data);
            any_bit_added = true;
         }
      }
   }

   return any_bit_added;
}
/**
* Cull primitives for NGG VS or TES, then compact vertices, which happens
* before the VS or TES main function. Return values for the main function.
@ -826,10 +859,16 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
(sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es));
LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
LLVMValueRef packed_data = ctx->ac.i32_0;
LLVMValueRef position[4] = {};
unsigned pos_index = 0;
unsigned clip_plane_enable = SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(shader->key.ge.opt.ngg_culling);
unsigned clipdist_enable = (sel->clipdist_mask & clip_plane_enable) | sel->culldist_mask;
bool has_clipdist_mask = false;
for (unsigned i = 0; i < info->num_outputs; i++) {
LLVMValueRef position[4];
LLVMValueRef clipvertex[4];
unsigned base;
switch (info->output_semantic[i]) {
case VARYING_SLOT_POS:
@ -862,12 +901,45 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_x_div_w + chan, 0)));
}
break;
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CLIP_DIST1:
base = info->output_semantic[i] == VARYING_SLOT_CLIP_DIST1 ? 4 : 0;
for (unsigned j = 0; j < 4; j++) {
unsigned index = base + j;
if (!(clipdist_enable & BITFIELD_BIT(index)))
continue;
LLVMValueRef distance = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
add_clipdist_bit(ctx, distance, index, &packed_data);
has_clipdist_mask = true;
}
break;
case VARYING_SLOT_CLIP_VERTEX:
for (unsigned j = 0; j < 4; j++)
clipvertex[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
if (add_clipdist_bits_for_clipvertex(ctx, clipdist_enable, clipvertex, &packed_data))
has_clipdist_mask = true;
break;
}
}
if (clip_plane_enable && !sel->clipdist_mask) {
/* When clip planes are enabled and there are no clip distance outputs,
* we should use user clip planes and cull against the position.
*/
assert(!has_clipdist_mask);
if (add_clipdist_bits_for_clipvertex(ctx, clipdist_enable, position, &packed_data))
has_clipdist_mask = true;
}
/* Initialize the packed data. */
LLVMBuildStore(
builder, ctx->ac.i32_0,
builder, packed_data,
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
ac_build_s_barrier(&ctx->ac);
@ -950,6 +1022,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
{
/* Load positions. */
LLVMValueRef pos[3][4] = {};
LLVMValueRef clipdist_neg_mask = NULL;
for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
for (unsigned chan = 0; chan < 4; chan++) {
unsigned index;
@ -965,8 +1039,25 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
}
if (has_clipdist_mask) {
/* Load and AND clip distance masks. Each bit means whether that clip distance is
* negative. If all masks are AND'ed and the result is 0, the primitive isn't culled
* by clip distances.
*/
LLVMValueRef addr = si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte3_clipdist_neg_mask);
LLVMValueRef mask = LLVMBuildLoad(builder, addr, "");
if (!clipdist_neg_mask)
clipdist_neg_mask = mask;
else
clipdist_neg_mask = LLVMBuildAnd(builder, clipdist_neg_mask, mask, "");
}
}
LLVMValueRef clipdist_accepted =
has_clipdist_mask ? LLVMBuildICmp(builder, LLVMIntEQ, clipdist_neg_mask, ctx->ac.i8_0, "")
: ctx->ac.i1true;
LLVMValueRef vp_scale[2] = {}, vp_translate[2] = {}, small_prim_precision = NULL;
LLVMValueRef clip_half_line_width[2] = {};
@ -1020,7 +1111,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
gs_accepted,
(void*)gs_vtxptr,
};
ac_cull_primitive(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
ac_cull_primitive(&ctx->ac, pos, clipdist_accepted, vp_scale, vp_translate,
small_prim_precision, clip_half_line_width,
&options, gfx10_build_primitive_accepted, params);
}

View file

@ -1134,7 +1134,7 @@ struct si_context {
/* Emitted draw state. */
bool ngg : 1;
uint8_t ngg_culling;
uint16_t ngg_culling;
unsigned last_index_size;
int last_base_vertex;
unsigned last_start_instance;

View file

@ -283,6 +283,8 @@ enum
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */
#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4) /* cull small lines according to the diamond exit rule */
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff)
/**
* For VS shader keys, describe any fixups required for vertex fetch.
@ -660,7 +662,7 @@ struct si_shader_key_ge {
unsigned kill_pointsize : 1;
/* For NGG VS and TES. */
unsigned ngg_culling : 5; /* SI_NGG_CULL_* */
unsigned ngg_culling : 13; /* SI_NGG_CULL_* */
/* For shaders where monolithic variants have better code.
*

View file

@ -960,11 +960,13 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
rs->ngg_cull_flags_tris = SI_NGG_CULL_TRIANGLES;
rs->ngg_cull_flags_tris = SI_NGG_CULL_TRIANGLES |
SI_NGG_CULL_CLIP_PLANE_ENABLE(state->clip_plane_enable);
rs->ngg_cull_flags_tris_y_inverted = rs->ngg_cull_flags_tris;
rs->ngg_cull_flags_lines = SI_NGG_CULL_LINES |
(!rs->perpendicular_end_caps ? SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0);
(!rs->perpendicular_end_caps ? SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0) |
SI_NGG_CULL_CLIP_PLANE_ENABLE(state->clip_plane_enable);
if (rs->rasterizer_discard) {
rs->ngg_cull_flags_tris |= SI_NGG_CULL_FRONT_FACE |

View file

@ -76,9 +76,9 @@ struct si_state_rasterizer {
unsigned pa_cl_clip_cntl;
float line_width;
float max_point_size;
unsigned ngg_cull_flags_tris : 8;
unsigned ngg_cull_flags_tris_y_inverted : 8;
unsigned ngg_cull_flags_lines : 8;
unsigned ngg_cull_flags_tris : 16;
unsigned ngg_cull_flags_tris_y_inverted : 16;
unsigned ngg_cull_flags_lines : 16;
unsigned sprite_coord_enable : 8;
unsigned clip_plane_enable : 8;
unsigned half_pixel_center : 1;

View file

@ -2263,7 +2263,7 @@ static void si_draw(struct pipe_context *ctx,
}
/* Update NGG culling settings. */
uint8_t old_ngg_culling = sctx->ngg_culling;
uint16_t old_ngg_culling = sctx->ngg_culling;
if (GFX_VERSION >= GFX10) {
struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso;
@ -2278,7 +2278,7 @@ static void si_draw(struct pipe_context *ctx,
/* Check that the current shader allows culling. */
assert(hw_vs->ngg_cull_vert_threshold != UINT_MAX);
uint8_t ngg_culling;
uint16_t ngg_culling;
if (util_prim_is_lines(sctx->current_rast_prim)) {
/* Overwrite it to mask out face cull flags. */