radeonsi: implement shader-based culling for lines

This helps some viewperf subtests.
Only view XY culling is done. Edgeflags are always disabled with lines.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13048>
This commit is contained in:
Marek Olšák 2021-09-26 14:18:45 -04:00 committed by Marge Bot
parent e7e0b90c94
commit f00d3e2909
8 changed files with 59 additions and 25 deletions

View file

@ -52,7 +52,7 @@ struct ac_position_w_info {
};
static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
struct ac_position_w_info *w)
struct ac_position_w_info *w, unsigned num_vertices)
{
LLVMBuilderRef builder = ctx->builder;
LLVMValueRef all_w_negative = ctx->i1true;
@ -60,7 +60,7 @@ static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[
w->w_reflection = ctx->i1false;
w->any_w_negative = ctx->i1false;
for (unsigned i = 0; i < 3; i++) {
for (unsigned i = 0; i < num_vertices; i++) {
LLVMValueRef neg_w;
neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, "");
@ -137,11 +137,14 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
/* Compute the primitive bounding box for easy culling. */
for (unsigned chan = 0; chan < (options->cull_view_near_z ||
options->cull_view_far_z ? 3 : 2); chan++) {
assert(options->num_vertices >= 2);
bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]);
bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]);
bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]);
bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]);
if (options->num_vertices == 3) {
bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]);
bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]);
}
}
/* View culling. */
@ -231,7 +234,7 @@ void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
void *userdata)
{
struct ac_position_w_info w;
ac_analyze_position_w(ctx, pos, &w);
ac_analyze_position_w(ctx, pos, &w, options->num_vertices);
/* W culling. */
LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true;

View file

@ -46,6 +46,8 @@ struct ac_cull_options {
bool cull_w; /* cull primitives with all W < 0 */
bool use_halfz_clip_space;
uint8_t num_vertices; /* 1..3 */
};
/* Callback invoked in the inner-most branch where the primitive is accepted. */

View file

@ -83,6 +83,9 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, uns
/* Blits always use axis-aligned rectangles with 3 vertices. */
*num_vertices = 3;
return LLVMConstInt(ctx->ac.i32, 3, 0);
} else if (ctx->shader->key.opt.ngg_culling & SI_NGG_CULL_LINES) {
*num_vertices = 2;
return LLVMConstInt(ctx->ac.i32, 2, 0);
} else {
/* We always build up all three indices for the prim export
* independent of the primitive type. The additional garbage
@ -994,13 +997,23 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
/* Execute culling code. */
struct ac_cull_options options = {};
options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
options.cull_view_xy = true;
options.cull_small_prims = true; /* this would only be false with conservative rasterization */
options.cull_zero_area = options.cull_front || options.cull_back;
options.cull_w = true;
if (shader->key.opt.ngg_culling & SI_NGG_CULL_LINES) {
options.num_vertices = 2;
assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE));
assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE));
assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL));
} else {
options.num_vertices = 3;
options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
options.cull_small_prims = true; /* this would only be false with conservative rasterization */
options.cull_zero_area = options.cull_front || options.cull_back;
}
/* Tell ES threads whether their vertex survived. */
LLVMValueRef params[] = {
gs_accepted,
@ -1995,7 +2008,7 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
const gl_shader_stage gs_stage = gs_sel->info.stage;
const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
const unsigned input_prim = si_get_input_prim(gs_sel);
const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key);
const bool use_adjacency =
input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);

View file

@ -1819,6 +1819,12 @@ static inline unsigned si_get_total_colormask(struct si_context *sctx)
((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \
(1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY))
/* Bitmask with one bit (1 << PIPE_PRIM_*) set for every triangle-like
 * primitive type: triangles, strips, fans, quads, quad strips, polygons and
 * the two triangle adjacency variants. */
#define UTIL_ALL_PRIM_TRIANGLE_MODES \
((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | \
(1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | \
(1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | \
(1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))
static inline bool util_prim_is_lines(unsigned prim)
{
return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0;
@ -1831,11 +1837,12 @@ static inline bool util_prim_is_points_or_lines(unsigned prim)
/**
 * Return true if \p prim is one of the primitive types whose bit is set in
 * UTIL_ALL_PRIM_TRIANGLE_MODES (triangles, strips, fans, quads, quad strips,
 * polygons and the triangle adjacency variants).
 */
static inline bool util_rast_prim_is_triangles(unsigned prim)
{
   /* Test against the shared mask instead of an open-coded PIPE_PRIM_* list,
    * so this helper and util_rast_prim_is_lines_or_triangles() cannot drift
    * apart. */
   return ((1 << prim) & UTIL_ALL_PRIM_TRIANGLE_MODES) != 0;
}
/**
 * Return true if \p prim is a line or a triangle primitive type, i.e. if its
 * bit is set in UTIL_ALL_PRIM_LINE_MODES or UTIL_ALL_PRIM_TRIANGLE_MODES.
 */
static inline bool util_rast_prim_is_lines_or_triangles(unsigned prim)
{
   const int lines_and_tris = UTIL_ALL_PRIM_LINE_MODES | UTIL_ALL_PRIM_TRIANGLE_MODES;
   const int prim_bit = 1 << prim;

   return (prim_bit & lines_and_tris) != 0;
}
/**

View file

@ -287,6 +287,7 @@ enum
#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */
#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3)
#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */
#define SI_NGG_CULL_LINES (1 << 7) /* the primitive type is lines */
/**
* For VS shader keys, describe any fixups required for vertex fetch.
@ -685,7 +686,7 @@ struct si_shader_key {
unsigned kill_pointsize : 1;
/* For NGG VS and TES. */
unsigned ngg_culling : 7; /* SI_NGG_CULL_* */
unsigned ngg_culling : 8; /* SI_NGG_CULL_* */
/* For shaders where monolithic variants have better code.
*
@ -963,7 +964,8 @@ static inline bool si_shader_uses_bindless_images(struct si_shader_selector *sel
static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader)
{
if (shader->selector->info.stage == MESA_SHADER_VERTEX &&
!shader->selector->info.base.vs.blit_sgprs_amd)
!shader->selector->info.base.vs.blit_sgprs_amd &&
!(shader->key.opt.ngg_culling & SI_NGG_CULL_LINES))
return true;
return false;

View file

@ -587,7 +587,7 @@ void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key,
void si_update_ps_inputs_read_or_disabled(struct si_context *sctx);
void si_update_ps_kill_enable(struct si_context *sctx);
void si_update_vrs_flat_shading(struct si_context *sctx);
unsigned si_get_input_prim(const struct si_shader_selector *gs);
unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key);
bool si_update_ngg(struct si_context *sctx);
void si_ps_key_update_framebuffer(struct si_context *sctx);
void si_ps_key_update_framebuffer_blend(struct si_context *sctx);

View file

@ -2157,8 +2157,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
if (NGG && !HAS_GS &&
/* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type
* is not triangles, so this check is only needed without tessellation. */
(HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) &&
* is not points, so this check is only needed without tessellation. */
(HAS_TESS || util_rast_prim_is_lines_or_triangles(sctx->current_rast_prim)) &&
/* Only the first draw for a shader starts with culling disabled and it's disabled
* until we pass the total_direct_count check and then it stays enabled until
* the shader is changed. This eliminates most culling on/off state changes. */
@ -2170,6 +2170,11 @@ static void si_draw_vbo(struct pipe_context *ctx,
rs->ngg_cull_flags;
assert(ngg_culling); /* rasterizer state should always set this to non-zero */
if (util_prim_is_lines(sctx->current_rast_prim)) {
/* Overwrite it to mask out face cull flags. */
ngg_culling = SI_NGG_CULL_ENABLED | SI_NGG_CULL_LINES;
}
/* Use NGG fast launch for certain primitive types.
* A draw must have at least 1 full primitive.
* The fast launch doesn't work with tessellation.

View file

@ -1094,7 +1094,7 @@ static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx)
gfx10_emit_shader_ngg_tail(sctx, shader);
}
unsigned si_get_input_prim(const struct si_shader_selector *gs)
unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key)
{
if (gs->info.stage == MESA_SHADER_GEOMETRY)
return gs->info.base.gs.input_primitive;
@ -1107,7 +1107,9 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs)
return PIPE_PRIM_TRIANGLES;
}
/* TODO: Set this correctly if the primitive type is set in the shader key. */
if (key->opt.ngg_culling & SI_NGG_CULL_LINES)
return PIPE_PRIM_LINES;
return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
}
@ -1151,7 +1153,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
gs_info->base.vs.window_space_position : 0;
bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
unsigned input_prim = si_get_input_prim(gs_sel);
unsigned input_prim = si_get_input_prim(gs_sel, &shader->key);
bool break_wave_at_eoi = false;
struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
if (!pm4)
@ -2987,7 +2989,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->ngg_cull_vert_threshold = 128;
}
} else if (sel->info.stage == MESA_SHADER_TESS_EVAL) {
if (sel->rast_prim == PIPE_PRIM_TRIANGLES &&
if (sel->rast_prim != PIPE_PRIM_POINTS &&
(sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL) ||
sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS) ||
sscreen->info.chip_class == GFX10_3))