mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-01 01:38:06 +02:00
radeonsi: move instance divisors into a constant buffer
Shader key size: 107 -> 47 bytes
Divisors of 0 and 1 are encoded in the shader key. Greater instance divisors
are loaded from a constant buffer.
The shader code doing the division is huge. Is it something we need to
worry about? Does any app use instance divisors >= 2?
VS prolog disassembly:
s_load_dwordx4 s[12:15], s[0:1], 0x80 ; C00A0300 00000080
s_nop 0 ; BF800000
s_waitcnt lgkmcnt(0) ; BF8C007F
s_buffer_load_dword s14, s[12:15], 0x4 ; C0220386 00000004
s_waitcnt lgkmcnt(0) ; BF8C007F
v_cvt_f32_u32_e32 v4, s14 ; 7E080C0E
v_rcp_iflag_f32_e32 v4, v4 ; 7E084704
v_mul_f32_e32 v4, 0x4f800000, v4 ; 0A0808FF 4F800000
v_cvt_u32_f32_e32 v4, v4 ; 7E080F04
v_mul_hi_u32 v5, v4, s14 ; D2860005 00001D04
v_mul_lo_i32 v6, v4, s14 ; D2850006 00001D04
v_cmp_eq_u32_e64 s[12:13], 0, v5 ; D0CA000C 00020A80
v_sub_i32_e32 v5, vcc, 0, v6 ; 340A0C80
v_cndmask_b32_e64 v5, v6, v5, s[12:13] ; D1000005 00320B06
v_mul_hi_u32 v5, v5, v4 ; D2860005 00020905
v_add_i32_e32 v6, vcc, v5, v4 ; 320C0905
v_subrev_i32_e32 v4, vcc, v5, v4 ; 36080905
v_cndmask_b32_e64 v4, v4, v6, s[12:13] ; D1000004 00320D04
v_mul_hi_u32 v5, v4, v1 ; D2860005 00020304
v_add_i32_e32 v4, vcc, s8, v0 ; 32080008
v_mul_lo_i32 v6, v5, s14 ; D2850006 00001D05
v_add_i32_e32 v7, vcc, 1, v5 ; 320E0A81
v_cmp_ge_u32_e64 s[12:13], v1, v6 ; D0CE000C 00020D01
v_sub_i32_e32 v6, vcc, v1, v6 ; 340C0D01
v_cmp_le_u32_e32 vcc, s14, v6 ; 7D960C0E
v_cndmask_b32_e64 v8, 0, -1, s[12:13] ; D1000008 00318280
v_cndmask_b32_e64 v6, 0, -1, vcc ; D1000006 01A98280
v_and_b32_e32 v6, v8, v6 ; 260C0D08
v_cmp_eq_u32_e32 vcc, 0, v6 ; 7D940C80
v_cndmask_b32_e32 v6, v7, v5, vcc ; 000C0B07
v_add_i32_e32 v5, vcc, -1, v5 ; 320A0AC1
v_cmp_eq_u32_e32 vcc, 0, v8 ; 7D941080
v_cndmask_b32_e32 v5, v6, v5, vcc ; 000A0B06
v_add_i32_e32 v5, vcc, s9, v5 ; 320A0A09
v2: set prefer_mono for fetched instance divisors
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
This commit is contained in:
parent
aef998fe4b
commit
4a10d6154e
7 changed files with 93 additions and 28 deletions
|
|
@@ -2192,6 +2192,8 @@ void si_emit_graphics_shader_userdata(struct si_context *sctx,
|
|||
R_00B330_SPI_SHADER_USER_DATA_ES_0);
|
||||
si_emit_shader_pointer(sctx, descs,
|
||||
R_00B430_SPI_SHADER_USER_DATA_HS_0);
|
||||
si_emit_shader_pointer(sctx, descs,
|
||||
R_00B530_SPI_SHADER_USER_DATA_LS_0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@@ -308,6 +308,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
|
|||
|
||||
si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
|
||||
&sctx->null_const_buf);
|
||||
si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
|
||||
&sctx->null_const_buf);
|
||||
si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
|
||||
&sctx->null_const_buf);
|
||||
si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
|
||||
|
|
|
|||
|
|
@@ -312,7 +312,7 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
|
|||
|
||||
static LLVMValueRef get_instance_index_for_fetch(
|
||||
struct si_shader_context *ctx,
|
||||
unsigned param_start_instance, unsigned divisor)
|
||||
unsigned param_start_instance, LLVMValueRef divisor)
|
||||
{
|
||||
struct gallivm_state *gallivm = &ctx->gallivm;
|
||||
|
||||
|
|
@@ -320,9 +320,8 @@ static LLVMValueRef get_instance_index_for_fetch(
|
|||
ctx->param_instance_id);
|
||||
|
||||
/* The division must be done before START_INSTANCE is added. */
|
||||
if (divisor > 1)
|
||||
result = LLVMBuildUDiv(gallivm->builder, result,
|
||||
LLVMConstInt(ctx->i32, divisor, 0), "");
|
||||
if (divisor != ctx->i32_1)
|
||||
result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
|
||||
|
||||
return LLVMBuildAdd(gallivm->builder, result,
|
||||
LLVMGetParam(ctx->main_fn, param_start_instance), "");
|
||||
|
|
@@ -5282,12 +5281,10 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key,
|
|||
const struct si_vs_prolog_bits *prolog,
|
||||
const char *prefix, FILE *f)
|
||||
{
|
||||
fprintf(f, " %s.instance_divisors = {", prefix);
|
||||
for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
|
||||
fprintf(f, !i ? "%u" : ", %u",
|
||||
prolog->instance_divisors[i]);
|
||||
}
|
||||
fprintf(f, "}\n");
|
||||
fprintf(f, " %s.instance_divisor_is_one = %u\n",
|
||||
prefix, prolog->instance_divisor_is_one);
|
||||
fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
|
||||
prefix, prolog->instance_divisor_is_fetched);
|
||||
|
||||
fprintf(f, " mono.vs.fix_fetch = {");
|
||||
for (int i = 0; i < SI_MAX_ATTRIBS; i++)
|
||||
|
|
@@ -5603,10 +5600,12 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
|
|||
key->vs_prolog.num_merged_next_stage_vgprs = 5;
|
||||
}
|
||||
|
||||
/* Set the instanceID flag. */
|
||||
for (unsigned i = 0; i < info->num_inputs; i++)
|
||||
if (key->vs_prolog.states.instance_divisors[i])
|
||||
shader_out->info.uses_instanceid = true;
|
||||
/* Enable loading the InstanceID VGPR. */
|
||||
uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
|
||||
|
||||
if ((key->vs_prolog.states.instance_divisor_is_one |
|
||||
key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
|
||||
shader_out->info.uses_instanceid = true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@@ -6527,6 +6526,21 @@ out:
|
|||
return result;
|
||||
}
|
||||
|
||||
static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
|
||||
{
|
||||
struct gallivm_state *gallivm = &ctx->gallivm;
|
||||
LLVMValueRef ptr[2], list;
|
||||
|
||||
/* Get the pointer to rw buffers. */
|
||||
ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
|
||||
ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
|
||||
list = lp_build_gather_values(gallivm, ptr, 2);
|
||||
list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
|
||||
list = LLVMBuildIntToPtr(gallivm->builder, list,
|
||||
si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the vertex shader prolog function.
|
||||
*
|
||||
|
|
@@ -6609,11 +6623,33 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
|
|||
}
|
||||
|
||||
/* Compute vertex load indices from instance divisors. */
|
||||
LLVMValueRef instance_divisor_constbuf = NULL;
|
||||
|
||||
if (key->vs_prolog.states.instance_divisor_is_fetched) {
|
||||
LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
|
||||
LLVMValueRef buf_index =
|
||||
LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
|
||||
instance_divisor_constbuf =
|
||||
ac_build_indexed_load_const(&ctx->ac, list, buf_index);
|
||||
}
|
||||
|
||||
for (i = 0; i <= key->vs_prolog.last_input; i++) {
|
||||
unsigned divisor = key->vs_prolog.states.instance_divisors[i];
|
||||
bool divisor_is_one =
|
||||
key->vs_prolog.states.instance_divisor_is_one & (1u << i);
|
||||
bool divisor_is_fetched =
|
||||
key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
|
||||
LLVMValueRef index;
|
||||
|
||||
if (divisor) {
|
||||
if (divisor_is_one || divisor_is_fetched) {
|
||||
LLVMValueRef divisor = ctx->i32_1;
|
||||
|
||||
if (divisor_is_fetched) {
|
||||
divisor = buffer_load_const(ctx, instance_divisor_constbuf,
|
||||
LLVMConstInt(ctx->i32, i * 4, 0));
|
||||
divisor = LLVMBuildBitCast(gallivm->builder, divisor,
|
||||
ctx->i32, "");
|
||||
}
|
||||
|
||||
/* InstanceID / Divisor + StartInstance */
|
||||
index = get_instance_index_for_fetch(ctx,
|
||||
user_sgpr_base +
|
||||
|
|
@@ -6866,15 +6902,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
|
|||
/* POS_FIXED_PT is always last. */
|
||||
unsigned pos = key->ps_prolog.num_input_sgprs +
|
||||
key->ps_prolog.num_input_vgprs - 1;
|
||||
LLVMValueRef ptr[2], list;
|
||||
|
||||
/* Get the pointer to rw buffers. */
|
||||
ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
|
||||
ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
|
||||
list = lp_build_gather_values(gallivm, ptr, 2);
|
||||
list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
|
||||
list = LLVMBuildIntToPtr(gallivm->builder, list,
|
||||
si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
|
||||
LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
|
||||
|
||||
si_llvm_emit_polygon_stipple(ctx, list, pos);
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -385,7 +385,14 @@ struct si_shader_selector {
|
|||
|
||||
/* Common VS bits between the shader key and the prolog key. */
|
||||
struct si_vs_prolog_bits {
|
||||
unsigned instance_divisors[SI_MAX_ATTRIBS];
|
||||
/* - If neither "is_one" nor "is_fetched" has a bit set, the instance
|
||||
* divisor is 0.
|
||||
* - If "is_one" has a bit set, the instance divisor is 1.
|
||||
* - If "is_fetched" has a bit set, the instance divisor will be loaded
|
||||
* from the constant buffer.
|
||||
*/
|
||||
uint16_t instance_divisor_is_one; /* bitmask of inputs */
|
||||
uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
|
||||
};
|
||||
|
||||
/* Common TCS bits between the shader key and the epilog key. */
|
||||
|
|
|
|||
|
|
@@ -3773,6 +3773,11 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
|
|||
if (elements[i].instance_divisor) {
|
||||
v->uses_instance_divisors = true;
|
||||
v->instance_divisors[i] = elements[i].instance_divisor;
|
||||
|
||||
if (v->instance_divisors[i] == 1)
|
||||
v->instance_divisor_is_one |= 1u << i;
|
||||
else
|
||||
v->instance_divisor_is_fetched |= 1u << i;
|
||||
}
|
||||
|
||||
if (!used[vbo_index]) {
|
||||
|
|
@@ -3901,6 +3906,16 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
|
|||
v->uses_instance_divisors || /* we don't check which divisors changed */
|
||||
memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
|
||||
sctx->do_update_shaders = true;
|
||||
|
||||
if (v && v->instance_divisor_is_fetched) {
|
||||
struct pipe_constant_buffer cb;
|
||||
|
||||
cb.buffer = NULL;
|
||||
cb.user_buffer = v->instance_divisors;
|
||||
cb.buffer_offset = 0;
|
||||
cb.buffer_size = sizeof(uint32_t) * v->count;
|
||||
si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
|
||||
}
|
||||
}
|
||||
|
||||
static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
|
||||
|
|
|
|||
|
|
@@ -115,6 +115,8 @@ struct si_vertex_elements
|
|||
uint16_t first_vb_use_mask;
|
||||
/* Vertex buffer descriptor list size aligned for optimal prefetch. */
|
||||
uint16_t desc_list_byte_size;
|
||||
uint16_t instance_divisor_is_one; /* bitmask of inputs */
|
||||
uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
|
||||
};
|
||||
|
||||
union si_state {
|
||||
|
|
@@ -182,6 +184,7 @@ enum {
|
|||
SI_VS_STREAMOUT_BUF3,
|
||||
|
||||
SI_HS_CONST_DEFAULT_TESS_LEVELS,
|
||||
SI_VS_CONST_INSTANCE_DIVISORS,
|
||||
SI_VS_CONST_CLIP_PLANES,
|
||||
SI_PS_CONST_POLY_STIPPLE,
|
||||
SI_PS_CONST_SAMPLE_POSITIONS,
|
||||
|
|
|
|||
|
|
@@ -1187,10 +1187,18 @@ static void si_shader_selector_key_vs(struct si_context *sctx,
|
|||
if (!sctx->vertex_elements)
|
||||
return;
|
||||
|
||||
prolog_key->instance_divisor_is_one =
|
||||
sctx->vertex_elements->instance_divisor_is_one;
|
||||
prolog_key->instance_divisor_is_fetched =
|
||||
sctx->vertex_elements->instance_divisor_is_fetched;
|
||||
|
||||
/* Prefer a monolithic shader to allow scheduling divisions around
|
||||
* VBO loads. */
|
||||
if (prolog_key->instance_divisor_is_fetched)
|
||||
key->opt.prefer_mono = 1;
|
||||
|
||||
unsigned count = MIN2(vs->info.num_inputs,
|
||||
sctx->vertex_elements->count);
|
||||
memcpy(prolog_key->instance_divisors,
|
||||
sctx->vertex_elements->instance_divisors, count * 4);
|
||||
memcpy(key->mono.vs_fix_fetch, sctx->vertex_elements->fix_fetch, count);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue