mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-06 11:38:05 +02:00
freedreno/ir3: Push UBOs to constant file
We have a rather big constant file and it seems that the best way to use it is to upload all UBOs and lower UBO access to load_uniform. Signed-off-by: Kristian H. Kristensen <hoegsberg@chromium.org> Reviewed-by: Rob Clark <robdclark@gmail.com>
This commit is contained in:
parent
3c8779af32
commit
893425a607
5 changed files with 145 additions and 16 deletions
|
|
@ -124,7 +124,7 @@ ir3_context_init(struct ir3_compiler *compiler,
|
||||||
* Immediates go last mostly because they are inserted in the CP pass
|
* Immediates go last mostly because they are inserted in the CP pass
|
||||||
* after the nir -> ir3 frontend.
|
* after the nir -> ir3 frontend.
|
||||||
*/
|
*/
|
||||||
unsigned constoff = align(ctx->s->num_uniforms, 4);
|
unsigned constoff = align(ctx->so->shader->ubo_state.size / 16, 4);
|
||||||
unsigned ptrsz = ir3_pointer_size(ctx);
|
unsigned ptrsz = ir3_pointer_size(ctx);
|
||||||
|
|
||||||
memset(&so->constbase, ~0, sizeof(so->constbase));
|
memset(&so->constbase, ~0, sizeof(so->constbase));
|
||||||
|
|
|
||||||
|
|
@ -27,9 +27,38 @@
|
||||||
#include "util/u_dynarray.h"
|
#include "util/u_dynarray.h"
|
||||||
#include "mesa/main/macros.h"
|
#include "mesa/main/macros.h"
|
||||||
|
|
||||||
struct ir3_ubo_analysis_state {
|
static inline struct ir3_ubo_range
|
||||||
unsigned lower_count;
|
get_ubo_load_range(nir_intrinsic_instr *instr)
|
||||||
};
|
{
|
||||||
|
struct ir3_ubo_range r;
|
||||||
|
|
||||||
|
const int bytes = nir_intrinsic_dest_components(instr) *
|
||||||
|
(nir_dest_bit_size(instr->dest) / 8);
|
||||||
|
|
||||||
|
r.start = ROUND_DOWN_TO(nir_src_as_uint(instr->src[1]), 16 * 4);
|
||||||
|
r.end = ALIGN(r.start + bytes, 16 * 4);
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
gather_ubo_ranges(nir_intrinsic_instr *instr,
|
||||||
|
struct ir3_ubo_analysis_state *state)
|
||||||
|
{
|
||||||
|
if (!nir_src_is_const(instr->src[0]))
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (!nir_src_is_const(instr->src[1]))
|
||||||
|
return;
|
||||||
|
|
||||||
|
const struct ir3_ubo_range r = get_ubo_load_range(instr);
|
||||||
|
const uint32_t block = nir_src_as_uint(instr->src[0]);
|
||||||
|
|
||||||
|
if (r.start < state->range[block].start)
|
||||||
|
state->range[block].start = r.start;
|
||||||
|
if (state->range[block].end < r.end)
|
||||||
|
state->range[block].end = r.end;
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
|
lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
|
||||||
|
|
@ -43,15 +72,37 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
|
||||||
return;
|
return;
|
||||||
|
|
||||||
const uint32_t block = nir_src_as_uint(instr->src[0]);
|
const uint32_t block = nir_src_as_uint(instr->src[0]);
|
||||||
if (block > 0)
|
|
||||||
return;
|
if (block > 0) {
|
||||||
|
/* We don't lower dynamic array indexing either, but we definitely should.
|
||||||
|
* We don't have a good way of determining the range of the dynamic
|
||||||
|
* access, so for now just fall back to pulling.
|
||||||
|
*/
|
||||||
|
if (!nir_src_is_const(instr->src[1]))
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* After gathering the UBO access ranges, we limit the total
|
||||||
|
* upload. Reject if we're now outside the range.
|
||||||
|
*/
|
||||||
|
const struct ir3_ubo_range r = get_ubo_load_range(instr);
|
||||||
|
if (!(state->range[block].start <= r.start &&
|
||||||
|
r.end <= state->range[block].end))
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
b->cursor = nir_before_instr(&instr->instr);
|
b->cursor = nir_before_instr(&instr->instr);
|
||||||
|
|
||||||
nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
|
nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
|
||||||
nir_ssa_def *uniform_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
|
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
|
||||||
if (uniform_offset == NULL)
|
if (new_offset)
|
||||||
uniform_offset = nir_ushr(b, ubo_offset, nir_imm_int(b, 2));
|
ubo_offset = new_offset;
|
||||||
|
else
|
||||||
|
ubo_offset = nir_ushr(b, ubo_offset, nir_imm_int(b, 2));
|
||||||
|
|
||||||
|
const int range_offset =
|
||||||
|
(state->range[block].offset - state->range[block].start) / 4;
|
||||||
|
nir_ssa_def *uniform_offset =
|
||||||
|
nir_iadd(b, ubo_offset, nir_imm_int(b, range_offset));
|
||||||
|
|
||||||
nir_intrinsic_instr *uniform =
|
nir_intrinsic_instr *uniform =
|
||||||
nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
|
nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
|
||||||
|
|
@ -72,7 +123,45 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
|
||||||
bool
|
bool
|
||||||
ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
|
ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
|
||||||
{
|
{
|
||||||
struct ir3_ubo_analysis_state state = { 0 };
|
struct ir3_ubo_analysis_state *state = &shader->ubo_state;
|
||||||
|
|
||||||
|
memset(state, 0, sizeof(*state));
|
||||||
|
state->range[0].end = nir->num_uniforms * 16;
|
||||||
|
|
||||||
|
nir_foreach_function(function, nir) {
|
||||||
|
if (function->impl) {
|
||||||
|
nir_foreach_block(block, function->impl) {
|
||||||
|
nir_foreach_instr(instr, block) {
|
||||||
|
if (instr->type == nir_instr_type_intrinsic &&
|
||||||
|
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo)
|
||||||
|
gather_ubo_ranges(nir_instr_as_intrinsic(instr), state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* For now, everything we upload is accessed statically and thus will be
|
||||||
|
* used by the shader. Once we can upload dynamically indexed data, we may
|
||||||
|
* upload sparsely accessed arrays, at which point we probably want to
|
||||||
|
* give priority to smaller UBOs, on the assumption that big UBOs will be
|
||||||
|
* accessed dynamically. Alternatively, we can track statically and
|
||||||
|
* dynamically accessed ranges separately and upload static ranges
|
||||||
|
* first.
|
||||||
|
*/
|
||||||
|
const uint32_t max_upload = 16 * 1024;
|
||||||
|
uint32_t offset = 0;
|
||||||
|
for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
|
||||||
|
uint32_t range_size = state->range[i].end - state->range[i].start;
|
||||||
|
|
||||||
|
debug_assert(offset <= max_upload);
|
||||||
|
state->range[i].offset = offset;
|
||||||
|
if (offset + range_size > max_upload) {
|
||||||
|
range_size = max_upload - offset;
|
||||||
|
state->range[i].end = state->range[i].start + range_size;
|
||||||
|
}
|
||||||
|
offset += range_size;
|
||||||
|
}
|
||||||
|
state->size = offset;
|
||||||
|
|
||||||
nir_foreach_function(function, nir) {
|
nir_foreach_function(function, nir) {
|
||||||
if (function->impl) {
|
if (function->impl) {
|
||||||
|
|
@ -82,7 +171,7 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
|
||||||
nir_foreach_instr_safe(instr, block) {
|
nir_foreach_instr_safe(instr, block) {
|
||||||
if (instr->type == nir_instr_type_intrinsic &&
|
if (instr->type == nir_instr_type_intrinsic &&
|
||||||
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo)
|
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo)
|
||||||
lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr), &builder, &state);
|
lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr), &builder, state);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -91,5 +180,5 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return state.lower_count > 0;
|
return state->lower_count > 0;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -67,6 +67,8 @@ enum ir3_driver_param {
|
||||||
#define IR3_MAX_SHADER_IMAGES 32
|
#define IR3_MAX_SHADER_IMAGES 32
|
||||||
#define IR3_MAX_SO_BUFFERS 4
|
#define IR3_MAX_SO_BUFFERS 4
|
||||||
#define IR3_MAX_SO_OUTPUTS 64
|
#define IR3_MAX_SO_OUTPUTS 64
|
||||||
|
#define IR3_MAX_CONSTANT_BUFFERS 32
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For consts needed to pass internal values to shader which may or may not
|
* For consts needed to pass internal values to shader which may or may not
|
||||||
|
|
@ -474,6 +476,19 @@ struct ir3_shader_variant {
|
||||||
struct ir3_shader *shader;
|
struct ir3_shader *shader;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ir3_ubo_range {
|
||||||
|
uint32_t offset; /* start offset of this block in const register file */
|
||||||
|
uint32_t start, end; /* range of block that's actually used */
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ir3_ubo_analysis_state
|
||||||
|
{
|
||||||
|
struct ir3_ubo_range range[IR3_MAX_CONSTANT_BUFFERS];
|
||||||
|
uint32_t size;
|
||||||
|
uint32_t lower_count;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
struct ir3_shader {
|
struct ir3_shader {
|
||||||
gl_shader_stage type;
|
gl_shader_stage type;
|
||||||
|
|
||||||
|
|
@ -486,6 +501,8 @@ struct ir3_shader {
|
||||||
|
|
||||||
struct ir3_compiler *compiler;
|
struct ir3_compiler *compiler;
|
||||||
|
|
||||||
|
struct ir3_ubo_analysis_state ubo_state;
|
||||||
|
|
||||||
struct nir_shader *nir;
|
struct nir_shader *nir;
|
||||||
struct ir3_stream_output_info stream_output;
|
struct ir3_stream_output_info stream_output;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -72,11 +72,10 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type,
|
||||||
uint32_t regid, uint32_t offset, uint32_t sizedwords,
|
uint32_t regid, uint32_t offset, uint32_t sizedwords,
|
||||||
const uint32_t *dwords, struct pipe_resource *prsc)
|
const uint32_t *dwords, struct pipe_resource *prsc)
|
||||||
{
|
{
|
||||||
uint32_t i, sz;
|
uint32_t i, sz, align_sz;
|
||||||
enum a6xx_state_src src;
|
enum a6xx_state_src src;
|
||||||
|
|
||||||
debug_assert((regid % 4) == 0);
|
debug_assert((regid % 4) == 0);
|
||||||
debug_assert((sizedwords % 4) == 0);
|
|
||||||
|
|
||||||
if (prsc) {
|
if (prsc) {
|
||||||
sz = 0;
|
sz = 0;
|
||||||
|
|
@ -86,12 +85,14 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type,
|
||||||
src = SS6_DIRECT;
|
src = SS6_DIRECT;
|
||||||
}
|
}
|
||||||
|
|
||||||
OUT_PKT7(ring, shader_t_to_opcode(type), 3 + sz);
|
align_sz = align(sz, 4);
|
||||||
|
|
||||||
|
OUT_PKT7(ring, shader_t_to_opcode(type), 3 + align_sz);
|
||||||
OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) |
|
OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) |
|
||||||
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
|
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
|
||||||
CP_LOAD_STATE6_0_STATE_SRC(src) |
|
CP_LOAD_STATE6_0_STATE_SRC(src) |
|
||||||
CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(type)) |
|
CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(type)) |
|
||||||
CP_LOAD_STATE6_0_NUM_UNIT(sizedwords/4));
|
CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4)));
|
||||||
if (prsc) {
|
if (prsc) {
|
||||||
struct fd_bo *bo = fd_resource(prsc)->bo;
|
struct fd_bo *bo = fd_resource(prsc)->bo;
|
||||||
OUT_RELOC(ring, bo, offset, 0, 0);
|
OUT_RELOC(ring, bo, offset, 0, 0);
|
||||||
|
|
@ -100,9 +101,15 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type,
|
||||||
OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
|
OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
|
||||||
dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
|
dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < sz; i++) {
|
for (i = 0; i < sz; i++) {
|
||||||
OUT_RING(ring, dwords[i]);
|
OUT_RING(ring, dwords[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Zero-pad to multiple of 4 dwords */
|
||||||
|
for (i = sz; i < align_sz; i++) {
|
||||||
|
OUT_RING(ring, 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
|
||||||
|
|
@ -254,6 +254,22 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
|
||||||
cb->user_buffer, cb->buffer);
|
cb->user_buffer, cb->buffer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ir3_ubo_analysis_state *state;
|
||||||
|
state = &v->shader->ubo_state;
|
||||||
|
|
||||||
|
for (uint32_t i = 1; i < ARRAY_SIZE(state->range); i++) {
|
||||||
|
struct pipe_constant_buffer *cb = &constbuf->cb[i];
|
||||||
|
|
||||||
|
if (state->range[i].start < state->range[i].end &&
|
||||||
|
constbuf->enabled_mask & (1 << i)) {
|
||||||
|
|
||||||
|
ctx->emit_const(ring, v->type, state->range[i].offset / 4,
|
||||||
|
cb->buffer_offset + state->range[i].start,
|
||||||
|
(state->range[i].end - state->range[i].start) / 4,
|
||||||
|
cb->user_buffer, cb->buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue