freedreno/a3xx: parameterize ubo optimization

A3xx apparently has higher alignment requirements than later gens for
indirect const uploads. It also has fewer consts available. Add compiler
parameters for both settings, and set them accordingly for a3xx and a4xx+.
This fixes all the ubo test failures caused by this optimization.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5077>
Ilia Mirkin 2020-05-17 18:08:11 -04:00
parent 475fb28377
commit b5accb3ff9
4 changed files with 27 additions and 1930 deletions

File diff suppressed because it is too large.

src/freedreno/ir3/ir3_compiler.c

@@ -73,6 +73,8 @@ struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id
       compiler->unminify_coords = false;
       compiler->txf_ms_with_isaml = false;
       compiler->array_index_add_half = true;
+      compiler->max_const = 1024;
+      compiler->const_upload_unit = 4;
    } else {
       /* no special handling for "flat" */
       compiler->flat_bypass = false;
@@ -80,6 +82,8 @@ struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id
       compiler->unminify_coords = true;
       compiler->txf_ms_with_isaml = true;
       compiler->array_index_add_half = false;
+      compiler->max_const = 512;
+      compiler->const_upload_unit = 8;
    }

    return compiler;
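
Not part of the patch, just a quick standalone sketch of what the two new parameters work out to in bytes, using the values set above (a4xx+: 1024 consts with an upload unit of 4, a3xx: 512 consts with an upload unit of 8, all in vec4 units):

#include <stdio.h>

int main(void)
{
   const struct { const char *gen; unsigned max_const, const_upload_unit; } gens[] = {
      { "a4xx+", 1024, 4 },
      { "a3xx",   512, 8 },
   };

   for (int i = 0; i < 2; i++) {
      /* max_const * 16 bytes caps the total const upload (see the max_upload
       * change below); const_upload_unit * 16 bytes is the granularity that
       * UBO load ranges get aligned to. */
      printf("%-5s: upload limit %u bytes, upload unit %u bytes\n",
             gens[i].gen, gens[i].max_const * 16, gens[i].const_upload_unit * 16);
   }
   return 0;
}

For a4xx+ this prints a 16384-byte limit with a 64-byte unit, matching the hardcoded 16 * 1024 being replaced in ir3_nir_analyze_ubo_ranges.c below; a3xx drops to an 8192-byte limit with a 128-byte unit.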

src/freedreno/ir3/ir3_compiler.h

@@ -67,6 +67,16 @@ struct ir3_compiler {
    /* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
     */
    bool samgq_workaround;
+
+   /* on a3xx, the limit on const access is lower than later gens (in vec4
+    * units):
+    */
+   uint32_t max_const;
+
+   /* on a3xx, the unit of indirect const load is higher than later gens (in
+    * vec4 units):
+    */
+   uint32_t const_upload_unit;
 };

 struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);

src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c

@@ -28,15 +28,15 @@
 #include "util/u_math.h"

 static inline struct ir3_ubo_range
-get_ubo_load_range(nir_intrinsic_instr *instr)
+get_ubo_load_range(nir_intrinsic_instr *instr, uint32_t alignment)
 {
    struct ir3_ubo_range r;

    int offset = nir_src_as_uint(instr->src[1]);
    const int bytes = nir_intrinsic_dest_components(instr) * 4;

-   r.start = ROUND_DOWN_TO(offset, 16 * 4);
-   r.end = ALIGN(offset + bytes, 16 * 4);
+   r.start = ROUND_DOWN_TO(offset, alignment * 16);
+   r.end = ALIGN(offset + bytes, alignment * 16);

    return r;
 }
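
Not part of the patch: a standalone sketch of the range math above, using self-contained power-of-two equivalents of the ROUND_DOWN_TO/ALIGN macros and made-up offset/size values, to show how the larger a3xx unit (8 vec4s = 128 bytes) widens a range compared to the a4xx+ unit (4 vec4s = 64 bytes):

#include <stdio.h>

#define ALIGN_POT(x, a)         (((x) + (a) - 1) & ~((a) - 1))
#define ROUND_DOWN_TO_POT(x, a) ((x) & ~((a) - 1))

int main(void)
{
   const unsigned offset = 100;    /* byte offset of the UBO load */
   const unsigned bytes  = 4 * 4;  /* a vec4 load: 4 components * 4 bytes */

   for (unsigned alignment = 4; alignment <= 8; alignment *= 2) {
      unsigned start = ROUND_DOWN_TO_POT(offset, alignment * 16);
      unsigned end   = ALIGN_POT(offset + bytes, alignment * 16);
      /* alignment=4 -> [64, 128); alignment=8 -> [0, 128) */
      printf("unit %u vec4s: range [%u, %u)\n", alignment, start, end);
   }
   return 0;
}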
@@ -85,7 +85,7 @@ get_existing_range(nir_intrinsic_instr *instr,

 static void
 gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
-                  struct ir3_ubo_analysis_state *state)
+                  struct ir3_ubo_analysis_state *state, uint32_t alignment)
 {
    struct ir3_ubo_range *old_r = get_existing_range(instr, state, true);
    if (!old_r)
@@ -97,13 +97,13 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
           * load_uniform. Set the range to cover all of UBO 0.
           */
          old_r->start = 0;
-         old_r->end = ALIGN(nir->num_uniforms * 16, 16 * 4);
+         old_r->end = ALIGN(nir->num_uniforms * 16, alignment * 16);
       }
       return;
    }

-   const struct ir3_ubo_range r = get_ubo_load_range(instr);
+   const struct ir3_ubo_range r = get_ubo_load_range(instr, alignment);

    /* if UBO lowering is disabled, we still want to lower block 0
     * (which is normal uniforms):
@@ -207,7 +207,7 @@ lower_ubo_block_decrement(nir_intrinsic_instr *instr, nir_builder *b, int *num_u

 static void
 lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
-                          struct ir3_ubo_analysis_state *state, int *num_ubos)
+                          struct ir3_ubo_analysis_state *state, int *num_ubos, uint32_t alignment)
 {
    b->cursor = nir_before_instr(&instr->instr);
@@ -234,7 +234,7 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
    /* After gathering the UBO access ranges, we limit the total
     * upload. Reject if we're now outside the range.
     */
-   const struct ir3_ubo_range r = get_ubo_load_range(instr);
+   const struct ir3_ubo_range r = get_ubo_load_range(instr, alignment);
    if (!(range->start <= r.start && r.end <= range->end)) {
       lower_ubo_block_decrement(instr, b, num_ubos);
       return;
@@ -325,7 +325,8 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
       nir_foreach_block (block, function->impl) {
          nir_foreach_instr (instr, block) {
            if (instr_is_load_ubo(instr))
-              gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr), state);
+              gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr),
+                 state, shader->compiler->const_upload_unit);
         }
      }
   }
@@ -339,7 +340,7 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
     * dynamically accessed ranges separately and upload static rangtes
     * first.
     */
-   const uint32_t max_upload = 16 * 1024;
+   const uint32_t max_upload = shader->compiler->max_const * 16;
    uint32_t offset = shader->const_state.num_reserved_user_consts * 16;
    state->num_enabled = ARRAY_SIZE(state->range);
    for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
@@ -370,7 +371,8 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
          nir_foreach_instr_safe (instr, block) {
             if (instr_is_load_ubo(instr))
                lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr),
-                  &builder, state, &num_ubos);
+                  &builder, state, &num_ubos,
+                  shader->compiler->const_upload_unit);
          }
       }