freedreno/a3xx: parameterize ubo optimization

A3xx apparently has higher alignment requirements than later gens for
indirect const uploads. It also has fewer consts available. Add compiler
parameters for both settings, and set them accordingly for a3xx and a4xx+.
This fixes all the ubo test failures caused by this optimization.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5077>
Ilia Mirkin 2020-05-17 18:08:11 -04:00
parent 475fb28377
commit b5accb3ff9
4 changed files with 27 additions and 1930 deletions

File diff suppressed because it is too large.

src/freedreno/ir3/ir3_compiler.c

@@ -73,6 +73,8 @@ struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id
       compiler->unminify_coords = false;
       compiler->txf_ms_with_isaml = false;
       compiler->array_index_add_half = true;
+      compiler->max_const = 1024;
+      compiler->const_upload_unit = 4;
    } else {
       /* no special handling for "flat" */
       compiler->flat_bypass = false;
@@ -80,6 +82,8 @@ struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id
       compiler->unminify_coords = true;
       compiler->txf_ms_with_isaml = true;
       compiler->array_index_add_half = false;
+      compiler->max_const = 512;
+      compiler->const_upload_unit = 8;
    }

    return compiler;
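
Not part of the patch, just a quick standalone sketch of what the two new parameters work out to in bytes, using the values set above (a4xx+: 1024 consts with an upload unit of 4, a3xx: 512 consts with an upload unit of 8, all in vec4 units):

#include <stdio.h>

int main(void)
{
   const struct { const char *gen; unsigned max_const, const_upload_unit; } gens[] = {
      { "a4xx+", 1024, 4 },
      { "a3xx",   512, 8 },
   };

   for (int i = 0; i < 2; i++) {
      /* max_const * 16 bytes caps the total const upload (see the max_upload
       * change below); const_upload_unit * 16 bytes is the granularity that
       * UBO load ranges get aligned to. */
      printf("%-5s: upload limit %u bytes, upload unit %u bytes\n",
             gens[i].gen, gens[i].max_const * 16, gens[i].const_upload_unit * 16);
   }
   return 0;
}

For a4xx+ this prints a 16384-byte limit with a 64-byte unit, matching the hardcoded 16 * 1024 being replaced in ir3_nir_analyze_ubo_ranges.c below; a3xx drops to an 8192-byte limit with a 128-byte unit.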

src/freedreno/ir3/ir3_compiler.h

@@ -67,6 +67,16 @@ struct ir3_compiler {
    /* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
     */
    bool samgq_workaround;
+
+   /* on a3xx, the limit on const access is lower than later gens (in vec4
+    * units):
+    */
+   uint32_t max_const;
+
+   /* on a3xx, the unit of indirect const load is higher than later gens (in
+    * vec4 units):
+    */
+   uint32_t const_upload_unit;
 };

 struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);

src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c

@@ -28,15 +28,15 @@
 #include "util/u_math.h"

 static inline struct ir3_ubo_range
-get_ubo_load_range(nir_intrinsic_instr *instr)
+get_ubo_load_range(nir_intrinsic_instr *instr, uint32_t alignment)
 {
    struct ir3_ubo_range r;

    int offset = nir_src_as_uint(instr->src[1]);
    const int bytes = nir_intrinsic_dest_components(instr) * 4;

-   r.start = ROUND_DOWN_TO(offset, 16 * 4);
-   r.end = ALIGN(offset + bytes, 16 * 4);
+   r.start = ROUND_DOWN_TO(offset, alignment * 16);
+   r.end = ALIGN(offset + bytes, alignment * 16);

    return r;
 }
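
Not part of the patch: a standalone sketch of the range math above, using self-contained power-of-two equivalents of the ROUND_DOWN_TO/ALIGN macros and made-up offset/size values, to show how the larger a3xx unit (8 vec4s = 128 bytes) widens a range compared to the a4xx+ unit (4 vec4s = 64 bytes):

#include <stdio.h>

#define ALIGN_POT(x, a)         (((x) + (a) - 1) & ~((a) - 1))
#define ROUND_DOWN_TO_POT(x, a) ((x) & ~((a) - 1))

int main(void)
{
   const unsigned offset = 100;    /* byte offset of the UBO load */
   const unsigned bytes  = 4 * 4;  /* a vec4 load: 4 components * 4 bytes */

   for (unsigned alignment = 4; alignment <= 8; alignment *= 2) {
      unsigned start = ROUND_DOWN_TO_POT(offset, alignment * 16);
      unsigned end   = ALIGN_POT(offset + bytes, alignment * 16);
      /* alignment=4 -> [64, 128); alignment=8 -> [0, 128) */
      printf("unit %u vec4s: range [%u, %u)\n", alignment, start, end);
   }
   return 0;
}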
@@ -85,7 +85,7 @@ get_existing_range(nir_intrinsic_instr *instr,

 static void
 gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
-                  struct ir3_ubo_analysis_state *state)
+                  struct ir3_ubo_analysis_state *state, uint32_t alignment)
 {
    struct ir3_ubo_range *old_r = get_existing_range(instr, state, true);
    if (!old_r)
@@ -97,13 +97,13 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
           * load_uniform. Set the range to cover all of UBO 0.
           */
          old_r->start = 0;
-         old_r->end = ALIGN(nir->num_uniforms * 16, 16 * 4);
+         old_r->end = ALIGN(nir->num_uniforms * 16, alignment * 16);
       }
       return;
    }

-   const struct ir3_ubo_range r = get_ubo_load_range(instr);
+   const struct ir3_ubo_range r = get_ubo_load_range(instr, alignment);

    /* if UBO lowering is disabled, we still want to lower block 0
     * (which is normal uniforms):
@@ -207,7 +207,7 @@ lower_ubo_block_decrement(nir_intrinsic_instr *instr, nir_builder *b, int *num_u

 static void
 lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
-                          struct ir3_ubo_analysis_state *state, int *num_ubos)
+                          struct ir3_ubo_analysis_state *state, int *num_ubos, uint32_t alignment)
 {
    b->cursor = nir_before_instr(&instr->instr);
@@ -234,7 +234,7 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
    /* After gathering the UBO access ranges, we limit the total
     * upload. Reject if we're now outside the range.
     */
-   const struct ir3_ubo_range r = get_ubo_load_range(instr);
+   const struct ir3_ubo_range r = get_ubo_load_range(instr, alignment);
    if (!(range->start <= r.start && r.end <= range->end)) {
       lower_ubo_block_decrement(instr, b, num_ubos);
       return;
@@ -325,7 +325,8 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
       nir_foreach_block (block, function->impl) {
          nir_foreach_instr (instr, block) {
            if (instr_is_load_ubo(instr))
-              gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr), state);
+              gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr),
+                 state, shader->compiler->const_upload_unit);
         }
      }
   }
@@ -339,7 +340,7 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
     * dynamically accessed ranges separately and upload static rangtes
     * first.
     */
-   const uint32_t max_upload = 16 * 1024;
+   const uint32_t max_upload = shader->compiler->max_const * 16;
    uint32_t offset = shader->const_state.num_reserved_user_consts * 16;
    state->num_enabled = ARRAY_SIZE(state->range);
    for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
@@ -370,7 +371,8 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
          nir_foreach_instr_safe (instr, block) {
             if (instr_is_load_ubo(instr))
                lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr),
-                  &builder, state, &num_ubos);
+                  &builder, state, &num_ubos,
+                  shader->compiler->const_upload_unit);
          }
       }