ir3: Calculate max_waves and threadsize

max_waves is just for shader-db stats for now, but threadsize will replace the various mechanisms used to determine threadsize across the different generations. Calculating these correctly entails adding a bunch of details about the sizes of various things to ir3. In the future we will use the guts of the max_waves calculation to inform RA decisions as well, which is why the max_waves calculation is broken up into register-dependent and register-independent pieces. Something should be said about the units of reg_size_vec4. These units were chosen for two reasons: 1. As said in the comment, it makes some calculations easier. 2. For a4xx/a5xx, where we don't know as much because we haven't done the same sorts of experiments to probe for the HW configuration, it corresponds more directly to things that are known. The existing code switches to the smaller threadsize when r24.x or higher is used, which translates directly to a reg_size_vec4 of 48. If we chose different units (e.g. multiplying by wave_granularity and/or threadsize_base), then to match the same behavior we'd have to set reg_size_vec4 based on some other parameters that aren't 100% known. If someone comes along and updates those parameters, they might inadvertently break that behavior. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9498>
2026-05-04 05:28:05 +02:00 · 2021-03-10 13:03:16 +01:00 · 2021-03-10 13:03:16 +01:00 · fd7960e191
commit fd7960e191
parent cbc68c79a5
4 changed files with 185 additions and 0 deletions
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -107,6 +107,99 @@ collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
 	}
 }

+static bool
+should_double_threadsize(struct ir3_shader_variant *v,
+						 unsigned regs_count)
+{
+	const struct ir3_compiler *compiler = v->shader->compiler;
+	switch (v->type) {
+	case MESA_SHADER_COMPUTE: {
+		unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+
+		/* For a5xx, if the workgroup size is greater than the maximum number
+		 * of threads per core with 32 threads per wave (512) then we have to
+		 * use the doubled threadsize because otherwise the workgroup wouldn't
+		 * fit. For smaller workgroup sizes, we follow the blob and use the
+		 * smaller threadsize.
+		 */
+		if (compiler->gpu_id < 600) {
+			return v->local_size_variable || threads_per_wg >
+				compiler->threadsize_base * compiler->max_waves;
+		}
+
+		/* On a6xx, we prefer the larger threadsize unless the workgroup is
+		 * small enough that it would be useless. Note that because
+		 * threadsize_base is bumped to 64, we don't have to worry about the
+		 * workgroup fitting, unlike the a5xx case.
+		 */
+		if (!v->local_size_variable) {
+			if (threads_per_wg <= compiler->threadsize_base)
+				return false;
+		}
+	}
+	/* fallthrough */
+	case MESA_SHADER_FRAGMENT: {
+		/* Check that doubling the threadsize wouldn't exceed the regfile size */
+		return regs_count * 2 <= compiler->reg_size_vec4;
+	}
+
+	default:
+		/* On a6xx+, it's impossible to use a doubled wavesize in the geometry
+		 * stages - the bit doesn't exist. The blob never used it for the VS
+		 * on earlier gen's anyway.
+		 */
+		return false;
+	}
+}
+
+/* Get the maximum number of waves that could be used even if this shader
+ * didn't use any registers.
+ */
+static unsigned
+get_reg_independent_max_waves(struct ir3_shader_variant *v, bool double_threadsize)
+{
+	const struct ir3_compiler *compiler = v->shader->compiler;
+	unsigned max_waves = compiler->max_waves;
+
+	/* If this is a compute shader, compute the limit based on shared size */
+	if (v->type == MESA_SHADER_COMPUTE) {
+		/* Shared is allocated in chunks of 1k */
+		unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
+		if (shared_per_wg > 0 && !v->local_size_variable) {
+			unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
+			unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+			unsigned waves_per_wg =
+				DIV_ROUND_UP(threads_per_wg,
+					compiler->threadsize_base *
+					(double_threadsize ? 2 : 1) * compiler->wave_granularity);
+			max_waves =
+				MIN2(max_waves, waves_per_wg * wgs_per_core * compiler->wave_granularity);
+		}
+	}
+
+	/* Compute the limit based on branchstack */
+	if (v->branchstack > 0) {
+		unsigned branchstack_max_waves =
+			compiler->branchstack_size / v->branchstack *
+			compiler->wave_granularity;
+		max_waves = MIN2(max_waves, branchstack_max_waves);
+	}
+
+	return max_waves;
+}
+
+/* Get the maximum number of waves that could be launched limited by reg size.
+ */
+static unsigned
+get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
+							unsigned reg_count, bool double_threadsize)
+{
+	return reg_count ?
+		(compiler->reg_size_vec4 / (reg_count * (double_threadsize ? 2 : 1)) *
+		 compiler->wave_granularity) :
+		compiler->max_waves;
+}
+
 void
 ir3_collect_info(struct ir3_shader_variant *v)
 {
@@ -200,6 +293,20 @@ ir3_collect_info(struct ir3_shader_variant *v)
 			}
 		}
 	}
+
+	/* TODO: for a5xx and below, is there a separate regfile for
+	 * half-registers?
+	 */
+	unsigned regs_count =
+		info->max_reg + 1 + (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);
+
+	info->double_threadsize = should_double_threadsize(v, regs_count);
+	unsigned reg_independent_max_waves =
+		get_reg_independent_max_waves(v, info->double_threadsize);
+	unsigned reg_dependent_max_waves =
+		get_reg_dependent_max_waves(compiler, regs_count, info->double_threadsize);
+	info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
+	assert(info->max_waves <= v->shader->compiler->max_waves);
 }

 static struct ir3_register * reg_create(struct ir3 *shader,
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -64,6 +64,11 @@ struct ir3_info {
 	int8_t   max_reg;   /* highest GPR # used by shader */
 	int8_t   max_half_reg;
 	int16_t  max_const;
+	/* This is the maximum # of waves that can be executed at once in one core,
+	 * assuming that they are all executing this shader.
+	 */
+	int8_t   max_waves;
+	bool     double_threadsize;
 	bool     multi_dword_ldp_stp;

 	/* number of sync bits: */
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -79,6 +79,13 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 	compiler->gpu_id = gpu_id;
 	compiler->set = ir3_ra_alloc_reg_set(compiler, false);

+	/* All known GPU's have 32k local memory (aka shared) */
+	compiler->local_mem_size = 32 * 1024;
+	/* TODO see if older GPU's were different here */
+	compiler->branchstack_size = 64;
+	compiler->wave_granularity = 2;
+	compiler->max_waves = 16;
+
 	if (compiler->gpu_id >= 600) {
 		compiler->mergedregs_set = ir3_ra_alloc_reg_set(compiler, true);
 		compiler->samgq_workaround = true;
@@ -123,6 +130,34 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 		compiler->max_const_safe = 256;
 	}

+	if (compiler->gpu_id == 650) {
+		/* This changed mid-generation for a650, so that using r32.x and above
+		 * requires using the smallest threadsize.
+		 */
+		compiler->reg_size_vec4 = 64;
+	} else if (compiler->gpu_id >= 600) {
+		compiler->reg_size_vec4 = 96;
+	} else if (compiler->gpu_id >= 400) {
+		/* On a4xx-a5xx, using r24.x and above requires using the smallest
+		 * threadsize.
+		 */
+		compiler->reg_size_vec4 = 48;
+	} else {
+		/* TODO: confirm this */
+		compiler->reg_size_vec4 = 96;
+	}
+
+	if (compiler->gpu_id >= 600) {
+		compiler->threadsize_base = 64;
+	} else if (compiler->gpu_id >= 400) {
+		/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
+		 * 1.1 subgroupSize which is 32.
+		 */
+		compiler->threadsize_base = 32;
+	} else {
+		compiler->threadsize_base = 8;
+	}
+
 	if (compiler->gpu_id >= 400) {
 		/* need special handling for "flat" */
 		compiler->flat_bypass = true;
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -107,6 +107,44 @@ struct ir3_compiler {
 	 */
 	uint32_t const_upload_unit;

+	/* The base number of threads per wave. Some stages may be able to double
+	 * this.
+	 */
+	uint32_t threadsize_base;
+
+	/* On at least a6xx, waves are always launched in pairs. In calculations
+	 * about occupancy, we pretend that each wave pair is actually one wave,
+	 * which simplifies many of the calculations, but means we have to
+	 * multiply threadsize_base by this number.
+	 */
+	uint32_t wave_granularity;
+
+	/* The maximum number of simultaneous waves per core. */
+	uint32_t max_waves;
+
+	/* This is the theoretical maximum number of vec4 registers that one wave of
+	 * the base threadsize could use. To get the actual size of the register
+	 * file in bytes one would need to compute:
+	 *
+	 * reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
+	 *
+	 * However this number is more often what we actually need. For example, a
+	 * max_reg more than half of this will result in a doubled threadsize
+	 * being impossible (because double-sized waves take up twice as many
+	 * registers). Also, the formula for the occupancy given a particular
+	 * register footprint is simpler.
+	 *
+	 * It is in vec4 units because the register file is allocated
+	 * with vec4 granularity, so it's in the same units as max_reg.
+	 */
+	uint32_t reg_size_vec4;
+
+	/* The size of local memory in bytes */
+	uint32_t local_mem_size;
+
+	/* The total number of branch stack entries, divided by wave_granularity. */
+	uint32_t branchstack_size;
+
 	/* Whether clip+cull distances are supported */
 	bool has_clip_cull;