diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index a067a38db4f..6b20f84506c 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -51,6 +51,18 @@ struct fd_dev_info { uint32_t max_waves; + /* Local Memory (i.e. shared memory in GL/Vulkan) and compute shader + * const registers, as well as other things not relevant here, share the + * same storage space, called the Local Buffer or LB. This is the size of + * the part of the LB used for consts and LM. Consts are duplicated + * wave_granularity times, and the size of duplicated consts + local + * memory must not exceed it. If it is left 0, assume that it is + * wave_granularity * compute constlen + cs_shared_mem_size, which is + * enough to hold both the maximum possible compute consts and local + * memory at the same time. + */ + uint32_t compute_lb_size; + /* number of CCU is always equal to the number of SP */ union { uint32_t num_sp_cores; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index b8321245ae7..6b41d22574d 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -103,7 +103,7 @@ class GPUInfo(Struct): tile_max_w, tile_max_h, num_vsc_pipes, cs_shared_mem_size, num_sp_cores, wave_granularity, fibers_per_sp, highest_bank_bit = 0, ubwc_swizzle = 0x7, macrotile_mode = 0, - threadsize_base = 64, max_waves = 16): + threadsize_base = 64, max_waves = 16, compute_lb_size = 0): self.chip = chip.value self.gmem_align_w = gmem_align_w self.gmem_align_h = gmem_align_h @@ -139,9 +139,13 @@ class A6xxGPUInfo(GPUInfo): if chip == CHIP.A6XX: tile_max_w = 1024 # max_bitfield_val(5, 0, 5) tile_max_h = max_bitfield_val(14, 8, 4) # 1008 + compute_lb_size = 0 else: tile_max_w = 1728 tile_max_h = 1728 + # on a7xx the compute_lb_size is 40KB for all known parts for now. 
+ # We have a parameter for it in case some low-end parts cut it down. + compute_lb_size = 40 * 1024 super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4, tile_align_w = tile_align_w, @@ -157,7 +161,8 @@ class A6xxGPUInfo(GPUInfo): ubwc_swizzle = ubwc_swizzle, macrotile_mode = macrotile_mode, threadsize_base = threadsize_base, - max_waves = max_waves) + max_waves = max_waves, + compute_lb_size = compute_lb_size) self.num_ccu = num_ccu diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 24a8e9ebf6b..ee9a101acff 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -263,6 +263,14 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->has_early_preamble = false; } + if (dev_info->compute_lb_size) { + compiler->compute_lb_size = dev_info->compute_lb_size; + } else { + compiler->compute_lb_size = + compiler->max_const_compute * 16 /* bytes/vec4 */ * + compiler->wave_granularity + compiler->local_mem_size; + } + /* This is just a guess for a4xx. */ compiler->pvtmem_per_fiber_align = compiler->gen >= 4 ? 512 : 128; /* TODO: implement private memory on earlier gen's */ diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 6a6d31968eb..cbf4a253b5b 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -129,6 +129,9 @@ struct ir3_compiler { /* The maximum number of constants, in vec4's, for compute shaders. */ uint16_t max_const_compute; + /* See fd_dev_info::compute_lb_size. */ + uint32_t compute_lb_size; + /* Number of instructions that the shader's base address and length * (instrlen divides instruction count by this) must be aligned to. */