mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-03 07:48:07 +02:00
freedreno,turnip: Make CS shared memory size configurable
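a610 and similar models have less shared memory, so the compute shader shared memory size is now taken from the per-device info instead of being hard-coded to 32 KiB. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20991>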
This commit is contained in:
parent
05fffc7b25
commit
4a43ab3019
5 changed files with 26 additions and 7 deletions
|
|
@ -49,6 +49,8 @@ struct fd_dev_info {
|
|||
|
||||
uint32_t num_vsc_pipes;
|
||||
|
||||
uint32_t cs_shared_mem_size;
|
||||
|
||||
/* number of CCU is always equal to the number of SP */
|
||||
union {
|
||||
uint32_t num_sp_cores;
|
||||
|
|
|
|||
|
|
@ -108,7 +108,8 @@ class GPUInfo(Struct):
|
|||
"""
|
||||
def __init__(self, chip, gmem_align_w, gmem_align_h,
|
||||
tile_align_w, tile_align_h,
|
||||
tile_max_w, tile_max_h, num_vsc_pipes):
|
||||
tile_max_w, tile_max_h, num_vsc_pipes,
|
||||
cs_shared_mem_size):
|
||||
self.chip = chip.value
|
||||
self.gmem_align_w = gmem_align_w
|
||||
self.gmem_align_h = gmem_align_h
|
||||
|
|
@ -117,6 +118,7 @@ class GPUInfo(Struct):
|
|||
self.tile_max_w = tile_max_w
|
||||
self.tile_max_h = tile_max_h
|
||||
self.num_vsc_pipes = num_vsc_pipes
|
||||
self.cs_shared_mem_size = cs_shared_mem_size
|
||||
|
||||
s.gpu_infos.append(self)
|
||||
|
||||
|
|
@ -126,13 +128,14 @@ class A6xxGPUInfo(GPUInfo):
|
|||
into distinct sub-generations. The template parameter avoids
|
||||
duplication of parameters that are unique to the sub-generation.
|
||||
"""
|
||||
def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, magic_regs):
|
||||
def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, cs_shared_mem_size, magic_regs):
|
||||
super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4,
|
||||
tile_align_w = tile_align_w,
|
||||
tile_align_h = tile_align_h,
|
||||
tile_max_w = 1024, # max_bitfield_val(5, 0, 5)
|
||||
tile_max_h = max_bitfield_val(14, 8, 4),
|
||||
num_vsc_pipes = num_vsc_pipes)
|
||||
num_vsc_pipes = num_vsc_pipes,
|
||||
cs_shared_mem_size = cs_shared_mem_size)
|
||||
# The # of SP cores seems to always match # of CCU
|
||||
self.num_sp_cores = num_ccu
|
||||
self.num_ccu = num_ccu
|
||||
|
|
@ -174,6 +177,7 @@ add_gpus([
|
|||
tile_max_w = 512,
|
||||
tile_max_h = ~0, # TODO
|
||||
num_vsc_pipes = 8,
|
||||
cs_shared_mem_size = 0,
|
||||
))
|
||||
|
||||
add_gpus([
|
||||
|
|
@ -188,6 +192,7 @@ add_gpus([
|
|||
tile_max_w = 992, # max_bitfield_val(4, 0, 5)
|
||||
tile_max_h = max_bitfield_val(9, 5, 5),
|
||||
num_vsc_pipes = 8,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
))
|
||||
|
||||
add_gpus([
|
||||
|
|
@ -201,6 +206,7 @@ add_gpus([
|
|||
tile_max_w = 1024, # max_bitfield_val(4, 0, 5)
|
||||
tile_max_h = max_bitfield_val(9, 5, 5),
|
||||
num_vsc_pipes = 8,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
))
|
||||
|
||||
add_gpus([
|
||||
|
|
@ -218,6 +224,7 @@ add_gpus([
|
|||
tile_max_w = 1024, # max_bitfield_val(7, 0, 5)
|
||||
tile_max_h = max_bitfield_val(16, 9, 5),
|
||||
num_vsc_pipes = 16,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
))
|
||||
|
||||
# a6xx can be divided into distinct sub-generations, where certain device-
|
||||
|
|
@ -307,6 +314,7 @@ add_gpus([
|
|||
tile_align_w = 32,
|
||||
tile_align_h = 32,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 0,
|
||||
TPL1_DBG_ECO_CNTL = 0x00108000,
|
||||
|
|
@ -333,6 +341,7 @@ add_gpus([
|
|||
tile_align_w = 32,
|
||||
tile_align_h = 16,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 0,
|
||||
TPL1_DBG_ECO_CNTL = 0x01008000,
|
||||
|
|
@ -359,6 +368,7 @@ add_gpus([
|
|||
tile_align_w = 32,
|
||||
tile_align_h = 16,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 1,
|
||||
TPL1_DBG_ECO_CNTL = 0x00108000,
|
||||
|
|
@ -385,6 +395,7 @@ add_gpus([
|
|||
tile_align_w = 32,
|
||||
tile_align_h = 16,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 1,
|
||||
TPL1_DBG_ECO_CNTL = 0x00008000,
|
||||
|
|
@ -411,6 +422,7 @@ add_gpus([
|
|||
tile_align_w = 64,
|
||||
tile_align_h = 32,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 3,
|
||||
TPL1_DBG_ECO_CNTL = 0x00108000,
|
||||
|
|
@ -437,6 +449,7 @@ add_gpus([
|
|||
tile_align_w = 96,
|
||||
tile_align_h = 16,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 2,
|
||||
# this seems to be a chicken bit that fixes cubic filtering:
|
||||
|
|
@ -468,6 +481,7 @@ add_gpus([
|
|||
tile_align_w = 32,
|
||||
tile_align_h = 16,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 1,
|
||||
TPL1_DBG_ECO_CNTL = 0x05008000,
|
||||
|
|
@ -494,6 +508,7 @@ add_gpus([
|
|||
tile_align_w = 96,
|
||||
tile_align_h = 16,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 2,
|
||||
TPL1_DBG_ECO_CNTL = 0x05008000,
|
||||
|
|
@ -520,6 +535,7 @@ add_gpus([
|
|||
tile_align_w = 64,
|
||||
tile_align_h = 32,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict(
|
||||
PC_POWER_CNTL = 7,
|
||||
TPL1_DBG_ECO_CNTL = 0x01008000,
|
||||
|
|
@ -548,6 +564,7 @@ add_gpus([
|
|||
tile_align_w = 64,
|
||||
tile_align_h = 32,
|
||||
num_vsc_pipes = 32,
|
||||
cs_shared_mem_size = 32 * 1024,
|
||||
magic_regs = dict()
|
||||
))
|
||||
|
||||
|
|
|
|||
|
|
@ -145,8 +145,6 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
|||
compiler->is_64bit = fd_dev_64b(dev_id);
|
||||
compiler->options = *options;
|
||||
|
||||
/* All known GPU's have 32k local memory (aka shared) */
|
||||
compiler->local_mem_size = 32 * 1024;
|
||||
/* TODO see if older GPU's were different here */
|
||||
compiler->branchstack_size = 64;
|
||||
compiler->wave_granularity = 2;
|
||||
|
|
@ -156,6 +154,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
|||
|
||||
const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id);
|
||||
|
||||
compiler->local_mem_size = dev_info->cs_shared_mem_size;
|
||||
|
||||
if (compiler->gen >= 6) {
|
||||
compiler->samgq_workaround = true;
|
||||
/* a6xx split the pipeline state into geometry and fragment state, in
|
||||
|
|
|
|||
|
|
@ -1092,7 +1092,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
|
|||
.maxFragmentOutputAttachments = 8,
|
||||
.maxFragmentDualSrcAttachments = 1,
|
||||
.maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2,
|
||||
.maxComputeSharedMemorySize = 32768,
|
||||
.maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size,
|
||||
.maxComputeWorkGroupCount = { 65535, 65535, 65535 },
|
||||
.maxComputeWorkGroupInvocations = 2048,
|
||||
.maxComputeWorkGroupSize = { 1024, 1024, 1024 },
|
||||
|
|
|
|||
|
|
@ -823,7 +823,7 @@ fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type,
|
|||
RET((uint64_t[]){screen->ram_size});
|
||||
|
||||
case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
|
||||
RET((uint64_t[]){32768});
|
||||
RET((uint64_t[]){screen->info->cs_shared_mem_size});
|
||||
|
||||
case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
|
||||
case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue