diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index defd43a2c40..5dda81a24bf 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -62,135 +62,136 @@ struct fd_dev_info { uint32_t num_ccu; }; - union { + struct { + uint32_t reg_size_vec4; + + /* The size (in instrlen units (128 bytes)) of instruction cache where + * we preload a shader. Loading more than this could trigger a hang + * on gen3 and later. + */ + uint32_t instr_cache_size; + + bool has_hw_multiview; + + bool has_fs_tex_prefetch; + + /* Whether the PC_MULTIVIEW_MASK register exists. */ + bool supports_multiview_mask; + + /* info for setting RB_CCU_CNTL */ + bool concurrent_resolve; + bool has_z24uint_s8uint; + + bool tess_use_shared; + + /* Does the hw support GL_QCOM_shading_rate? */ + bool has_shading_rate; + + /* newer a6xx allows using 16-bit descriptor for both 16-bit + * and 32-bit access + */ + bool storage_16bit; + + /* The latest known a630_sqe.fw fails to wait for WFI before + * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI, + * so we have to fall back to CP_WAIT_FOR_ME except for a650 + * which has a fixed firmware. + * + * TODO: There may be newer a630_sqe.fw released in the future + * which fixes this, if so we should detect it and avoid this + * workaround. Once we have uapi to query fw version, we can + * replace this with minimum fw version. + */ + bool indirect_draw_wfm_quirk; + + /* On some GPUs, the depth test needs to be enabled when the + * depth bounds test is enabled and the depth attachment uses UBWC. + */ + bool depth_bounds_require_depth_test_quirk; + + bool has_tex_filter_cubic; + bool has_separate_chroma_filter; + + bool has_sample_locations; + + /* The firmware on newer a6xx drops CP_REG_WRITE support as we + * can now use direct register writes for these regs. + */ + bool has_cp_reg_write; + + bool has_8bpp_ubwc; + + bool has_lpac; + + bool has_getfiberid; + + bool has_dp2acc; + bool has_dp4acc; + + /* LRZ fast-clear works on all gens, however blob disables it on + * gen1 and gen2. We also elect to disable fast-clear on these gens + * because for close to none gains it adds complexity and seem to work + * a bit differently from gen3+. Which creates at least one edge case: + * if first draw which uses LRZ fast-clear doesn't lock LRZ direction + * the fast-clear value is undefined. For details see + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829 + */ + bool enable_lrz_fast_clear; + bool has_lrz_dir_tracking; + bool lrz_track_quirk; + + /* Some generations have a bit to add the multiview index to the + * viewport index, which lets us implement different scaling for + * different views. + */ + bool has_per_view_viewport; + bool has_gmem_fast_clear; + + /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches + * in sysmem rendering. */ + uint32_t sysmem_per_ccu_cache_size; + /* Per CCU GMEM amount reserved for color cache used by GMEM resolves + * which require color cache (non-BLIT event case). + * The size is expressed as a fraction of ccu cache used by sysmem + * rendering. If a GMEM resolve requires color cache, the driver needs + * to make sure it will not overwrite pixel data in GMEM that is still + * needed. + */ + /* see enum a6xx_ccu_color_cache_size */ + uint32_t gmem_ccu_color_cache_fraction; + + /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */ + uint32_t prim_alloc_threshold; + + uint32_t vs_max_inputs_count; + + bool supports_double_threadsize; + + bool has_sampler_minmax; struct { - uint32_t reg_size_vec4; + uint32_t PC_POWER_CNTL; + uint32_t TPL1_DBG_ECO_CNTL; + uint32_t GRAS_DBG_ECO_CNTL; + uint32_t SP_CHICKEN_BITS; + uint32_t UCHE_CLIENT_PF; + uint32_t PC_MODE_CNTL; + uint32_t SP_DBG_ECO_CNTL; + uint32_t RB_DBG_ECO_CNTL; + uint32_t RB_DBG_ECO_CNTL_blit; + uint32_t HLSQ_DBG_ECO_CNTL; + uint32_t RB_UNKNOWN_8E01; + uint32_t VPC_DBG_ECO_CNTL; + uint32_t UCHE_UNKNOWN_0E12; + } magic; - /* The size (in instrlen units (128 bytes)) of instruction cache where - * we preload a shader. Loading more than this could trigger a hang - * on gen3 and later. - */ - uint32_t instr_cache_size; + struct { + uint32_t reg; + uint32_t value; + } magic_raw[32]; + } a6xx; - bool has_hw_multiview; - - bool has_fs_tex_prefetch; - - /* Whether the PC_MULTIVIEW_MASK register exists. */ - bool supports_multiview_mask; - - /* info for setting RB_CCU_CNTL */ - bool concurrent_resolve; - bool has_z24uint_s8uint; - - bool tess_use_shared; - - /* Does the hw support GL_QCOM_shading_rate? */ - bool has_shading_rate; - - /* newer a6xx allows using 16-bit descriptor for both 16-bit - * and 32-bit access - */ - bool storage_16bit; - - /* The latest known a630_sqe.fw fails to wait for WFI before - * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI, - * so we have to fall back to CP_WAIT_FOR_ME except for a650 - * which has a fixed firmware. - * - * TODO: There may be newer a630_sqe.fw released in the future - * which fixes this, if so we should detect it and avoid this - * workaround. Once we have uapi to query fw version, we can - * replace this with minimum fw version. - */ - bool indirect_draw_wfm_quirk; - - /* On some GPUs, the depth test needs to be enabled when the - * depth bounds test is enabled and the depth attachment uses UBWC. - */ - bool depth_bounds_require_depth_test_quirk; - - bool has_tex_filter_cubic; - bool has_separate_chroma_filter; - - bool has_sample_locations; - - /* The firmware on newer a6xx drops CP_REG_WRITE support as we - * can now use direct register writes for these regs. - */ - bool has_cp_reg_write; - - bool has_8bpp_ubwc; - - bool has_lpac; - - bool has_getfiberid; - - bool has_dp2acc; - bool has_dp4acc; - - /* LRZ fast-clear works on all gens, however blob disables it on - * gen1 and gen2. We also elect to disable fast-clear on these gens - * because for close to none gains it adds complexity and seem to work - * a bit differently from gen3+. Which creates at least one edge case: - * if first draw which uses LRZ fast-clear doesn't lock LRZ direction - * the fast-clear value is undefined. For details see - * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829 - */ - bool enable_lrz_fast_clear; - bool has_lrz_dir_tracking; - bool lrz_track_quirk; - - /* Some generations have a bit to add the multiview index to the - * viewport index, which lets us implement different scaling for - * different views. - */ - bool has_per_view_viewport; - bool has_gmem_fast_clear; - - /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches - * in sysmem rendering. */ - uint32_t sysmem_per_ccu_cache_size; - /* Per CCU GMEM amount reserved for color cache used by GMEM resolves - * which require color cache (non-BLIT event case). - * The size is expressed as a fraction of ccu cache used by sysmem - * rendering. If a GMEM resolve requires color cache, the driver needs - * to make sure it will not overwrite pixel data in GMEM that is still - * needed. - */ - /* see enum a6xx_ccu_color_cache_size */ - uint32_t gmem_ccu_color_cache_fraction; - - /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */ - uint32_t prim_alloc_threshold; - - uint32_t vs_max_inputs_count; - - bool supports_double_threadsize; - - bool has_sampler_minmax; - struct { - uint32_t PC_POWER_CNTL; - uint32_t TPL1_DBG_ECO_CNTL; - uint32_t GRAS_DBG_ECO_CNTL; - uint32_t SP_CHICKEN_BITS; - uint32_t UCHE_CLIENT_PF; - uint32_t PC_MODE_CNTL; - uint32_t SP_DBG_ECO_CNTL; - uint32_t RB_DBG_ECO_CNTL; - uint32_t RB_DBG_ECO_CNTL_blit; - uint32_t HLSQ_DBG_ECO_CNTL; - uint32_t RB_UNKNOWN_8E01; - uint32_t VPC_DBG_ECO_CNTL; - uint32_t UCHE_UNKNOWN_0E12; - } magic; - - struct { - uint32_t reg; - uint32_t value; - } magic_raw[32]; - } a6xx; - }; + struct { + } a7xx; }; struct fd_dev_id { diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index 07eb24409b7..04695d30cff 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -158,6 +158,8 @@ class A6xxGPUInfo(GPUInfo): self.num_ccu = num_ccu self.a6xx = Struct() + self.a7xx = Struct() + self.a6xx.magic = Struct() for name, val in magic_regs.items(): @@ -183,10 +185,10 @@ class A6xxGPUInfo(GPUInfo): self.a6xx.vs_max_inputs_count = 32 - for name, val in template.items(): - if name == "magic": # handled above - continue - setattr(self.a6xx, name, val) + templates = template if type(template) is list else [template] + for template in templates: + template.apply_props(self) + def __str__(self): return super(A6xxGPUInfo, self).__str__().replace('[', '{').replace("]", "}") @@ -296,12 +298,27 @@ add_gpus([ fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd. )) + +class A6XXProps(dict): + def apply_props(self, gpu_info): + for name, val in self.items(): + if name == "magic": + continue + setattr(gpu_info.a6xx, name, val) + + +class A7XXProps(dict): + def apply_props(self, gpu_info): + for name, val in self.items(): + setattr(gpu_info.a7xx, name, val) + + # a6xx can be divided into distinct sub-generations, where certain device- # info parameters are keyed to the sub-generation. These templates reduce # the copypaste # a615, a616, a618, a619, a620 and a630: -a6xx_gen1 = dict( +a6xx_gen1 = A6XXProps( reg_size_vec4 = 96, instr_cache_size = 64, concurrent_resolve = False, @@ -311,7 +328,7 @@ a6xx_gen1 = dict( ) # a605, a608, a610, 612 -a6xx_gen1_low = {**a6xx_gen1, **dict( +a6xx_gen1_low = A6XXProps({**a6xx_gen1, **A6XXProps( has_gmem_fast_clear = False, reg_size_vec4 = 48, has_hw_multiview = False, @@ -321,10 +338,10 @@ a6xx_gen1_low = {**a6xx_gen1, **dict( gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value, vs_max_inputs_count = 16, supports_double_threadsize = False, -)} +)}) # a640, a680: -a6xx_gen2 = dict( +a6xx_gen2 = A6XXProps( reg_size_vec4 = 96, instr_cache_size = 64, # TODO supports_multiview_mask = True, @@ -337,7 +354,7 @@ a6xx_gen2 = dict( ) # a650: -a6xx_gen3 = dict( +a6xx_gen3 = A6XXProps( reg_size_vec4 = 64, # Blob limits it to 128 but we hang with 128 instr_cache_size = 127, @@ -358,7 +375,7 @@ a6xx_gen3 = dict( ) # a635, a660: -a6xx_gen4 = dict( +a6xx_gen4 = A6XXProps( reg_size_vec4 = 64, # Blob limits it to 128 but we hang with 128 instr_cache_size = 127, @@ -685,12 +702,16 @@ add_gpus([ ) )) +a7xx_730 = A7XXProps() + +a7xx_740 = A7XXProps() + add_gpus([ GPUId(chip_id=0x07030001, name="FD730"), # KGSL, no speedbin data GPUId(chip_id=0xffff07030001, name="FD730"), # Default no-speedbin fallback ], A6xxGPUInfo( CHIP.A7XX, - a6xx_gen4, + [a6xx_gen4, a7xx_730], num_ccu = 4, tile_align_w = 64, tile_align_h = 32, @@ -746,7 +767,7 @@ add_gpus([ GPUId(chip_id=0xffff43050a01, name="FD740"), # Default no-speedbin fallback ], A6xxGPUInfo( CHIP.A7XX, - a6xx_gen4, + [a6xx_gen4, a7xx_740], num_ccu = 6, tile_align_w = 64, tile_align_h = 32,