ac/gpu_info: add some more flags to ac_cu_info

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38701>
This commit is contained in:
Daniel Schürmann 2025-11-27 11:43:06 +01:00 committed by Marge Bot
parent f791e46c47
commit f7c4aa48a0
9 changed files with 57 additions and 49 deletions

View file

@ -299,6 +299,23 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_
cu_info->max_vgpr_alloc = 256;
cu_info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4;
/* Flags */
cu_info->has_lds_bank_count_16 = info->family == CHIP_KABINI || info->family == CHIP_STONEY;
cu_info->has_sram_ecc_enabled = info->family == CHIP_VEGA20 || info->family == CHIP_MI100 ||
info->family == CHIP_MI200 || info->family == CHIP_GFX940;
cu_info->has_fast_fma32 = info->gfx_level >= GFX9 || info->family == CHIP_TAHITI ||
info->family == CHIP_HAWAII || info->family == CHIP_CARRIZO;
cu_info->has_fma_mix = info->gfx_level >= GFX10 ||
info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20 ||
info->family == CHIP_MI100 || info->family == CHIP_MI200 ||
info->family == CHIP_GFX940;
cu_info->has_packed_math_16bit = info->gfx_level >= GFX9;
cu_info->has_accelerated_dot_product =
info->family == CHIP_VEGA20 ||
(info->family >= CHIP_MI100 && info->family != CHIP_NAVI10 && info->family != CHIP_GFX1013);
/* GFX1013 is GFX10 plus ray tracing instructions */
cu_info->has_image_bvh_intersect_ray = info->gfx_level >= GFX10_3 || info->family == CHIP_GFX1013;
}
enum ac_query_gpu_info_result
@ -968,16 +985,6 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
info->has_out_of_order_rast =
info->gfx_level >= GFX8 && info->gfx_level <= GFX9 && info->max_se >= 2;
/* Whether chips support double rate packed math instructions. */
info->has_packed_math_16bit = info->gfx_level >= GFX9;
/* Whether chips support dot product instructions. A subset of these support a smaller
* instruction encoding which accumulates with the destination.
*/
info->has_accelerated_dot_product =
info->family == CHIP_VEGA20 ||
(info->family >= CHIP_MI100 && info->family != CHIP_NAVI10 && info->family != CHIP_GFX1013);
/* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */
info->has_load_ctx_reg_pkt =
info->gfx_level >= GFX9 || (info->gfx_level >= GFX8 && info->me_fw_feature >= 41);
@ -1608,15 +1615,11 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se;
info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size;
/* GFX1013 is GFX10 plus ray tracing instructions */
info->has_image_bvh_intersect_ray = info->gfx_level >= GFX10_3 ||
info->family == CHIP_GFX1013;
if (info->gfx_level >= GFX12)
info->rt_ip_version = RT_3_1;
else if (info->gfx_level >= GFX11)
info->rt_ip_version = RT_2_0;
else if (info->has_image_bvh_intersect_ray)
else if (info->cu_info.has_image_bvh_intersect_ray)
info->rt_ip_version = RT_1_1;
set_custom_cu_en_mask(info);

View file

@ -43,6 +43,25 @@ struct ac_cu_info {
uint32_t min_wave64_vgpr_alloc;
uint32_t max_vgpr_alloc;
uint32_t wave64_vgpr_alloc_granularity;
/* Flags */
bool has_lds_bank_count_16 : 1;
bool has_sram_ecc_enabled : 1;
bool has_fast_fma32 : 1;
/* Whether chips support fused v_fma_mix* instructions.
* Otherwise, unfused v_mad_mix* is available on GFX9.
*/
bool has_fma_mix : 1;
/* Whether chips support double rate packed math instructions. */
bool has_packed_math_16bit : 1;
/* Whether chips support dot product instructions. A subset of these support a smaller
* instruction encoding which accumulates with the destination.
*/
bool has_accelerated_dot_product : 1;
/* Device supports hardware-accelerated raytracing using
* image_bvh*_intersect_ray instructions
*/
bool has_image_bvh_intersect_ray : 1;
};
struct radeon_info {
@ -101,8 +120,6 @@ struct radeon_info {
bool rbplus_allowed; /* if RB+ is allowed */
bool has_load_ctx_reg_pkt;
bool has_out_of_order_rast;
bool has_packed_math_16bit;
bool has_accelerated_dot_product;
bool cpdma_prefetch_writes_memory;
bool has_gfx9_scissor_bug;
bool has_htile_stencil_mipmap_bug;
@ -350,11 +367,6 @@ struct radeon_info {
uint32_t sdma_csa_size;
uint32_t sdma_csa_alignment;
} fw_based_mcbp;
/* Device supports hardware-accelerated raytracing using
* image_bvh*_intersect_ray instructions
*/
bool has_image_bvh_intersect_ray;
};
enum ac_query_gpu_info_result {

View file

@ -60,7 +60,6 @@ ac_null_device_create(struct radeon_info *gpu_info, const char *family)
gpu_info->max_render_backends = pci_ids[gpu_info->family].num_render_backends;
gpu_info->has_dedicated_vram = pci_ids[gpu_info->family].has_dedicated_vram;
gpu_info->has_packed_math_16bit = gpu_info->gfx_level >= GFX9;
gpu_info->has_cb_lt16bit_int_clamp_bug = gpu_info->gfx_level <= GFX7 &&
gpu_info->family != CHIP_HAWAII;
@ -70,12 +69,6 @@ ac_null_device_create(struct radeon_info *gpu_info, const char *family)
gpu_info->has_distributed_tess =
gpu_info->gfx_level >= GFX10 || (gpu_info->gfx_level >= GFX8 && gpu_info->max_se >= 2);
gpu_info->has_accelerated_dot_product =
gpu_info->family == CHIP_VEGA20 ||
(gpu_info->family >= CHIP_MI100 && gpu_info->family != CHIP_NAVI10 && gpu_info->family != CHIP_GFX1013);
gpu_info->has_image_bvh_intersect_ray = gpu_info->gfx_level >= GFX10_3 || gpu_info->family == CHIP_GFX1013;
gpu_info->address32_hi = gpu_info->gfx_level >= GFX9 ? 0xffff8000u : 0x0;
gpu_info->has_rbplus = gpu_info->family == CHIP_STONEY || gpu_info->gfx_level >= GFX9;

View file

@ -73,13 +73,13 @@ void ac_nir_set_options(struct radeon_info *info, bool use_llvm,
options->has_ford_funord = true;
options->has_fsub = true;
options->has_isub = true;
options->has_sdot_4x8 = info->has_accelerated_dot_product;
options->has_sudot_4x8 = info->has_accelerated_dot_product && info->gfx_level >= GFX11;
options->has_udot_4x8 = info->has_accelerated_dot_product;
options->has_sdot_4x8_sat = info->has_accelerated_dot_product;
options->has_sudot_4x8_sat = info->has_accelerated_dot_product && info->gfx_level >= GFX11;
options->has_udot_4x8_sat = info->has_accelerated_dot_product;
options->has_dot_2x16 = info->has_accelerated_dot_product && info->gfx_level < GFX11;
options->has_sdot_4x8 = info->cu_info.has_accelerated_dot_product;
options->has_sudot_4x8 = info->cu_info.has_accelerated_dot_product && info->gfx_level >= GFX11;
options->has_udot_4x8 = info->cu_info.has_accelerated_dot_product;
options->has_sdot_4x8_sat = info->cu_info.has_accelerated_dot_product;
options->has_sudot_4x8_sat = info->cu_info.has_accelerated_dot_product && info->gfx_level >= GFX11;
options->has_udot_4x8_sat = info->cu_info.has_accelerated_dot_product;
options->has_dot_2x16 = info->cu_info.has_accelerated_dot_product && info->gfx_level < GFX11;
options->has_bfdot2_bfadd = info->gfx_level >= GFX12;
options->has_find_msb_rev = true;
options->has_pack_32_4x8 = true;
@ -103,7 +103,7 @@ void ac_nir_set_options(struct radeon_info *info, bool use_llvm,
options->optimize_quad_vote_to_reduce = !use_llvm;
options->lower_fisnormal = true;
options->support_16bit_alu = info->gfx_level >= GFX8;
options->vectorize_vec2_16bit = info->has_packed_math_16bit;
options->vectorize_vec2_16bit = info->cu_info.has_packed_math_16bit;
options->discard_is_demote = true;
options->optimize_sample_mask_in = true;
options->optimize_load_front_face_fsign = true;

View file

@ -221,8 +221,8 @@ static void run_subtest(subtest *st, bool print = false)
struct radeon_info info = {};
info.gfx_level = st->gfx_level;
info.has_packed_math_16bit = true;
info.has_accelerated_dot_product = true;
info.cu_info.has_packed_math_16bit = true;
info.cu_info.has_accelerated_dot_product = true;
nir_shader_compiler_options options = {};
ac_nir_set_options(&info, st->use_llvm, &options);

View file

@ -946,7 +946,7 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
nir_def *global_bvh_node = nir_iadd(b, nir_load_deref(b, args->vars.bvh_base), nir_u2u64(b, bvh_node));
bool has_result = false;
if (pdev->info.has_image_bvh_intersect_ray && !radv_emulate_rt(pdev)) {
if (pdev->info.cu_info.has_image_bvh_intersect_ray && !radv_emulate_rt(pdev)) {
nir_store_var(
b, intrinsic_result,
nir_bvh64_intersect_ray_amd(b, 32, desc, nir_unpack_64_2x32(b, global_bvh_node),

View file

@ -179,7 +179,7 @@ radv_shader_fp16_enabled(const struct radv_physical_device *pdev)
/* GFX8 supports fp16, but not double rate packed math. We don't enable
* that by default because it can sometimes hurt perf.
*/
return pdev->info.has_packed_math_16bit ||
return pdev->info.cu_info.has_packed_math_16bit ||
(pdev->info.gfx_level == GFX8 && instance->drirc.features.expose_float16_gfx8);
}
@ -193,7 +193,7 @@ radv_host_image_copy_enabled(const struct radv_physical_device *pdev)
bool
radv_enable_rt(const struct radv_physical_device *pdev)
{
if (!pdev->info.has_image_bvh_intersect_ray && !radv_emulate_rt(pdev))
if (!pdev->info.cu_info.has_image_bvh_intersect_ray && !radv_emulate_rt(pdev))
return false;
if (pdev->use_llvm)
@ -210,7 +210,7 @@ radv_emulate_rt(const struct radv_physical_device *pdev)
return true;
/* Do not force emulated RT on GPUs that have native support. */
return !pdev->info.has_image_bvh_intersect_ray && instance->drirc.features.emulate_rt;
return !pdev->info.cu_info.has_image_bvh_intersect_ray && instance->drirc.features.emulate_rt;
}
bool
@ -830,8 +830,8 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
.AMD_device_coherent_memory = pdev->info.has_l2_uncached,
.AMD_draw_indirect_count = true,
.AMD_gcn_shader = true,
.AMD_gpu_shader_half_float = pdev->info.has_packed_math_16bit,
.AMD_gpu_shader_int16 = pdev->info.has_packed_math_16bit,
.AMD_gpu_shader_half_float = pdev->info.cu_info.has_packed_math_16bit,
.AMD_gpu_shader_int16 = pdev->info.cu_info.has_packed_math_16bit,
.AMD_memory_overallocation_behavior = true,
.AMD_mixed_attachment_samples = true,
.AMD_rasterization_order = pdev->info.has_out_of_order_rast,
@ -931,7 +931,7 @@ radv_physical_device_get_features(const struct radv_physical_device *pdev, struc
.storageBuffer16BitAccess = true,
.uniformAndStorageBuffer16BitAccess = true,
.storagePushConstant16 = true,
.storageInputOutput16 = pdev->info.has_packed_math_16bit,
.storageInputOutput16 = pdev->info.cu_info.has_packed_math_16bit,
.multiview = true,
.multiviewGeometryShader = true,
.multiviewTessellationShader = true,
@ -1589,7 +1589,7 @@ radv_get_physical_device_properties(struct radv_physical_device *pdev)
radv_taskmesh_enabled(pdev) ? VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_TASK_BIT_EXT : 0;
VkShaderStageFlags rt_stages = radv_enable_rt(pdev) ? RADV_RT_STAGE_BITS : 0;
bool accel_dot = pdev->info.has_accelerated_dot_product;
bool accel_dot = pdev->info.cu_info.has_accelerated_dot_product;
bool gfx11plus = pdev->info.gfx_level >= GFX11;
VkExtent2D vrs_texel_extent = radv_vrs_attachment_enabled(pdev) ? (VkExtent2D){8, 8} : (VkExtent2D){0, 0};

View file

@ -821,7 +821,7 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
options->lower_uniforms_to_ubo = true;
options->lower_to_scalar = true;
options->lower_to_scalar_filter =
sscreen->info.has_packed_math_16bit ? si_alu_to_scalar_packed_math_filter : NULL;
sscreen->info.cu_info.has_packed_math_16bit ? si_alu_to_scalar_packed_math_filter : NULL;
options->max_unroll_iterations = 128;
options->max_unroll_iterations_aggressive = 128;
/* For OpenGL, rounding mode is undefined. We want fast packing with v_cvt_pkrtz_f16,

View file

@ -122,7 +122,7 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool has_arr
if (nir->info.stage == MESA_SHADER_FRAGMENT)
NIR_PASS(_, nir, nir_opt_move_discards_to_top);
if (sscreen->info.has_packed_math_16bit)
if (sscreen->info.cu_info.has_packed_math_16bit)
NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, NULL);
} while (progress);