From d82eda72a1fe3932615b3fb16391e84de0431e6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 14 Apr 2025 20:05:21 -0400 Subject: [PATCH] ac/gpu_info: move HS info into radeon_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Timur Kristóf Part-of: --- src/amd/common/ac_gpu_info.c | 130 +++++++++--------- src/amd/common/ac_gpu_info.h | 16 +-- src/amd/vulkan/radv_physical_device.c | 1 - src/amd/vulkan/radv_physical_device.h | 1 - src/amd/vulkan/radv_queue.c | 19 +-- .../drivers/radeonsi/si_nir_lower_abi.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.c | 2 - src/gallium/drivers/radeonsi/si_pipe.h | 1 - .../drivers/radeonsi/si_state_shaders.cpp | 12 +- 9 files changed, 86 insertions(+), 98 deletions(-) diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 7e45ae20e86..1698775dabf 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -1733,6 +1733,69 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, info->has_set_sh_pairs_packed = info->register_shadowing_required; } + bool double_offchip_wg = info->gfx_level >= GFX7 && + info->family != CHIP_CARRIZO && + info->family != CHIP_STONEY; + /* This is the size of all TCS outputs in memory per workgroup. + * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K. + */ + unsigned wg_size_in_dwords = info->family == CHIP_HAWAII ? 4096 : 8192; + unsigned wg_size_enum; + unsigned num_workgroups_per_se; + + switch (wg_size_in_dwords) { + case 8192: + wg_size_enum = V_03093C_X_8K_DWORDS; + break; + case 4096: + wg_size_enum = V_03093C_X_4K_DWORDS; + break; + case 2048: + wg_size_enum = V_03093C_X_2K_DWORDS; + break; + case 1024: + wg_size_enum = V_03093C_X_1K_DWORDS; + break; + default: + unreachable("invalid TCS workgroup size"); + } + + /* Vega10 should limit num_workgroups to 508 (127 per SE) + * Gfx7 should limit num_workgroups to 508 (127 per SE) + * Gfx6 should limit num_workgroups to 126 (63 per SE) + */ + if (info->gfx_level >= GFX11) { + num_workgroups_per_se = 256; + } else if (info->gfx_level >= GFX10) { + num_workgroups_per_se = 128; + } else if (info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) { + num_workgroups_per_se = double_offchip_wg ? 128 : 64; + } else { + num_workgroups_per_se = double_offchip_wg ? 127 : 63; + } + + unsigned num_workgroups = num_workgroups_per_se * info->max_se; + + if (info->gfx_level >= GFX11) { + /* OFFCHIP_BUFFERING is per SE. */ + info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups_per_se - 1) | + S_03093C_OFFCHIP_GRANULARITY_GFX103(wg_size_enum); + } else if (info->gfx_level >= GFX10_3) { + info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups - 1) | + S_03093C_OFFCHIP_GRANULARITY_GFX103(wg_size_enum); + } else if (info->gfx_level >= GFX7) { + info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups - + (info->gfx_level >= GFX8 ? 1 : 0)) | + S_03093C_OFFCHIP_GRANULARITY_GFX7(wg_size_enum); + } else { + info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) | + S_0089B0_OFFCHIP_GRANULARITY(wg_size_enum); + } + + info->tess_offchip_ring_size = num_workgroups * wg_size_in_dwords * 4; + info->tess_factor_ring_size = 48 * 1024 * info->max_se; + info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size; + /* GFX1013 is GFX10 plus ray tracing instructions */ info->has_image_bvh_intersect_ray = info->gfx_level >= GFX10_3 || info->family == CHIP_GFX1013; @@ -2410,73 +2473,6 @@ ac_get_compute_resource_limits(const struct radeon_info *info, unsigned waves_pe return compute_resource_limits; } -void ac_get_hs_info(const struct radeon_info *info, - struct ac_hs_info *hs) -{ - bool double_offchip_wg = info->gfx_level >= GFX7 && - info->family != CHIP_CARRIZO && - info->family != CHIP_STONEY; - /* This is the size of all TCS outputs in memory per workgroup. - * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K. - */ - unsigned wg_size_in_dwords = info->family == CHIP_HAWAII ? 4096 : 8192; - unsigned wg_size_enum; - unsigned num_workgroups_per_se; - - switch (wg_size_in_dwords) { - case 8192: - wg_size_enum = V_03093C_X_8K_DWORDS; - break; - case 4096: - wg_size_enum = V_03093C_X_4K_DWORDS; - break; - case 2048: - wg_size_enum = V_03093C_X_2K_DWORDS; - break; - case 1024: - wg_size_enum = V_03093C_X_1K_DWORDS; - break; - default: - unreachable("invalid TCS workgroup size"); - } - - /* Vega10 should limit num_workgroups to 508 (127 per SE) - * Gfx7 should limit num_workgroups to 508 (127 per SE) - * Gfx6 should limit num_workgroups to 126 (63 per SE) - */ - if (info->gfx_level >= GFX11) { - num_workgroups_per_se = 256; - } else if (info->gfx_level >= GFX10) { - num_workgroups_per_se = 128; - } else if (info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) { - num_workgroups_per_se = double_offchip_wg ? 128 : 64; - } else { - num_workgroups_per_se = double_offchip_wg ? 127 : 63; - } - - unsigned num_workgroups = num_workgroups_per_se * info->max_se; - - if (info->gfx_level >= GFX11) { - /* OFFCHIP_BUFFERING is per SE. */ - hs->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups_per_se - 1) | - S_03093C_OFFCHIP_GRANULARITY_GFX103(wg_size_enum); - } else if (info->gfx_level >= GFX10_3) { - hs->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups - 1) | - S_03093C_OFFCHIP_GRANULARITY_GFX103(wg_size_enum); - } else if (info->gfx_level >= GFX7) { - hs->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups - - (info->gfx_level >= GFX8 ? 1 : 0)) | - S_03093C_OFFCHIP_GRANULARITY_GFX7(wg_size_enum); - } else { - hs->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) | - S_0089B0_OFFCHIP_GRANULARITY(wg_size_enum); - } - - hs->tess_offchip_ring_size = num_workgroups * wg_size_in_dwords * 4; - hs->tess_factor_ring_size = 48 * 1024 * info->max_se; - hs->total_tess_ring_size = hs->tess_offchip_ring_size + hs->tess_factor_ring_size; -} - static uint16_t get_task_num_entries(enum radeon_family fam) { /* Number of task shader ring entries. Needs to be a power of two. diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index 933e235ec06..df27220c8cd 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -293,6 +293,12 @@ struct radeon_info { uint32_t total_attribute_pos_prim_ring_size; /* GFX11+ */ bool has_attr_ring; + /* Tessellation rings. */ + uint32_t hs_offchip_param; + uint32_t tess_factor_ring_size; + uint32_t tess_offchip_ring_size; + uint32_t total_tess_ring_size; + /* Render backends (color + depth blocks). */ uint32_t r300_num_gb_pipes; uint32_t r300_num_z_pipes; @@ -347,16 +353,6 @@ unsigned ac_get_compute_resource_limits(const struct radeon_info *info, unsigned waves_per_threadgroup, unsigned max_waves_per_sh, unsigned threadgroups_per_cu); -struct ac_hs_info { - uint32_t hs_offchip_param; - uint32_t tess_factor_ring_size; - uint32_t tess_offchip_ring_size; - uint32_t total_tess_ring_size; -}; - -void ac_get_hs_info(const struct radeon_info *info, - struct ac_hs_info *hs); - /* Task rings BO layout information. * This BO is shared between GFX and ACE queues so that the ACE and GFX * firmware can cooperate on task->mesh dispatches and is also used to diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c index f325520fdf9..343579f2f68 100644 --- a/src/amd/vulkan/radv_physical_device.c +++ b/src/amd/vulkan/radv_physical_device.c @@ -2285,7 +2285,6 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm pdev->gs_table_depth = ac_get_gs_table_depth(pdev->info.gfx_level, pdev->info.family); - ac_get_hs_info(&pdev->info, &pdev->hs); ac_get_task_info(&pdev->info, &pdev->task_info); radv_get_binning_settings(pdev, &pdev->binning_settings); diff --git a/src/amd/vulkan/radv_physical_device.h b/src/amd/vulkan/radv_physical_device.h index 0c079b294d1..af0a5475a8d 100644 --- a/src/amd/vulkan/radv_physical_device.h +++ b/src/amd/vulkan/radv_physical_device.h @@ -158,7 +158,6 @@ struct radv_physical_device { uint32_t gs_table_depth; - struct ac_hs_info hs; struct ac_task_info task_info; struct radv_binning_settings binning_settings; diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c index c9b897f2df6..5b110c4dd1f 100644 --- a/src/amd/vulkan/radv_queue.c +++ b/src/amd/vulkan/radv_queue.c @@ -313,10 +313,11 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *desc, struct radeon desc += 8; if (tess_rings_bo) { - radv_set_ring_buffer(pdev, tess_rings_bo, pdev->hs.tess_offchip_ring_size, pdev->hs.tess_factor_ring_size, false, - false, true, 0, 0, &desc[0]); + radv_set_ring_buffer(pdev, tess_rings_bo, pdev->info.tess_offchip_ring_size, pdev->info.tess_factor_ring_size, + false, false, true, 0, 0, &desc[0]); - radv_set_ring_buffer(pdev, tess_rings_bo, 0, pdev->hs.tess_offchip_ring_size, false, false, true, 0, 0, &desc[4]); + radv_set_ring_buffer(pdev, tess_rings_bo, 0, pdev->info.tess_offchip_ring_size, false, false, true, 0, 0, + &desc[4]); } desc += 8; @@ -397,8 +398,8 @@ radv_emit_tess_factor_ring(struct radv_device *device, struct radeon_cmdbuf *cs, if (!tess_rings_bo) return; - tf_ring_size = pdev->hs.tess_factor_ring_size / 4; - tf_va = radv_buffer_get_va(tess_rings_bo) + pdev->hs.tess_offchip_ring_size; + tf_ring_size = pdev->info.tess_factor_ring_size / 4; + tf_va = radv_buffer_get_va(tess_rings_bo) + pdev->info.tess_offchip_ring_size; radv_cs_add_buffer(device->ws, cs, tess_rings_bo); @@ -421,11 +422,11 @@ radv_emit_tess_factor_ring(struct radv_device *device, struct radeon_cmdbuf *cs, radeon_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(tf_va >> 40)); } - radeon_set_uconfig_reg(R_03093C_VGT_HS_OFFCHIP_PARAM, pdev->hs.hs_offchip_param); + radeon_set_uconfig_reg(R_03093C_VGT_HS_OFFCHIP_PARAM, pdev->info.hs_offchip_param); } else { radeon_set_config_reg(R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size)); radeon_set_config_reg(R_0089B8_VGT_TF_MEMORY_BASE, tf_va >> 8); - radeon_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, pdev->hs.hs_offchip_param); + radeon_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, pdev->info.hs_offchip_param); } radeon_end(); @@ -997,11 +998,11 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi } if (!queue->ring_info.tess_rings && needs->tess_rings) { - result = radv_bo_create(device, NULL, pdev->hs.total_tess_ring_size, 256, RADEON_DOMAIN_VRAM, ring_bo_flags, + result = radv_bo_create(device, NULL, pdev->info.total_tess_ring_size, 256, RADEON_DOMAIN_VRAM, ring_bo_flags, RADV_BO_PRIORITY_SCRATCH, 0, true, &tess_rings_bo); if (result != VK_SUCCESS) goto fail; - radv_rmv_log_command_buffer_bo_create(device, tess_rings_bo, 0, 0, pdev->hs.total_tess_ring_size); + radv_rmv_log_command_buffer_bo_create(device, tess_rings_bo, 0, 0, pdev->info.total_tess_ring_size); } if (!queue->ring_info.task_rings && needs->task_rings) { diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 8af33eba7e2..88353c1800b 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -490,7 +490,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s case nir_intrinsic_load_ring_tess_factors_amd: { assert(s->tess_offchip_ring); nir_def *addr = nir_channel(b, s->tess_offchip_ring, 0); - addr = nir_iadd_imm(b, addr, sel->screen->hs.tess_offchip_ring_size); + addr = nir_iadd_imm(b, addr, sel->screen->info.tess_offchip_ring_size); replacement = nir_vector_insert_imm(b, s->tess_offchip_ring, addr, 0); break; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index adafd57b03f..4580fa4a919 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1461,8 +1461,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) si_init_perfcounters(sscreen); - ac_get_hs_info(&sscreen->info, &sscreen->hs); - if (sscreen->debug_flags & DBG(NO_OUT_OF_ORDER)) sscreen->info.has_out_of_order_rast = false; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index ea6a30b0bec..8fe3cf4d170 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -519,7 +519,6 @@ struct si_screen { unsigned pa_sc_raster_config_1; unsigned se_tile_repeat; unsigned gs_table_depth; - struct ac_hs_info hs; unsigned eqaa_force_coverage_samples; unsigned eqaa_force_z_samples; unsigned eqaa_force_color_samples; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 843b65ffb3c..34f4945dde1 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4544,7 +4544,7 @@ void si_init_tess_factor_ring(struct si_context *sctx) SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_DISCARDABLE, PIPE_USAGE_DEFAULT, - sscreen->hs.total_tess_ring_size, + sscreen->info.total_tess_ring_size, 2 * 1024 * 1024); if (!sscreen->tess_rings) { simple_mtx_unlock(&sscreen->tess_ring_lock); @@ -4559,7 +4559,7 @@ void si_init_tess_factor_ring(struct si_context *sctx) SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_DISCARDABLE, PIPE_USAGE_DEFAULT, - sscreen->hs.total_tess_ring_size, + sscreen->info.total_tess_ring_size, 2 * 1024 * 1024); } } @@ -5085,9 +5085,9 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index) struct pipe_resource *tf_ring = sctx->ws->cs_is_secure(&sctx->gfx_cs) ? sscreen->tess_rings_tmz : sscreen->tess_rings; uint64_t factor_va = si_resource(tf_ring)->gpu_address + - sscreen->hs.tess_offchip_ring_size; + sscreen->info.tess_offchip_ring_size; - unsigned tf_ring_size_field = sscreen->hs.tess_factor_ring_size / 4; + unsigned tf_ring_size_field = sscreen->info.tess_factor_ring_size / 4; if (sctx->gfx_level >= GFX11) tf_ring_size_field /= sscreen->info.max_se; @@ -5104,7 +5104,7 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index) if (sctx->gfx_level >= GFX7) { radeon_set_uconfig_reg_seq(R_030938_VGT_TF_RING_SIZE, 3); radeon_emit(S_030938_SIZE(tf_ring_size_field)); /* R_030938_VGT_TF_RING_SIZE */ - radeon_emit(sscreen->hs.hs_offchip_param); /* R_03093C_VGT_HS_OFFCHIP_PARAM */ + radeon_emit(sscreen->info.hs_offchip_param); /* R_03093C_VGT_HS_OFFCHIP_PARAM */ radeon_emit(factor_va >> 8); /* R_030940_VGT_TF_MEMORY_BASE */ if (sctx->gfx_level >= GFX12) @@ -5116,7 +5116,7 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index) } else { radeon_set_config_reg(R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size_field)); radeon_set_config_reg(R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8); - radeon_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, sscreen->hs.hs_offchip_param); + radeon_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, sscreen->info.hs_offchip_param); } radeon_end(); }