radeonsi/gfx: add si_gfx_screen.c

Move the code specific to gfx/compute from radeonsi_screen_create_impl into it.

ac_init_llvm_once has to stay in si_pipe.c because it has to be called very
early to avoid conflicts with u_queue initialisation.

Reviewed-by: David Rosca <david.rosca@amd.com>
Reviewed-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41133>
This commit is contained in:
Pierre-Eric Pelloux-Prayer 2026-04-20 14:50:39 +02:00
parent 5f56a0e057
commit d1c57f742e
6 changed files with 633 additions and 599 deletions

View file

@ -18,6 +18,10 @@ struct si_screen;
struct si_shader;
struct si_shader_selector;
/* si_gfx_screen.c */
MESAPROC bool si_init_gfx_screen(struct si_screen *sscreen) TAILBT;
MESAPROC void si_fini_gfx_screen(struct si_screen *sscreen) TAILV;
/* si_shader_cache.c */
MESAPROC void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
unsigned wave_size, unsigned char ir_blake3_cache_key[BLAKE3_KEY_LEN]) TAILV;

View file

@ -0,0 +1,611 @@
/*
* Copyright 2026 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#include "si_gfx.h"
#include "si_pipe.h"
#include "compiler/nir/nir.h"
#include "ac_shader_util.h"
#include "ac_shadowed_regs.h"
#include "util/disk_cache.h"
#include "aco_interface.h"
#include "util/hex.h"
#include "util/u_cpu_detect.h"
#include <sys/utsname.h>
#include <ctype.h>
#if AMD_LLVM_AVAILABLE
#include "ac_llvm_util.h"
#endif
#include <xf86drm.h>
/* Shader-compiler debug options parsed from the AMD_DEBUG environment
 * variable. The "shader cache" group changes generated code, so those flags
 * participate in shader-cache decisions (see si_disk_cache_create).
 */
static const struct debug_named_value radeonsi_shader_debug_options[] = {
   /* Shader logging options: */
   {"vs", DBG(VS), "Print vertex shaders"},
   {"ps", DBG(PS), "Print pixel shaders"},
   {"gs", DBG(GS), "Print geometry shaders"},
   {"tcs", DBG(TCS), "Print tessellation control shaders"},
   {"tes", DBG(TES), "Print tessellation evaluation shaders"},
   {"cs", DBG(CS), "Print compute shaders"},
   {"ts", DBG(TS), "Print task shaders"},
   {"ms", DBG(MS), "Print mesh shaders"},
   {"initnir", DBG(INIT_NIR), "Print initial input NIR when shaders are created"},
   {"nir", DBG(NIR), "Print final NIR after lowering when shader variants are created"},
   {"initllvm", DBG(INIT_LLVM), "Print initial LLVM IR before optimizations"},
   {"llvm", DBG(LLVM), "Print final LLVM IR"},
   {"initaco", DBG(INIT_ACO), "Print initial ACO IR before optimizations"},
   {"aco", DBG(ACO), "Print final ACO IR"},
   {"asm", DBG(ASM), "Print final shaders in asm"},
   {"stats", DBG(STATS), "Print shader-db stats to stderr"},

   /* Shader compiler options the shader cache should be aware of: */
   {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
   {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
   /* typo fix: "computes shaders" -> "compute shaders" */
   {"w32cs", DBG(W32_CS), "Use Wave32 for compute shaders."},
   {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
   {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
   {"w64cs", DBG(W64_CS), "Use Wave64 for compute shaders."},

   /* Shader compiler options (with no effect on the shader cache): */
   {"checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR"},
   {"mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand"},
   {"nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants."},
   {"usellvm", DBG(USE_LLVM), "Use LLVM as shader compiler when possible"},

   DEBUG_NAMED_VALUE_END /* must be last */
};
/* Query the GS table depth for this chip from the common AMD helpers and
 * cache it on the screen. */
static void si_init_gs_info(struct si_screen *sscreen)
{
   sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.gfx_level, sscreen->info.family);
}
/* Decode 2*length hexadecimal characters from 'in' into 'length' raw bytes
 * written to 'out'. Both lowercase and uppercase hex digits are accepted;
 * 'in' must provide at least 2*length valid hex characters.
 */
static void
parse_hex(char *out, const char *in, unsigned length)
{
   for (unsigned byte = 0; byte < length; ++byte)
      out[byte] = 0;

   for (unsigned i = 0; i < 2 * length; ++i) {
      char c = in[i];
      unsigned nibble;

      if (c <= '9')
         nibble = c - '0';
      else if (c >= 'a')
         nibble = c - 'a' + 10;
      else
         nibble = c - 'A' + 10;

      /* Even positions form the high nibble, odd positions the low one. */
      out[i / 2] |= nibble << ((i & 1) ? 0 : 4);
   }
}
/* Create the on-disk shader cache for this screen.
 *
 * The cache id mixes in the driver build id (or RADEONSI_BUILD_ID_OVERRIDE),
 * the LLVM build when available, and si_screen::use_aco, so caches from
 * different builds or compiler configurations never collide.
 *
 * NOTE(review): this reads sscreen->shader_debug_flags and sscreen->use_aco,
 * so both must be finalized by the caller before this runs — confirm the
 * call order in si_init_gfx_screen.
 */
static void si_disk_cache_create(struct si_screen *sscreen)
{
   /* Don't use the cache if shader dumping is enabled. */
   if (sscreen->shader_debug_flags & DBG_ALL_SHADERS)
      return;

   blake3_hasher ctx;
   unsigned char blake3[BLAKE3_KEY_LEN];
   char cache_id[BLAKE3_HEX_LEN];

   _mesa_blake3_init(&ctx);
#ifdef RADEONSI_BUILD_ID_OVERRIDE
   {
      /* Hash a user-supplied build id instead of the binary's own id. */
      unsigned size = strlen(RADEONSI_BUILD_ID_OVERRIDE) / 2;
      char *data = alloca(size);
      parse_hex(data, RADEONSI_BUILD_ID_OVERRIDE, size);
      _mesa_blake3_update(&ctx, data, size);
   }
#else
   /* Hash the build id of the object containing this function. */
   if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx))
      return;
#endif
#if AMD_LLVM_AVAILABLE
   /* Different LLVM builds generate different code. */
   if (!disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx))
      return;
#endif
   /* NIR options depend on si_screen::use_aco, which affects all shaders, including GLSL
    * compilation.
    */
   _mesa_blake3_update(&ctx, &sscreen->use_aco, sizeof(sscreen->use_aco));
   _mesa_blake3_final(&ctx, blake3);
   mesa_bytes_to_hex(cache_id, blake3, BLAKE3_KEY_LEN);

   sscreen->disk_shader_cache = disk_cache_create(ac_get_family_name(sscreen->info.family),
                                                  cache_id, sscreen->info.address32_hi);
}
/* pipe_screen::set_max_shader_compiler_threads hook: clamp the number of
 * active threads on the high-priority shader compiler queue. */
static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, unsigned max_threads)
{
   struct si_screen *sscreen = (struct si_screen *)screen;

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation. */
   util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, max_threads, false);
   /* Don't change the number of threads on the low priority queue. */
}
/* pipe_screen::is_parallel_shader_compilation_finished hook: 'shader' is a
 * si_shader_selector whose 'ready' fence is signaled when the async compile
 * job completes; shader_type is unused. */
static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader,
                                                       mesa_shader_stage shader_type)
{
   struct si_shader_selector *sel = (struct si_shader_selector *)shader;

   return util_queue_fence_is_signalled(&sel->ready);
}
/* Parse the AMD_FORCE_SHADER_USE_ACO environment variable, which lets the
 * user force ACO compilation for a subset of shaders on screens that default
 * to LLVM. No-op when the screen already uses ACO everywhere or when ACO
 * doesn't support the chip.
 */
static void si_setup_force_shader_use_aco(struct si_screen *sscreen, bool support_aco)
{
   /* Usage:
    * 1. shader type: vs|tcs|tes|gs|ps|cs, specify a class of shaders to use aco
    * 2. shader blake: specify a single shader blake directly to use aco
    * 3. filename: specify a file which contains shader blakes in lines
    */
   sscreen->use_aco_shader_type = MESA_SHADER_NONE;

   if (sscreen->use_aco || !support_aco)
      return;

   const char *option = debug_get_option("AMD_FORCE_SHADER_USE_ACO", NULL);
   if (!option)
      return;

   /* Case 1: a whole shader stage. */
   if (!strcmp("vs", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_VERTEX;
      return;
   } else if (!strcmp("tcs", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_TESS_CTRL;
      return;
   } else if (!strcmp("tes", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_TESS_EVAL;
      return;
   } else if (!strcmp("gs", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_GEOMETRY;
      return;
   } else if (!strcmp("ps", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_FRAGMENT;
      return;
   } else if (!strcmp("cs", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_COMPUTE;
      return;
   }

   /* Case 2: a single printed blake3 hash. */
   blake3_hash blake;
   if (_mesa_blake3_from_printed_string(blake, option)) {
      sscreen->use_aco_shader_blakes = MALLOC(sizeof(blake));
      memcpy(sscreen->use_aco_shader_blakes[0], blake, sizeof(blake));
      sscreen->num_use_aco_shader_blakes = 1;
      return;
   }

   /* Case 3: a file containing one printed blake3 hash per line. */
   FILE *f = fopen(option, "r");
   if (!f) {
      mesa_loge("invalid AMD_FORCE_SHADER_USE_ACO value");
      return;
   }

   /* Grow the hash array by doubling whenever it fills up. */
   unsigned max_size = 16 * sizeof(blake3_hash);
   sscreen->use_aco_shader_blakes = MALLOC(max_size);

   char line[1024];
   while (fgets(line, sizeof(line), f)) {
      if (sscreen->num_use_aco_shader_blakes * sizeof(blake3_hash) >= max_size) {
         sscreen->use_aco_shader_blakes = REALLOC(
            sscreen->use_aco_shader_blakes, max_size, max_size * 2);
         max_size *= 2;
      }
      /* Strip the trailing newline after the printed hash.
       * NOTE(review): assumes each line holds a full printed hash; a shorter
       * line makes this read bytes past the fgets terminator (within the
       * buffer, but uninitialized) — confirm inputs are well-formed.
       */
      if (line[BLAKE3_PRINTED_LEN] == '\n')
         line[BLAKE3_PRINTED_LEN] = 0;
      /* Lines that don't parse as a hash are silently skipped. */
      if (_mesa_blake3_from_printed_string(
             sscreen->use_aco_shader_blakes[sscreen->num_use_aco_shader_blakes], line))
         sscreen->num_use_aco_shader_blakes++;
   }
   fclose(f);
}
static bool
is_pro_graphics(struct si_screen *sscreen)
{
return strstr(sscreen->info.marketing_name, "Pro") ||
strstr(sscreen->info.marketing_name, "PRO") ||
strstr(sscreen->info.marketing_name, "Frontier");
}
/* pipe_screen::is_compute_copy_faster hook.
 *
 * For GPU copies this never prefers compute. For CPU copies it uses a crude
 * size threshold: a compute copy is reported faster once the region holds
 * more texels than a 64x64 surface. The formats are currently ignored.
 */
static bool
si_is_compute_copy_faster(struct pipe_screen *pscreen,
                          enum pipe_format src_format,
                          enum pipe_format dst_format,
                          unsigned width,
                          unsigned height,
                          unsigned depth,
                          bool cpu)
{
   if (!cpu)
      return false;

   /* very basic for now */
   const uint64_t num_texels = (uint64_t)width * height * depth;
   return num_texels > 64 * 64;
}
/* pipe_screen::driver_thread_add_job hook: forward the job to the
 * high-priority shader compiler queue. */
static void
si_driver_thread_add_job(struct pipe_screen *screen, void *data,
                         struct util_queue_fence *fence,
                         pipe_driver_thread_func execute,
                         pipe_driver_thread_func cleanup,
                         const size_t job_size)
{
   struct si_screen *sscreen = (struct si_screen *)screen;

   util_queue_add_job(&sscreen->shader_compiler_queue, data, fence, execute, cleanup, job_size);
}
/* pipe_screen::get_disk_shader_cache hook: return the cache created by
 * si_disk_cache_create (may be NULL when caching is disabled). */
static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen)
{
   struct si_screen *sscreen = (struct si_screen *)pscreen;

   return sscreen->disk_shader_cache;
}
/* Initialize everything gfx/compute-specific on the screen: NGG flags,
 * compiler selection (ACO/LLVM), shader compiler queues, shader and disk
 * caches, perf counters, caps and screen hooks.
 *
 * Returns true on success, and also when the chip has no gfx/compute support
 * at all (has_gfx_compute stays false and nothing is initialized). Returns
 * false on a fatal setup failure.
 *
 * Fixes vs. the previous version:
 * - si_disk_cache_create() was called before shader_debug_flags and use_aco
 *   were set, although it reads both (DBG_ALL_SHADERS check and cache key);
 *   the call is now made after they are final.
 * - The failure path of the second util_queue_init leaked the first queue;
 *   it is now destroyed there.
 */
bool si_init_gfx_screen(struct si_screen *sscreen) {
   unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads;
   const bool support_aco = aco_is_gpu_supported(&sscreen->info);
   bool support_llvm = false;

#if AMD_LLVM_AVAILABLE
   support_llvm = strlen(ac_get_llvm_processor_name(sscreen->info.family)) != 0;
#endif
   /* Without at least one working compiler there is nothing to set up. */
   sscreen->has_gfx_compute = support_aco || support_llvm;
   if (!sscreen->has_gfx_compute)
      return true;

   ac_get_task_info(&sscreen->info, &sscreen->task_info);

   if (sscreen->info.gfx_level >= GFX11) {
      sscreen->use_ngg = true;
      sscreen->use_ngg_culling = sscreen->info.max_render_backends >= 2 &&
                                 !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
   } else {
      /* On GFX10, Navi14 only uses NGG on Pro parts. */
      sscreen->use_ngg = !(sscreen->debug_flags & DBG(NO_NGG)) &&
                         sscreen->info.gfx_level >= GFX10 &&
                         (sscreen->info.family != CHIP_NAVI14 ||
                          is_pro_graphics(sscreen));
      sscreen->use_ngg_culling = sscreen->use_ngg &&
                                 sscreen->info.max_render_backends >= 2 &&
                                 !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
   }

   /* Indirect multi-draw needs new enough PFP/ME firmware on GFX6-8. */
   sscreen->has_draw_indirect_multi =
      (sscreen->info.family >= CHIP_POLARIS10) ||
      (sscreen->info.gfx_level == GFX8 && sscreen->info.pfp_fw_version >= 121 &&
       sscreen->info.me_fw_version >= 87) ||
      (sscreen->info.gfx_level == GFX7 && sscreen->info.pfp_fw_version >= 211 &&
       sscreen->info.me_fw_version >= 173) ||
      (sscreen->info.gfx_level == GFX6 && sscreen->info.pfp_fw_version >= 79 &&
       sscreen->info.me_fw_version >= 142);

   si_driver_ds_init();

   sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
   sscreen->b.is_compute_copy_faster = si_is_compute_copy_faster;
   sscreen->b.driver_thread_add_job = si_driver_thread_add_job;

   sscreen->context_roll_log_filename = debug_get_option("AMD_ROLLS", NULL);
   sscreen->shader_debug_flags = debug_get_flags_option("AMD_DEBUG", radeonsi_shader_debug_options, 0);

   if (sscreen->debug_flags & DBG(NO_DISPLAY_DCC)) {
      sscreen->info.use_display_dcc_unaligned = false;
      sscreen->info.use_display_dcc_with_retile_blit = false;
   }

   /* Using the environment variable doesn't enable PAIRS packets for simplicity. */
   if ((sscreen->debug_flags & DBG(SHADOW_REGS)) &&
       !(sscreen->info.userq_ip_mask & (1 << AMD_IP_GFX)))
      sscreen->info.has_kernelq_reg_shadowing = true;

#if AMD_LLVM_AVAILABLE
   sscreen->use_aco = support_aco && sscreen->info.has_image_opcodes &&
                      !(sscreen->shader_debug_flags & DBG(USE_LLVM));
#else
   /* Without LLVM, ACO is the only compiler. */
   sscreen->use_aco = true;
#endif

   if (sscreen->use_aco && !support_aco) {
      mesa_loge("ACO does not support this chip yet");
      return false;
   }

   si_setup_force_shader_use_aco(sscreen, support_aco);

   /* Create the disk shader cache only now that shader_debug_flags and
    * use_aco are final: si_disk_cache_create reads both (the former to
    * disable the cache when shader dumping is on, the latter as part of
    * the cache key).
    */
   si_disk_cache_create(sscreen);

   sscreen->b.set_max_shader_compiler_threads = si_set_max_shader_compiler_threads;
   sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished;
   sscreen->b.finalize_nir = si_finalize_nir;
   sscreen->nir_options = CALLOC_STRUCT(nir_shader_compiler_options);

   si_init_screen_state_functions(sscreen);
   si_init_screen_query_functions(sscreen);
   si_init_screen_live_shader_cache(sscreen);
   si_init_screen_nir_options(sscreen);
   si_init_shader_caps(sscreen);
   si_init_compute_caps(sscreen);
   si_init_gfx_caps(sscreen);
   if (sscreen->b.caps.mesh_shader)
      si_init_mesh_caps(sscreen);

   /* R600_TEX_ANISO is the legacy variable name; AMD_TEX_ANISO is the
    * fallback when the legacy one isn't set. */
   sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
   if (sscreen->force_aniso == -1) {
      sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1));
   }
   if (sscreen->force_aniso >= 0) {
      printf("radeonsi: Forcing anisotropy filter to %ix\n",
             /* round down to a power of two */
             1 << util_logbase2(sscreen->force_aniso));
   }

   (void)simple_mtx_init(&sscreen->async_compute_context_lock, mtx_plain);
   (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
   (void)simple_mtx_init(&sscreen->gds_mutex, mtx_plain);
   (void)simple_mtx_init(&sscreen->tess_ring_lock, mtx_plain);

   si_init_gs_info(sscreen);
   if (!si_init_shader_cache(sscreen)) {
      FREE(sscreen->nir_options);
      return false;
   }

   if (sscreen->info.gfx_level < GFX10_3)
      sscreen->options.vrs2x2 = false;

   /* Determine the number of shader compiler threads. */
   const struct util_cpu_caps_t *caps = util_get_cpu_caps();
   hw_threads = caps->nr_cpus;

   if (hw_threads >= 12) {
      num_comp_hi_threads = hw_threads * 3 / 4;
      num_comp_lo_threads = hw_threads / 3;
   } else if (hw_threads >= 6) {
      num_comp_hi_threads = hw_threads - 2;
      num_comp_lo_threads = hw_threads / 2;
   } else if (hw_threads >= 2) {
      num_comp_hi_threads = hw_threads - 1;
      num_comp_lo_threads = hw_threads / 2;
   } else {
      num_comp_hi_threads = 1;
      num_comp_lo_threads = 1;
   }

#if !defined(NDEBUG)
   nir_process_debug_variable();
   /* Use a single compilation thread if NIR printing is enabled to avoid
    * multiple shaders being printed at the same time.
    */
   if (NIR_DEBUG(PRINT)) {
      num_comp_hi_threads = 1;
      num_comp_lo_threads = 1;
   }
#endif

   num_comp_hi_threads = MIN2(num_comp_hi_threads, ARRAY_SIZE(sscreen->compiler));
   num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp));

   /* Take a reference on the glsl types for the compiler threads. */
   glsl_type_singleton_init_or_ref();

   /* Start with a single thread and a single slot.
    * Each time we'll hit the "all slots are in use" case, the number of threads and
    * slots will be increased.
    */
   int num_slots = num_comp_hi_threads == 1 ? 64 : 1;
   if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", num_slots,
                        num_comp_hi_threads,
                        UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                        UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
      si_destroy_shader_cache(sscreen);
      FREE(sscreen->nir_options);
      glsl_type_singleton_decref();
      return false;
   }

   if (!util_queue_init(&sscreen->shader_compiler_queue_opt_variants, "sh_opt", num_slots,
                        num_comp_lo_threads,
                        UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                        UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
      /* Don't leak the first queue, which was created successfully. */
      util_queue_destroy(&sscreen->shader_compiler_queue);
      si_destroy_shader_cache(sscreen);
      FREE(sscreen->nir_options);
      glsl_type_singleton_decref();
      return false;
   }

   if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
      si_init_perfcounters(sscreen);

   if (sscreen->debug_flags & DBG(NO_OUT_OF_ORDER))
      sscreen->info.has_out_of_order_rast = false;

   /* Only set this for the cases that are known to work, which are:
    * - GFX9 if bpp >= 4 (in bytes)
    */
   if (sscreen->info.gfx_level >= GFX10) {
      memset(sscreen->allow_dcc_msaa_clear_to_reg_for_bpp, true,
             sizeof(sscreen->allow_dcc_msaa_clear_to_reg_for_bpp));
   } else if (sscreen->info.gfx_level == GFX9) {
      for (unsigned bpp_log2 = util_logbase2(1); bpp_log2 <= util_logbase2(16); bpp_log2++)
         sscreen->allow_dcc_msaa_clear_to_reg_for_bpp[bpp_log2] = true;
   }

   /* DCC stores have 50% performance of uncompressed stores and sometimes
    * even less than that. It's risky to enable on dGPUs.
    */
   sscreen->always_allow_dcc_stores = !(sscreen->debug_flags & DBG(NO_DCC_STORE)) &&
                                      (sscreen->debug_flags & DBG(DCC_STORE) ||
                                       sscreen->info.gfx_level >= GFX11 || /* always enabled on gfx11 */
                                       (sscreen->info.gfx_level >= GFX10_3 &&
                                        !sscreen->info.has_dedicated_vram));

   sscreen->dpbb_allowed = !(sscreen->debug_flags & DBG(NO_DPBB)) &&
                           (sscreen->info.gfx_level >= GFX10 ||
                            /* Only enable primitive binning on gfx9 APUs by default. */
                            (sscreen->info.gfx_level == GFX9 && !sscreen->info.has_dedicated_vram) ||
                            sscreen->debug_flags & DBG(DPBB));

   if (sscreen->dpbb_allowed) {
      if ((sscreen->info.has_dedicated_vram && sscreen->info.max_render_backends > 4) ||
          sscreen->info.gfx_level >= GFX10) {
         /* Only bin draws that have no CONTEXT and SH register changes between
          * them because higher settings cause hangs. We've only been able to
          * reproduce hangs on smaller chips (e.g. Navi24, Phoenix), though all
          * chips might have them. What we see may be due to a driver bug.
          */
         sscreen->pbb_context_states_per_bin = 1;
         sscreen->pbb_persistent_states_per_bin = 1;
      } else {
         /* This is a workaround for:
          * https://bugs.freedesktop.org/show_bug.cgi?id=110214
          * (an alternative is to insert manual BATCH_BREAK event when
          * a context_roll is detected). */
         sscreen->pbb_context_states_per_bin = sscreen->info.has_gfx9_scissor_bug ? 1 : 3;
         sscreen->pbb_persistent_states_per_bin = 8;
      }

      /* Allow env-var overrides of the bin state counts (context states only
       * when the gfx9 scissor bug doesn't force 1). */
      if (!sscreen->info.has_gfx9_scissor_bug)
         sscreen->pbb_context_states_per_bin =
            debug_get_num_option("AMD_DEBUG_DPBB_CS", sscreen->pbb_context_states_per_bin);
      sscreen->pbb_persistent_states_per_bin =
         debug_get_num_option("AMD_DEBUG_DPBB_PS", sscreen->pbb_persistent_states_per_bin);

      assert(sscreen->pbb_context_states_per_bin >= 1 &&
             sscreen->pbb_context_states_per_bin <= 6);
      assert(sscreen->pbb_persistent_states_per_bin >= 1 &&
             sscreen->pbb_persistent_states_per_bin <= 32);
   }

   (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
   sscreen->use_monolithic_shaders =
      (sscreen->shader_debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;

   /* NOTE(review): this runs after si_disk_cache_create, so RADEON_DUMP_SHADERS
    * alone does not disable the disk cache (only AMD_DEBUG shader flags do) —
    * confirm that's intended. */
   if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
      sscreen->shader_debug_flags |= DBG_ALL_SHADERS;

   /* Syntax:
    *     EQAA=s,z,c
    * Example:
    *     EQAA=8,4,2
    *
    * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
    * Constraints:
    *     s >= z >= c (ignoring this only wastes memory)
    *     s = [2..16]
    *     z = [2..8]
    *     c = [2..8]
    *
    * Only MSAA color and depth buffers are overridden.
    */
   if (sscreen->info.has_eqaa_surface_allocator) {
      const char *eqaa = debug_get_option("EQAA", NULL);
      unsigned s, z, f;

      if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
         sscreen->eqaa_force_coverage_samples = s;
         sscreen->eqaa_force_z_samples = z;
         sscreen->eqaa_force_color_samples = f;
      }
   }

   if (sscreen->info.gfx_level >= GFX11) {
      sscreen->attribute_pos_prim_ring =
         si_aligned_buffer_create(&sscreen->b,
                                  PIPE_RESOURCE_FLAG_UNMAPPABLE |
                                  SI_RESOURCE_FLAG_32BIT |
                                  SI_RESOURCE_FLAG_DRIVER_INTERNAL |
                                  SI_RESOURCE_FLAG_DISCARDABLE,
                                  PIPE_USAGE_DEFAULT,
                                  sscreen->info.total_attribute_pos_prim_ring_size,
                                  2 * 1024 * 1024);
   }

   /* NOTE(review): this call is unconditional — confirm it shouldn't be
    * behind a debug flag. */
   ac_print_nonshadowed_regs(sscreen->info.gfx_level, sscreen->info.family);

   return true;
}
/* Tear down everything si_init_gfx_screen set up: queues, caches, compilers,
 * shader parts, rings and mutexes. Safe to call when has_gfx_compute is
 * false (returns immediately). */
void si_fini_gfx_screen(struct si_screen *sscreen) {
   /* Heads of the pixel-shader prolog/epilog part lists, freed below. */
   struct si_shader_part *parts[] = {sscreen->ps_prologs, sscreen->ps_epilogs};
   unsigned i;

   if (!sscreen->has_gfx_compute)
      return;

   if (sscreen->debug_flags & DBG(CACHE_STATS)) {
      printf("live shader cache: hits = %u, misses = %u\n", sscreen->live_shader_cache.hits,
             sscreen->live_shader_cache.misses);
      printf("memory shader cache: hits = %u, misses = %u\n", sscreen->num_memory_shader_cache_hits,
             sscreen->num_memory_shader_cache_misses);
      printf("disk shader cache: hits = %u, misses = %u\n", sscreen->num_disk_shader_cache_hits,
             sscreen->num_disk_shader_cache_misses);
   }

   /* Release ring buffers before the queues and contexts go away. */
   si_resource_reference(&sscreen->attribute_pos_prim_ring, NULL);
   si_resource_reference(&sscreen->attribute_pos_prim_ring_tmz, NULL);
   pipe_resource_reference(&sscreen->tess_rings, NULL);
   pipe_resource_reference(&sscreen->tess_rings_tmz, NULL);

   /* Stop the compiler threads before destroying the compilers they use. */
   util_queue_destroy(&sscreen->shader_compiler_queue);
   util_queue_destroy(&sscreen->shader_compiler_queue_opt_variants);

   simple_mtx_destroy(&sscreen->async_compute_context_lock);
   if (sscreen->async_compute_context)
      sscreen->async_compute_context->destroy(sscreen->async_compute_context);

   /* Release the reference on glsl types of the compiler threads. */
   glsl_type_singleton_decref();

   for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) {
      if (sscreen->compiler[i])
         si_destroy_llvm_compiler(sscreen->compiler[i]);
   }
   for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++) {
      if (sscreen->compiler_lowp[i])
         si_destroy_llvm_compiler(sscreen->compiler_lowp[i]);
   }

   /* Free shader parts. */
   for (i = 0; i < ARRAY_SIZE(parts); i++) {
      while (parts[i]) {
         struct si_shader_part *part = parts[i];

         parts[i] = part->next;
         si_shader_binary_clean(&part->binary);
         FREE(part);
      }
   }
   simple_mtx_destroy(&sscreen->shader_parts_mutex);
   si_destroy_shader_cache(sscreen);

   si_destroy_perfcounters(sscreen);
   si_gpu_load_kill_thread(sscreen);

   simple_mtx_destroy(&sscreen->gpu_load_mutex);
   simple_mtx_destroy(&sscreen->gds_mutex);
   simple_mtx_destroy(&sscreen->tess_ring_lock);

   radeon_bo_reference(sscreen->ws, &sscreen->gds_oa, NULL);

   disk_cache_destroy(sscreen->disk_shader_cache);
   util_vertex_state_cache_deinit(&sscreen->vertex_state_cache);
   util_live_shader_cache_deinit(&sscreen->live_shader_cache);

   FREE(sscreen->use_aco_shader_blakes);
   FREE(sscreen->nir_options);
}

View file

@ -119,6 +119,7 @@ endif
radeonsi_gfx_libs = []
if with_gfx_compute
files_libradeonsi+= files(
'gfx/si_gfx_screen.c',
'si_blit.c',
'si_cp_reg_shadowing.c',
'si_compute_blit.c',

View file

@ -25,21 +25,6 @@ static const char *si_get_device_vendor(struct pipe_screen *pscreen)
return "AMD";
}
static bool
si_is_compute_copy_faster(struct pipe_screen *pscreen,
enum pipe_format src_format,
enum pipe_format dst_format,
unsigned width,
unsigned height,
unsigned depth,
bool cpu)
{
if (cpu)
/* very basic for now */
return (uint64_t)width * height * depth > 64 * 64;
return false;
}
static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid)
{
ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE);
@ -103,14 +88,7 @@ static void si_query_memory_info(struct pipe_screen *screen, struct pipe_memory_
info->nr_device_memory_evictions = info->device_memory_evicted / 64;
}
static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen)
{
struct si_screen *sscreen = (struct si_screen *)pscreen;
return sscreen->disk_shader_cache;
}
static void si_init_renderer_string(struct si_screen *sscreen)
void si_init_renderer_string(struct si_screen *sscreen)
{
char first_name[256], second_name[32] = {}, kernel_version[128] = {};
struct utsname uname_data;
@ -160,18 +138,6 @@ static unsigned si_varying_expression_max_cost(nir_shader *producer, nir_shader
return ac_nir_varying_expression_max_cost(producer, consumer);
}
static void
si_driver_thread_add_job(struct pipe_screen *screen, void *data,
struct util_queue_fence *fence,
pipe_driver_thread_func execute,
pipe_driver_thread_func cleanup,
const size_t job_size)
{
struct si_screen *sscreen = (struct si_screen *)screen;
util_queue_add_job(&sscreen->shader_compiler_queue, data, fence, execute, cleanup, job_size);
}
static bool enable_mesh_shader(struct si_screen *sscreen)
{
return sscreen->use_ngg &&
@ -208,15 +174,10 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
sscreen->b.get_vendor = si_get_vendor;
sscreen->b.get_device_vendor = si_get_device_vendor;
sscreen->b.get_screen_fd = si_get_screen_fd;
sscreen->b.is_compute_copy_faster = si_is_compute_copy_faster;
sscreen->b.driver_thread_add_job = si_driver_thread_add_job;
sscreen->b.get_timestamp = si_get_timestamp;
sscreen->b.get_device_uuid = si_get_device_uuid;
sscreen->b.get_driver_uuid = si_get_driver_uuid;
sscreen->b.query_memory_info = si_query_memory_info;
sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
si_init_renderer_string(sscreen);
}
void si_init_screen_nir_options(struct si_screen *sscreen)
@ -408,7 +369,7 @@ void si_init_compute_caps(struct si_screen *sscreen)
sscreen->info.compiler_info.has_cs_regalloc_hang_bug ? 256 : SI_MAX_VARIABLE_THREADS_PER_BLOCK;
}
static void si_init_mesh_caps(struct si_screen *sscreen)
void si_init_mesh_caps(struct si_screen *sscreen)
{
struct pipe_mesh_caps *caps = (struct pipe_mesh_caps *)&sscreen->b.caps.mesh;
@ -459,7 +420,7 @@ static void si_init_mesh_caps(struct si_screen *sscreen)
caps->pipeline_statistic_queries = sscreen->info.gfx_level >= GFX11;
}
static void si_init_gfx_caps(struct si_screen *sscreen)
void si_init_gfx_caps(struct si_screen *sscreen)
{
struct pipe_caps *caps = (struct pipe_caps *)&sscreen->b.caps;
@ -581,7 +542,7 @@ static void si_init_gfx_caps(struct si_screen *sscreen)
caps->fbfetch = 1;
caps->graphics = sscreen->info.has_graphics;
caps->mesh_shader = enable_mesh_shader(sscreen);
caps->mesh_shader = sscreen->b.nir_options[MESA_SHADER_MESH];
caps->compute = sscreen->has_gfx_compute;
/* Tahiti and Verde only: reduction mode is unsupported due to a bug
@ -609,14 +570,6 @@ static void si_init_gfx_caps(struct si_screen *sscreen)
caps->post_depth_coverage = sscreen->info.gfx_level >= GFX10;
#ifdef HAVE_GFX_COMPUTE
caps->graphics = sscreen->info.has_graphics;
caps->mesh_shader = sscreen->b.nir_options[MESA_SHADER_MESH];
caps->compute = true;
#else
caps->graphics = caps->mesh_shader = caps->compute = false;
#endif
caps->max_vertex_buffers = SI_MAX_ATTRIBS;
caps->constant_buffer_offset_alignment =
@ -758,6 +711,11 @@ static void si_init_gfx_caps(struct si_screen *sscreen)
* KHR-GL46.texture_lod_bias.texture_lod_bias_all
*/
caps->max_texture_lod_bias = 16;
/* Override the value set by u_init_pipe_screen_caps because it was called
* before shader caps are set.
*/
caps->hardware_gl_select = debug_get_bool_option("MESA_HW_ACCEL_SELECT", true);
}
void si_init_screen_caps(struct si_screen *sscreen)
@ -771,11 +729,7 @@ void si_init_screen_caps(struct si_screen *sscreen)
if (sscreen->info.is_virtio)
caps->dmabuf |= DRM_PRIME_CAP_EXPORT | DRM_PRIME_CAP_IMPORT;
#ifdef HAVE_GFX_COMPUTE
si_init_gfx_caps(sscreen);
#else
caps->graphics = caps->mesh_shader = caps->compute = false;
#endif
caps->resource_from_user_memory = !UTIL_ARCH_BIG_ENDIAN && sscreen->info.has_userptr;
@ -808,9 +762,6 @@ void si_init_screen_caps(struct si_screen *sscreen)
/* Conversion to nanos from cycles per millisecond */
caps->timer_resolution = DIV_ROUND_UP(1000000, sscreen->info.clock_crystal_freq);
if (caps->mesh_shader)
si_init_mesh_caps(sscreen);
if (sscreen->ws->va_range)
sscreen->ws->va_range(sscreen->ws, &caps->min_vma, &caps->max_vma);

View file

@ -95,43 +95,6 @@ static const struct debug_named_value radeonsi_debug_options[] = {
DEBUG_NAMED_VALUE_END /* must be last */
};
static const struct debug_named_value radeonsi_shader_debug_options[] = {
/* Shader logging options: */
{"vs", DBG(VS), "Print vertex shaders"},
{"ps", DBG(PS), "Print pixel shaders"},
{"gs", DBG(GS), "Print geometry shaders"},
{"tcs", DBG(TCS), "Print tessellation control shaders"},
{"tes", DBG(TES), "Print tessellation evaluation shaders"},
{"cs", DBG(CS), "Print compute shaders"},
{"ts", DBG(TS), "Print task shaders"},
{"ms", DBG(MS), "Print mesh shaders"},
{"initnir", DBG(INIT_NIR), "Print initial input NIR when shaders are created"},
{"nir", DBG(NIR), "Print final NIR after lowering when shader variants are created"},
{"initllvm", DBG(INIT_LLVM), "Print initial LLVM IR before optimizations"},
{"llvm", DBG(LLVM), "Print final LLVM IR"},
{"initaco", DBG(INIT_ACO), "Print initial ACO IR before optimizations"},
{"aco", DBG(ACO), "Print final ACO IR"},
{"asm", DBG(ASM), "Print final shaders in asm"},
{"stats", DBG(STATS), "Print shader-db stats to stderr"},
/* Shader compiler options the shader cache should be aware of: */
{"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
{"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
{"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
{"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
{"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
{"w64cs", DBG(W64_CS), "Use Wave64 for computes shaders."},
/* Shader compiler options (with no effect on the shader cache): */
{"checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR"},
{"mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand"},
{"nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants."},
{"usellvm", DBG(USE_LLVM), "Use LLVM as shader compiler when possible"},
DEBUG_NAMED_VALUE_END /* must be last */
};
static const struct debug_named_value test_options[] = {
/* Tests: */
{"clearbuffer", DBG(TEST_CLEAR_BUFFER), "Test correctness of the clear_buffer compute shader"},
@ -179,7 +142,7 @@ void si_init_aux_async_compute_ctx(struct si_screen *sscreen)
((struct si_context*)sscreen->async_compute_context)->cs_max_waves_per_sh = 2;
}
static void si_destroy_llvm_compiler(struct ac_llvm_compiler *compiler)
void si_destroy_llvm_compiler(struct ac_llvm_compiler *compiler)
{
#if AMD_LLVM_AVAILABLE
ac_destroy_llvm_compiler(compiler);
@ -1026,26 +989,10 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v
void si_destroy_screen(struct pipe_screen *pscreen)
{
struct si_screen *sscreen = (struct si_screen *)pscreen;
struct si_shader_part *parts[] = {sscreen->ps_prologs, sscreen->ps_epilogs};
unsigned i;
if (!sscreen->ws->unref(sscreen->ws))
return;
if (sscreen->debug_flags & DBG(CACHE_STATS)) {
printf("live shader cache: hits = %u, misses = %u\n", sscreen->live_shader_cache.hits,
sscreen->live_shader_cache.misses);
printf("memory shader cache: hits = %u, misses = %u\n", sscreen->num_memory_shader_cache_hits,
sscreen->num_memory_shader_cache_misses);
printf("disk shader cache: hits = %u, misses = %u\n", sscreen->num_disk_shader_cache_hits,
sscreen->num_disk_shader_cache_misses);
}
si_resource_reference(&sscreen->attribute_pos_prim_ring, NULL);
si_resource_reference(&sscreen->attribute_pos_prim_ring_tmz, NULL);
pipe_resource_reference(&sscreen->tess_rings, NULL);
pipe_resource_reference(&sscreen->tess_rings_tmz, NULL);
for (unsigned i = 0; i < ARRAY_SIZE(sscreen->aux_contexts); i++) {
if (!sscreen->aux_contexts[i].ctx)
continue;
@ -1063,68 +1010,18 @@ void si_destroy_screen(struct pipe_screen *pscreen)
mtx_destroy(&sscreen->aux_contexts[i].lock);
}
util_queue_destroy(&sscreen->shader_compiler_queue);
util_queue_destroy(&sscreen->shader_compiler_queue_opt_variants);
si_fini_gfx_screen(sscreen);
simple_mtx_destroy(&sscreen->async_compute_context_lock);
if (sscreen->async_compute_context) {
sscreen->async_compute_context->destroy(sscreen->async_compute_context);
}
/* Release the reference on glsl types of the compiler threads. */
glsl_type_singleton_decref();
for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) {
if (sscreen->compiler[i])
si_destroy_llvm_compiler(sscreen->compiler[i]);
}
for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++) {
if (sscreen->compiler_lowp[i])
si_destroy_llvm_compiler(sscreen->compiler_lowp[i]);
}
/* Free shader parts. */
for (i = 0; i < ARRAY_SIZE(parts); i++) {
while (parts[i]) {
struct si_shader_part *part = parts[i];
parts[i] = part->next;
si_shader_binary_clean(&part->binary);
FREE(part);
}
}
simple_mtx_destroy(&sscreen->shader_parts_mutex);
si_destroy_shader_cache(sscreen);
si_destroy_perfcounters(sscreen);
si_gpu_load_kill_thread(sscreen);
simple_mtx_destroy(&sscreen->gpu_load_mutex);
simple_mtx_destroy(&sscreen->gds_mutex);
simple_mtx_destroy(&sscreen->tess_ring_lock);
simple_mtx_destroy(&sscreen->print_ib_mutex);
radeon_bo_reference(sscreen->ws, &sscreen->gds_oa, NULL);
slab_destroy_parent(&sscreen->pool_transfers);
disk_cache_destroy(sscreen->disk_shader_cache);
util_live_shader_cache_deinit(&sscreen->live_shader_cache);
util_idalloc_mt_fini(&sscreen->buffer_ids);
util_vertex_state_cache_deinit(&sscreen->vertex_state_cache);
sscreen->ws->destroy(sscreen->ws);
FREE(sscreen->use_aco_shader_blakes);
FREE(sscreen->nir_options);
FREE(sscreen);
}
static void si_init_gs_info(struct si_screen *sscreen)
{
sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.gfx_level, sscreen->info.family);
}
static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags)
{
struct pipe_context *ctx = sscreen->aux_context.general.ctx;
@ -1150,163 +1047,10 @@ static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags)
exit(0);
}
static void
parse_hex(char *out, const char *in, unsigned length)
{
for (unsigned i = 0; i < length; ++i)
out[i] = 0;
for (unsigned i = 0; i < 2 * length; ++i) {
unsigned v = in[i] <= '9' ? in[i] - '0' : (in[i] >= 'a' ? (in[i] - 'a' + 10) : (in[i] - 'A' + 10));
out[i / 2] |= v << (4 * (1 - i % 2));
}
}
/* Create the on-disk shader cache for this screen.
 *
 * The cache key must change whenever generated binaries could change, so it
 * hashes the driver's build identity plus screen settings that affect code
 * generation. If any identity component cannot be determined, the cache is
 * simply not created (sscreen->disk_shader_cache stays NULL).
 */
static void si_disk_cache_create(struct si_screen *sscreen)
{
   /* Don't use the cache if shader dumping is enabled. */
   if (sscreen->shader_debug_flags & DBG_ALL_SHADERS)
      return;

   blake3_hasher ctx;
   unsigned char blake3[BLAKE3_KEY_LEN];
   char cache_id[BLAKE3_HEX_LEN];

   _mesa_blake3_init(&ctx);
#ifdef RADEONSI_BUILD_ID_OVERRIDE
   /* The build system pinned the cache identity to a fixed hex string:
    * hash its decoded bytes instead of the driver binary's identifier. */
   {
      unsigned size = strlen(RADEONSI_BUILD_ID_OVERRIDE) / 2;
      char *data = alloca(size);
      parse_hex(data, RADEONSI_BUILD_ID_OVERRIDE, size);
      _mesa_blake3_update(&ctx, data, size);
   }
#else
   /* Hash an identifier of the driver code itself; without one, stale cache
    * entries could be returned after a driver update, so bail out. */
   if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx))
      return;
#endif
#if AMD_LLVM_AVAILABLE
   /* Also hash an identifier of the LLVM library, since it generates code too. */
   if (!disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx))
      return;
#endif
   /* NIR options depend on si_screen::use_aco, which affects all shaders, including GLSL
    * compilation.
    */
   _mesa_blake3_update(&ctx, &sscreen->use_aco, sizeof(sscreen->use_aco));
   _mesa_blake3_final(&ctx, blake3);
   mesa_bytes_to_hex(cache_id, blake3, BLAKE3_KEY_LEN);

   /* The cache directory is keyed by GPU family; cache_id versions its contents. */
   sscreen->disk_shader_cache = disk_cache_create(ac_get_family_name(sscreen->info.family),
                                                  cache_id, sscreen->info.address32_hi);
}
static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, unsigned max_threads)
{
   struct si_screen *sscreen = (struct si_screen *)screen;

   /* The queue can never grow past the thread count it was created with, so
    * this only clamps within that limit. The low-priority optimized-variant
    * queue is deliberately left untouched.
    */
   util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, max_threads, false);
}
static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader,
                                                       mesa_shader_stage shader_type)
{
   /* "shader" is always a si_shader_selector; its readiness fence is signaled
    * when the async compile job completes. */
   return util_queue_fence_is_signalled(&((struct si_shader_selector *)shader)->ready);
}
/* Parse AMD_FORCE_SHADER_USE_ACO, which forces a subset of shaders to be
 * compiled with ACO on screens that otherwise default to LLVM.
 *
 * Usage:
 * 1. shader type: vs|tcs|tes|gs|ps|cs, specify a class of shaders to use aco
 * 2. shader blake: specify a single shader blake directly to use aco
 * 3. filename: specify a file which contains shader blakes in lines
 */
static void si_setup_force_shader_use_aco(struct si_screen *sscreen, bool support_aco)
{
   sscreen->use_aco_shader_type = MESA_SHADER_NONE;

   /* Pointless when every shader already uses ACO; impossible when ACO
    * doesn't support this GPU at all. */
   if (sscreen->use_aco || !support_aco)
      return;

   const char *option = debug_get_option("AMD_FORCE_SHADER_USE_ACO", NULL);
   if (!option)
      return;

   /* Case 1: a whole shader stage. */
   if (!strcmp("vs", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_VERTEX;
      return;
   } else if (!strcmp("tcs", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_TESS_CTRL;
      return;
   } else if (!strcmp("tes", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_TESS_EVAL;
      return;
   } else if (!strcmp("gs", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_GEOMETRY;
      return;
   } else if (!strcmp("ps", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_FRAGMENT;
      return;
   } else if (!strcmp("cs", option)) {
      sscreen->use_aco_shader_type = MESA_SHADER_COMPUTE;
      return;
   }

   /* Case 2: a single printed blake3 hash. */
   blake3_hash blake;
   if (_mesa_blake3_from_printed_string(blake, option)) {
      sscreen->use_aco_shader_blakes = MALLOC(sizeof(blake));
      if (!sscreen->use_aco_shader_blakes)
         return;
      memcpy(sscreen->use_aco_shader_blakes[0], blake, sizeof(blake));
      sscreen->num_use_aco_shader_blakes = 1;
      return;
   }

   /* Case 3: a file containing one printed blake3 hash per line. */
   FILE *f = fopen(option, "r");
   if (!f) {
      mesa_loge("invalid AMD_FORCE_SHADER_USE_ACO value");
      return;
   }

   unsigned max_size = 16 * sizeof(blake3_hash);
   sscreen->use_aco_shader_blakes = MALLOC(max_size);
   if (!sscreen->use_aco_shader_blakes) {
      fclose(f);
      return;
   }

   char line[1024];
   while (fgets(line, sizeof(line), f)) {
      /* Grow the array when full. Keep the old pointer on failure: REALLOC
       * returns NULL without freeing the original allocation, and the hashes
       * collected so far remain usable. */
      if (sscreen->num_use_aco_shader_blakes * sizeof(blake3_hash) >= max_size) {
         void *blakes =
            REALLOC(sscreen->use_aco_shader_blakes, max_size, max_size * 2);
         if (!blakes)
            break;
         sscreen->use_aco_shader_blakes = blakes;
         max_size *= 2;
      }

      /* Strip the trailing newline, looking only at bytes fgets actually
       * wrote: indexing line[BLAKE3_PRINTED_LEN] unconditionally (as the
       * previous code did) reads an indeterminate stack byte whenever the
       * line is shorter than a printed hash. */
      for (unsigned i = 0; line[i]; i++) {
         if (line[i] == '\n') {
            line[i] = 0;
            break;
         }
      }

      if (_mesa_blake3_from_printed_string(
             sscreen->use_aco_shader_blakes[sscreen->num_use_aco_shader_blakes], line))
         sscreen->num_use_aco_shader_blakes++;
   }

   fclose(f);
}
/* Whether this chip is a workstation ("Pro"/"Frontier") SKU, judged solely
 * from its marketing name. */
static bool
is_pro_graphics(struct si_screen *sscreen)
{
   const char *name = sscreen->info.marketing_name;

   return strstr(name, "Pro") || strstr(name, "PRO") || strstr(name, "Frontier");
}
static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
const struct pipe_screen_config *config)
{
struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads;
uint64_t test_flags;
if (!sscreen) {
@ -1324,41 +1068,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->ws = ws;
ws->query_info(ws, &sscreen->info);
sscreen->context_roll_log_filename = debug_get_option("AMD_ROLLS", NULL);
sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", radeonsi_debug_options, 0);
sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", radeonsi_debug_options, 0);
sscreen->shader_debug_flags = debug_get_flags_option("AMD_DEBUG", radeonsi_shader_debug_options, 0);
test_flags = debug_get_flags_option("AMD_TEST", test_options, 0);
if (sscreen->debug_flags & DBG(NO_DISPLAY_DCC)) {
sscreen->info.use_display_dcc_unaligned = false;
sscreen->info.use_display_dcc_with_retile_blit = false;
}
/* Using the environment variable doesn't enable PAIRS packets for simplicity. */
if ((sscreen->debug_flags & DBG(SHADOW_REGS)) &&
!(sscreen->info.userq_ip_mask & (1 << AMD_IP_GFX)))
sscreen->info.has_kernelq_reg_shadowing = true;
#ifdef HAVE_GFX_COMPUTE
bool support_aco = aco_is_gpu_supported(&sscreen->info);
#if AMD_LLVM_AVAILABLE
sscreen->use_aco = support_aco && sscreen->info.has_image_opcodes &&
!(sscreen->shader_debug_flags & DBG(USE_LLVM));
#else
sscreen->use_aco = true;
#endif
if (sscreen->use_aco && !support_aco) {
mesa_loge("ACO does not support this chip yet");
FREE(sscreen);
return NULL;
}
si_setup_force_shader_use_aco(sscreen, support_aco);
#endif
if ((sscreen->debug_flags & DBG(TMZ)) &&
!sscreen->info.has_tmz_support) {
fprintf(stderr, "radeonsi: requesting TMZ features but TMZ is not supported\n");
@ -1366,71 +1079,18 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
return NULL;
}
if (!sscreen->use_aco) {
/* Initialize just one compiler instance to check for errors. The other compiler instances
* are initialized on demand.
*/
sscreen->compiler[0] = si_create_llvm_compiler(sscreen);
if (!sscreen->compiler[0]) {
/* The callee prints the error message. */
FREE(sscreen);
return NULL;
}
}
#ifdef HAVE_GFX_COMPUTE
sscreen->has_gfx_compute = true;
#endif
util_idalloc_mt_init_tc(&sscreen->buffer_ids);
/* Set functions first. */
sscreen->b.context_create = si_pipe_create_context;
sscreen->b.destroy = si_destroy_screen;
sscreen->b.set_max_shader_compiler_threads = si_set_max_shader_compiler_threads;
sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished;
#ifdef HAVE_GFX_COMPUTE
sscreen->b.finalize_nir = si_finalize_nir;
#endif
sscreen->nir_options = CALLOC_STRUCT(nir_shader_compiler_options);
si_init_screen_buffer_functions(sscreen);
si_init_screen_fence_functions(sscreen);
si_init_screen_state_functions(sscreen);
si_init_screen_texture_functions(sscreen);
si_init_screen_query_functions(sscreen);
si_init_screen_live_shader_cache(sscreen);
if (sscreen->info.gfx_level >= GFX11) {
sscreen->use_ngg = true;
sscreen->use_ngg_culling = sscreen->info.max_render_backends >= 2 &&
!(sscreen->debug_flags & DBG(NO_NGG_CULLING));
} else {
sscreen->use_ngg = !(sscreen->debug_flags & DBG(NO_NGG)) &&
sscreen->info.gfx_level >= GFX10 &&
(sscreen->info.family != CHIP_NAVI14 ||
is_pro_graphics(sscreen));
sscreen->use_ngg_culling = sscreen->use_ngg &&
sscreen->info.max_render_backends >= 2 &&
!(sscreen->debug_flags & DBG(NO_NGG_CULLING));
}
sscreen->has_draw_indirect_multi =
(sscreen->info.family >= CHIP_POLARIS10) ||
(sscreen->info.gfx_level == GFX8 && sscreen->info.pfp_fw_version >= 121 &&
sscreen->info.me_fw_version >= 87) ||
(sscreen->info.gfx_level == GFX7 && sscreen->info.pfp_fw_version >= 211 &&
sscreen->info.me_fw_version >= 173) ||
(sscreen->info.gfx_level == GFX6 && sscreen->info.pfp_fw_version >= 79 &&
sscreen->info.me_fw_version >= 142);
si_init_screen_get_functions(sscreen);
si_init_screen_nir_options(sscreen);
si_init_shader_caps(sscreen);
si_init_compute_caps(sscreen);
/* si_init_screen_caps depends on shader caps. */
si_init_screen_caps(sscreen);
if (sscreen->debug_flags & DBG(INFO))
@ -1438,209 +1098,16 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
slab_create_parent(&sscreen->pool_transfers, sizeof(struct si_transfer), 64);
sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
if (sscreen->force_aniso == -1) {
sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1));
}
if (sscreen->force_aniso >= 0) {
printf("radeonsi: Forcing anisotropy filter to %ix\n",
/* round down to a power of two */
1 << util_logbase2(sscreen->force_aniso));
}
(void)simple_mtx_init(&sscreen->async_compute_context_lock, mtx_plain);
(void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
(void)simple_mtx_init(&sscreen->gds_mutex, mtx_plain);
(void)simple_mtx_init(&sscreen->tess_ring_lock, mtx_plain);
(void)simple_mtx_init(&sscreen->print_ib_mutex, mtx_plain);
si_init_gs_info(sscreen);
if (!si_init_shader_cache(sscreen)) {
FREE(sscreen->nir_options);
if (!si_init_gfx_screen(sscreen)) {
FREE(sscreen);
return NULL;
}
if (sscreen->info.gfx_level < GFX10_3)
sscreen->options.vrs2x2 = false;
si_disk_cache_create(sscreen);
/* Determine the number of shader compiler threads. */
const struct util_cpu_caps_t *caps = util_get_cpu_caps();
hw_threads = caps->nr_cpus;
if (hw_threads >= 12) {
num_comp_hi_threads = hw_threads * 3 / 4;
num_comp_lo_threads = hw_threads / 3;
} else if (hw_threads >= 6) {
num_comp_hi_threads = hw_threads - 2;
num_comp_lo_threads = hw_threads / 2;
} else if (hw_threads >= 2) {
num_comp_hi_threads = hw_threads - 1;
num_comp_lo_threads = hw_threads / 2;
} else {
num_comp_hi_threads = 1;
num_comp_lo_threads = 1;
}
#if !defined(NDEBUG) && defined(HAVE_GFX_COMPUTE)
nir_process_debug_variable();
/* Use a single compilation thread if NIR printing is enabled to avoid
* multiple shaders being printed at the same time.
*/
if (NIR_DEBUG(PRINT)) {
num_comp_hi_threads = 1;
num_comp_lo_threads = 1;
}
#endif
num_comp_hi_threads = MIN2(num_comp_hi_threads, ARRAY_SIZE(sscreen->compiler));
num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp));
/* Take a reference on the glsl types for the compiler threads. */
glsl_type_singleton_init_or_ref();
/* Start with a single thread and a single slot.
* Each time we'll hit the "all slots are in use" case, the number of threads and
* slots will be increased.
*/
int num_slots = num_comp_hi_threads == 1 ? 64 : 1;
if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", num_slots,
num_comp_hi_threads,
UTIL_QUEUE_INIT_RESIZE_IF_FULL |
UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
si_destroy_shader_cache(sscreen);
FREE(sscreen->nir_options);
FREE(sscreen);
glsl_type_singleton_decref();
return NULL;
}
if (!util_queue_init(&sscreen->shader_compiler_queue_opt_variants, "sh_opt", num_slots,
num_comp_lo_threads,
UTIL_QUEUE_INIT_RESIZE_IF_FULL |
UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
si_destroy_shader_cache(sscreen);
FREE(sscreen->nir_options);
FREE(sscreen);
glsl_type_singleton_decref();
return NULL;
}
/* Don't fail if the multimedia support is missing. */
si_init_mm_screen(sscreen);
if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
si_init_perfcounters(sscreen);
ac_get_task_info(&sscreen->info, &sscreen->task_info);
if (sscreen->debug_flags & DBG(NO_OUT_OF_ORDER))
sscreen->info.has_out_of_order_rast = false;
/* Only set this for the cases that are known to work, which are:
* - GFX9 if bpp >= 4 (in bytes)
*/
if (sscreen->info.gfx_level >= GFX10) {
memset(sscreen->allow_dcc_msaa_clear_to_reg_for_bpp, true,
sizeof(sscreen->allow_dcc_msaa_clear_to_reg_for_bpp));
} else if (sscreen->info.gfx_level == GFX9) {
for (unsigned bpp_log2 = util_logbase2(1); bpp_log2 <= util_logbase2(16); bpp_log2++)
sscreen->allow_dcc_msaa_clear_to_reg_for_bpp[bpp_log2] = true;
}
/* DCC stores have 50% performance of uncompressed stores and sometimes
* even less than that. It's risky to enable on dGPUs.
*/
sscreen->always_allow_dcc_stores = !(sscreen->debug_flags & DBG(NO_DCC_STORE)) &&
(sscreen->debug_flags & DBG(DCC_STORE) ||
sscreen->info.gfx_level >= GFX11 || /* always enabled on gfx11 */
(sscreen->info.gfx_level >= GFX10_3 &&
!sscreen->info.has_dedicated_vram));
sscreen->dpbb_allowed = !(sscreen->debug_flags & DBG(NO_DPBB)) &&
(sscreen->info.gfx_level >= GFX10 ||
/* Only enable primitive binning on gfx9 APUs by default. */
(sscreen->info.gfx_level == GFX9 && !sscreen->info.has_dedicated_vram) ||
sscreen->debug_flags & DBG(DPBB));
if (sscreen->dpbb_allowed) {
if ((sscreen->info.has_dedicated_vram && sscreen->info.max_render_backends > 4) ||
sscreen->info.gfx_level >= GFX10) {
/* Only bin draws that have no CONTEXT and SH register changes between
* them because higher settings cause hangs. We've only been able to
* reproduce hangs on smaller chips (e.g. Navi24, Phoenix), though all
* chips might have them. What we see may be due to a driver bug.
*/
sscreen->pbb_context_states_per_bin = 1;
sscreen->pbb_persistent_states_per_bin = 1;
} else {
/* This is a workaround for:
* https://bugs.freedesktop.org/show_bug.cgi?id=110214
* (an alternative is to insert manual BATCH_BREAK event when
* a context_roll is detected). */
sscreen->pbb_context_states_per_bin = sscreen->info.has_gfx9_scissor_bug ? 1 : 3;
sscreen->pbb_persistent_states_per_bin = 8;
}
if (!sscreen->info.has_gfx9_scissor_bug)
sscreen->pbb_context_states_per_bin =
debug_get_num_option("AMD_DEBUG_DPBB_CS", sscreen->pbb_context_states_per_bin);
sscreen->pbb_persistent_states_per_bin =
debug_get_num_option("AMD_DEBUG_DPBB_PS", sscreen->pbb_persistent_states_per_bin);
assert(sscreen->pbb_context_states_per_bin >= 1 &&
sscreen->pbb_context_states_per_bin <= 6);
assert(sscreen->pbb_persistent_states_per_bin >= 1 &&
sscreen->pbb_persistent_states_per_bin <= 32);
}
(void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
sscreen->use_monolithic_shaders =
(sscreen->shader_debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
sscreen->shader_debug_flags |= DBG_ALL_SHADERS;
/* Syntax:
* EQAA=s,z,c
* Example:
* EQAA=8,4,2
* That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
* Constraints:
* s >= z >= c (ignoring this only wastes memory)
* s = [2..16]
* z = [2..8]
* c = [2..8]
*
* Only MSAA color and depth buffers are overridden.
*/
if (sscreen->info.has_eqaa_surface_allocator) {
const char *eqaa = debug_get_option("EQAA", NULL);
unsigned s, z, f;
if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
sscreen->eqaa_force_coverage_samples = s;
sscreen->eqaa_force_z_samples = z;
sscreen->eqaa_force_color_samples = f;
}
}
if (sscreen->info.gfx_level >= GFX11) {
sscreen->attribute_pos_prim_ring =
si_aligned_buffer_create(&sscreen->b,
PIPE_RESOURCE_FLAG_UNMAPPABLE |
SI_RESOURCE_FLAG_32BIT |
SI_RESOURCE_FLAG_DRIVER_INTERNAL |
SI_RESOURCE_FLAG_DISCARDABLE,
PIPE_USAGE_DEFAULT,
sscreen->info.total_attribute_pos_prim_ring_size,
2 * 1024 * 1024);
}
si_init_renderer_string(sscreen);
for (unsigned i = 0; i < ARRAY_SIZE(sscreen->aux_contexts); i++)
(void)mtx_init(&sscreen->aux_contexts[i].lock, mtx_plain | mtx_recursive);
@ -1666,8 +1133,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER)))
si_test_vmfault(sscreen, test_flags);
ac_print_nonshadowed_regs(sscreen->info.gfx_level, sscreen->info.family);
return &sscreen->b;
}
@ -1711,8 +1176,6 @@ struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_conf
}
}
si_driver_ds_init();
drmFreeVersion(version);
return rw ? rw->screen : NULL;
}

View file

@ -1596,6 +1596,9 @@ void si_init_screen_nir_options(struct si_screen *sscreen);
void si_init_shader_caps(struct si_screen *sscreen);
void si_init_compute_caps(struct si_screen *sscreen);
void si_init_screen_caps(struct si_screen *sscreen);
void si_init_mesh_caps(struct si_screen *screen);
void si_init_gfx_caps(struct si_screen *sscreen);
void si_init_renderer_string(struct si_screen *sscreen);
bool si_sdma_copy_image(struct si_context *ctx, struct si_texture *dst, struct si_texture *src);
@ -1623,6 +1626,7 @@ MESAPROC void si_init_compute_functions(struct si_context *sctx) TAILV;
/* si_pipe.c */
struct ac_llvm_compiler *si_create_llvm_compiler(struct si_screen *sscreen);
void si_destroy_llvm_compiler(struct ac_llvm_compiler *compiler);
void si_init_aux_async_compute_ctx(struct si_screen *sscreen);
struct si_context *si_get_aux_context(struct si_screen *sscreen, struct si_aux_context *ctx);
void si_put_aux_context_flush(struct si_aux_context *ctx);