From d1c57f742e3864c3021dfda6537d0f208ac09a9c Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Mon, 20 Apr 2026 14:50:39 +0200 Subject: [PATCH] radeonsi/gfx: add si_gfx_screen.c And move code specific to gfx/compute from radeonsi_screen_create_impl there. ac_init_llvm_once has to stay in si_pipe.c because it has to be called very early to avoid conflicts with u_queue initialisation. Reviewed-by: David Rosca Reviewed-by: Qiang Yu Part-of: --- src/gallium/drivers/radeonsi/gfx/si_gfx.h | 4 + .../drivers/radeonsi/gfx/si_gfx_screen.c | 611 ++++++++++++++++++ src/gallium/drivers/radeonsi/meson.build | 1 + src/gallium/drivers/radeonsi/si_get.c | 67 +- src/gallium/drivers/radeonsi/si_pipe.c | 545 +--------------- src/gallium/drivers/radeonsi/si_pipe.h | 4 + 6 files changed, 633 insertions(+), 599 deletions(-) create mode 100644 src/gallium/drivers/radeonsi/gfx/si_gfx_screen.c diff --git a/src/gallium/drivers/radeonsi/gfx/si_gfx.h b/src/gallium/drivers/radeonsi/gfx/si_gfx.h index d3e62d4fac3..5c38c2f3e35 100644 --- a/src/gallium/drivers/radeonsi/gfx/si_gfx.h +++ b/src/gallium/drivers/radeonsi/gfx/si_gfx.h @@ -18,6 +18,10 @@ struct si_screen; struct si_shader; struct si_shader_selector; +/* si_gfx_screen.c */ +MESAPROC bool si_init_gfx_screen(struct si_screen *sscreen) TAILBT; +MESAPROC void si_fini_gfx_screen(struct si_screen *sscreen) TAILV; + /* si_shader_cache.c */ MESAPROC void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, unsigned wave_size, unsigned char ir_blake3_cache_key[BLAKE3_KEY_LEN]) TAILV; diff --git a/src/gallium/drivers/radeonsi/gfx/si_gfx_screen.c b/src/gallium/drivers/radeonsi/gfx/si_gfx_screen.c new file mode 100644 index 00000000000..ba5033d8604 --- /dev/null +++ b/src/gallium/drivers/radeonsi/gfx/si_gfx_screen.c @@ -0,0 +1,611 @@ +/* + * Copyright 2026 Advanced Micro Devices, Inc. 
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#include "si_gfx.h"
+#include "si_pipe.h"
+#include "compiler/nir/nir.h"
+#include "ac_shader_util.h"
+#include "ac_shadowed_regs.h"
+#include "util/disk_cache.h"
+#include "aco_interface.h"
+#include "util/hex.h"
+#include "util/u_cpu_detect.h"
+
+/* NOTE(review): the header names of the three bare #include lines in this
+ * file were lost when this copy of the patch was extracted. Restore them from
+ * the upstream commit; the code below calls stdio and string functions.
+ */
+#include
+#include
+
+#if AMD_LLVM_AVAILABLE
+#include "ac_llvm_util.h"
+#endif
+
+#include
+
+/* Flags accepted by AMD_DEBUG that end up in si_screen::shader_debug_flags. */
+static const struct debug_named_value radeonsi_shader_debug_options[] = {
+   /* Shader logging options: */
+   {"vs", DBG(VS), "Print vertex shaders"},
+   {"ps", DBG(PS), "Print pixel shaders"},
+   {"gs", DBG(GS), "Print geometry shaders"},
+   {"tcs", DBG(TCS), "Print tessellation control shaders"},
+   {"tes", DBG(TES), "Print tessellation evaluation shaders"},
+   {"cs", DBG(CS), "Print compute shaders"},
+   {"ts", DBG(TS), "Print task shaders"},
+   {"ms", DBG(MS), "Print mesh shaders"},
+
+   {"initnir", DBG(INIT_NIR), "Print initial input NIR when shaders are created"},
+   {"nir", DBG(NIR), "Print final NIR after lowering when shader variants are created"},
+   {"initllvm", DBG(INIT_LLVM), "Print initial LLVM IR before optimizations"},
+   {"llvm", DBG(LLVM), "Print final LLVM IR"},
+   {"initaco", DBG(INIT_ACO), "Print initial ACO IR before optimizations"},
+   {"aco", DBG(ACO), "Print final ACO IR"},
+   {"asm", DBG(ASM), "Print final shaders in asm"},
+   {"stats", DBG(STATS), "Print shader-db stats to stderr"},
+
+   /* Shader compiler options the shader cache should be aware of: */
+   {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
+   {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
+   {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
+   {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
+   {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
+   {"w64cs", DBG(W64_CS), "Use Wave64 for computes shaders."},
+
+   /* Shader compiler options (with no effect on the shader cache): */
+   {"checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR"},
+   {"mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand"},
+   {"nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants."},
+   {"usellvm", DBG(USE_LLVM), "Use LLVM as shader compiler when possible"},
+
+   DEBUG_NAMED_VALUE_END /* must be last */
+};
+
+/* Cache the GS table depth reported by ac_get_gs_table_depth() for this chip. */
+static void si_init_gs_info(struct si_screen *sscreen)
+{
+   sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.gfx_level, sscreen->info.family);
+}
+
+/* Decode 2 * length hexadecimal digits from "in" into "length" bytes in "out".
+ * The first digit of each pair is the high nibble. The input is not validated.
+ */
+static void
+parse_hex(char *out, const char *in, unsigned length)
+{
+   for (unsigned i = 0; i < length; ++i)
+      out[i] = 0;
+
+   for (unsigned i = 0; i < 2 * length; ++i) {
+      unsigned v = in[i] <= '9' ? in[i] - '0' : (in[i] >= 'a' ? (in[i] - 'a' + 10) : (in[i] - 'A' + 10));
+      out[i / 2] |= v << (4 * (1 - i % 2));
+   }
+}
+
+/* Create si_screen::disk_shader_cache.
+ *
+ * The cache id is a BLAKE3 hash of the driver build identifier, the LLVM
+ * build identifier (when LLVM is available) and si_screen::use_aco.
+ * No cache is created when shader dumping is enabled or when a build
+ * identifier cannot be obtained.
+ *
+ * Reads si_screen::shader_debug_flags and si_screen::use_aco, so both must be
+ * assigned before this is called.
+ */
+static void si_disk_cache_create(struct si_screen *sscreen)
+{
+   /* Don't use the cache if shader dumping is enabled. */
+   if (sscreen->shader_debug_flags & DBG_ALL_SHADERS)
+      return;
+
+   blake3_hasher ctx;
+   unsigned char blake3[BLAKE3_KEY_LEN];
+   char cache_id[BLAKE3_HEX_LEN];
+
+   _mesa_blake3_init(&ctx);
+
+#ifdef RADEONSI_BUILD_ID_OVERRIDE
+   {
+      unsigned size = strlen(RADEONSI_BUILD_ID_OVERRIDE) / 2;
+      char *data = alloca(size);
+      parse_hex(data, RADEONSI_BUILD_ID_OVERRIDE, size);
+      _mesa_blake3_update(&ctx, data, size);
+   }
+#else
+   if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx))
+      return;
+#endif
+
+#if AMD_LLVM_AVAILABLE
+   if (!disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx))
+      return;
+#endif
+
+   /* NIR options depend on si_screen::use_aco, which affects all shaders, including GLSL
+    * compilation.
+    */
+   _mesa_blake3_update(&ctx, &sscreen->use_aco, sizeof(sscreen->use_aco));
+
+   _mesa_blake3_final(&ctx, blake3);
+   mesa_bytes_to_hex(cache_id, blake3, BLAKE3_KEY_LEN);
+
+   sscreen->disk_shader_cache = disk_cache_create(ac_get_family_name(sscreen->info.family),
+                                                  cache_id, sscreen->info.address32_hi);
+}
+
+/* pipe_screen::set_max_shader_compiler_threads: resize the main compiler queue. */
+static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, unsigned max_threads)
+{
+   struct si_screen *sscreen = (struct si_screen *)screen;
+
+   /* This function doesn't allow a greater number of threads than
+    * the queue had at its creation. */
+   util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, max_threads, false);
+   /* Don't change the number of threads on the low priority queue. */
+}
+
+/* pipe_screen::is_parallel_shader_compilation_finished: report whether the
+ * selector's "ready" fence is signalled. "screen" and "shader_type" are unused.
+ */
+static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader,
+                                                       mesa_shader_stage shader_type)
+{
+   struct si_shader_selector *sel = (struct si_shader_selector *)shader;
+
+   return util_queue_fence_is_signalled(&sel->ready);
+}
+
+/* Parse AMD_FORCE_SHADER_USE_ACO.
+ *
+ * Does nothing beyond resetting use_aco_shader_type when ACO is already the
+ * default compiler or is not supported. If an allocation fails, the hashes
+ * collected so far are kept and parsing stops.
+ */
+static void si_setup_force_shader_use_aco(struct si_screen *sscreen, bool support_aco)
+{
+   /* Usage:
+    *   1. shader type: vs|tcs|tes|gs|ps|cs, specify a class of shaders to use aco
+    *   2. shader blake: specify a single shader blake directly to use aco
+    *   3. filename: specify a file which contains shader blakes in lines
+    */
+
+   sscreen->use_aco_shader_type = MESA_SHADER_NONE;
+
+   if (sscreen->use_aco || !support_aco)
+      return;
+
+   const char *option = debug_get_option("AMD_FORCE_SHADER_USE_ACO", NULL);
+   if (!option)
+      return;
+
+   if (!strcmp("vs", option)) {
+      sscreen->use_aco_shader_type = MESA_SHADER_VERTEX;
+      return;
+   } else if (!strcmp("tcs", option)) {
+      sscreen->use_aco_shader_type = MESA_SHADER_TESS_CTRL;
+      return;
+   } else if (!strcmp("tes", option)) {
+      sscreen->use_aco_shader_type = MESA_SHADER_TESS_EVAL;
+      return;
+   } else if (!strcmp("gs", option)) {
+      sscreen->use_aco_shader_type = MESA_SHADER_GEOMETRY;
+      return;
+   } else if (!strcmp("ps", option)) {
+      sscreen->use_aco_shader_type = MESA_SHADER_FRAGMENT;
+      return;
+   } else if (!strcmp("cs", option)) {
+      sscreen->use_aco_shader_type = MESA_SHADER_COMPUTE;
+      return;
+   }
+
+   blake3_hash blake;
+   if (_mesa_blake3_from_printed_string(blake, option)) {
+      sscreen->use_aco_shader_blakes = MALLOC(sizeof(blake));
+      /* Leave the list empty rather than writing through a NULL pointer. */
+      if (!sscreen->use_aco_shader_blakes)
+         return;
+
+      memcpy(sscreen->use_aco_shader_blakes[0], blake, sizeof(blake));
+      sscreen->num_use_aco_shader_blakes = 1;
+      return;
+   }
+
+   FILE *f = fopen(option, "r");
+   if (!f) {
+      mesa_loge("invalid AMD_FORCE_SHADER_USE_ACO value");
+      return;
+   }
+
+   unsigned max_size = 16 * sizeof(blake3_hash);
+   sscreen->use_aco_shader_blakes = MALLOC(max_size);
+   if (!sscreen->use_aco_shader_blakes) {
+      fclose(f);
+      return;
+   }
+
+   char line[1024];
+   while (fgets(line, sizeof(line), f)) {
+      if (sscreen->num_use_aco_shader_blakes * sizeof(blake3_hash) >= max_size) {
+         /* Grow through a temporary so the existing array isn't lost on failure. */
+         void *grown = REALLOC(sscreen->use_aco_shader_blakes, max_size, max_size * 2);
+         if (!grown)
+            break;
+
+         sscreen->use_aco_shader_blakes = grown;
+         max_size *= 2;
+      }
+
+      if (line[BLAKE3_PRINTED_LEN] == '\n')
+         line[BLAKE3_PRINTED_LEN] = 0;
+
+      if (_mesa_blake3_from_printed_string(
+             sscreen->use_aco_shader_blakes[sscreen->num_use_aco_shader_blakes], line))
+         sscreen->num_use_aco_shader_blakes++;
+   }
+
+   fclose(f);
+}
+
+/* Return true if the marketing name contains "Pro", "PRO" or "Frontier". */
+static bool
+is_pro_graphics(struct si_screen *sscreen)
+{
+   return strstr(sscreen->info.marketing_name, "Pro") ||
+          strstr(sscreen->info.marketing_name, "PRO") ||
+          strstr(sscreen->info.marketing_name, "Frontier");
+}
+
+/* pipe_screen::is_compute_copy_faster: only the copy volume and "cpu" are
+ * considered; the formats are ignored.
+ */
+static bool
+si_is_compute_copy_faster(struct pipe_screen *pscreen,
+                          enum pipe_format src_format,
+                          enum pipe_format dst_format,
+                          unsigned width,
+                          unsigned height,
+                          unsigned depth,
+                          bool cpu)
+{
+   if (cpu)
+      /* very basic for now */
+      return (uint64_t)width * height * depth > 64 * 64;
+   return false;
+}
+
+/* pipe_screen::driver_thread_add_job: queue the job on the main compiler queue. */
+static void
+si_driver_thread_add_job(struct pipe_screen *screen, void *data,
+                         struct util_queue_fence *fence,
+                         pipe_driver_thread_func execute,
+                         pipe_driver_thread_func cleanup,
+                         const size_t job_size)
+{
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   util_queue_add_job(&sscreen->shader_compiler_queue, data, fence, execute, cleanup, job_size);
+}
+
+/* pipe_screen::get_disk_shader_cache. */
+static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen)
+{
+   struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+   return sscreen->disk_shader_cache;
+}
+
+/* Initialize the gfx/compute part of the screen.
+ *
+ * Sets si_screen::has_gfx_compute and returns true without doing anything
+ * else when neither ACO nor LLVM supports the chip. Returns false on failure,
+ * after releasing nir_options, the shader cache, the glsl type reference and
+ * the compiler queues created here.
+ */
+bool si_init_gfx_screen(struct si_screen *sscreen) {
+   unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads;
+   const bool support_aco = aco_is_gpu_supported(&sscreen->info);
+   bool support_llvm = false;
+
+#if AMD_LLVM_AVAILABLE
+   support_llvm = strlen(ac_get_llvm_processor_name(sscreen->info.family)) != 0;
+#endif
+
+   sscreen->has_gfx_compute = support_aco || support_llvm;
+
+   if (!sscreen->has_gfx_compute)
+      return true;
+
+   ac_get_task_info(&sscreen->info, &sscreen->task_info);
+
+   if (sscreen->info.gfx_level >= GFX11) {
+      sscreen->use_ngg = true;
+      sscreen->use_ngg_culling = sscreen->info.max_render_backends >= 2 &&
+                                 !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
+   } else {
+      sscreen->use_ngg = !(sscreen->debug_flags & DBG(NO_NGG)) &&
+                         sscreen->info.gfx_level >= GFX10 &&
+                         (sscreen->info.family != CHIP_NAVI14 ||
+                          is_pro_graphics(sscreen));
+      sscreen->use_ngg_culling = sscreen->use_ngg &&
+                                 sscreen->info.max_render_backends >= 2 &&
+                                 !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
+   }
+
+   sscreen->has_draw_indirect_multi =
+      (sscreen->info.family >= CHIP_POLARIS10) ||
+      (sscreen->info.gfx_level == GFX8 && sscreen->info.pfp_fw_version >= 121 &&
+       sscreen->info.me_fw_version >= 87) ||
+      (sscreen->info.gfx_level == GFX7 && sscreen->info.pfp_fw_version >= 211 &&
+       sscreen->info.me_fw_version >= 173) ||
+      (sscreen->info.gfx_level == GFX6 && sscreen->info.pfp_fw_version >= 79 &&
+       sscreen->info.me_fw_version >= 142);
+
+   si_driver_ds_init();
+
+   sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
+   sscreen->b.is_compute_copy_faster = si_is_compute_copy_faster;
+   sscreen->b.driver_thread_add_job = si_driver_thread_add_job;
+
+   sscreen->context_roll_log_filename = debug_get_option("AMD_ROLLS", NULL);
+   sscreen->shader_debug_flags = debug_get_flags_option("AMD_DEBUG", radeonsi_shader_debug_options, 0);
+
+   if (sscreen->debug_flags & DBG(NO_DISPLAY_DCC)) {
+      sscreen->info.use_display_dcc_unaligned = false;
+      sscreen->info.use_display_dcc_with_retile_blit = false;
+   }
+
+   /* Using the environment variable doesn't enable PAIRS packets for simplicity. */
+   if ((sscreen->debug_flags & DBG(SHADOW_REGS)) &&
+       !(sscreen->info.userq_ip_mask & (1 << AMD_IP_GFX)))
+      sscreen->info.has_kernelq_reg_shadowing = true;
+
+#if AMD_LLVM_AVAILABLE
+   sscreen->use_aco = support_aco && sscreen->info.has_image_opcodes &&
+                      !(sscreen->shader_debug_flags & DBG(USE_LLVM));
+#else
+   sscreen->use_aco = true;
+#endif
+
+   if (sscreen->use_aco && !support_aco) {
+      mesa_loge("ACO does not support this chip yet");
+      return false;
+   }
+
+   /* The disk cache must be created after shader_debug_flags and use_aco are
+    * known: si_disk_cache_create() checks the former and hashes the latter
+    * into the cache id.
+    */
+   si_disk_cache_create(sscreen);
+
+   si_setup_force_shader_use_aco(sscreen, support_aco);
+
+   sscreen->b.set_max_shader_compiler_threads = si_set_max_shader_compiler_threads;
+   sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished;
+   sscreen->b.finalize_nir = si_finalize_nir;
+
+   sscreen->nir_options = CALLOC_STRUCT(nir_shader_compiler_options);
+   if (!sscreen->nir_options)
+      return false;
+
+   si_init_screen_state_functions(sscreen);
+   si_init_screen_query_functions(sscreen);
+   si_init_screen_live_shader_cache(sscreen);
+
+   si_init_screen_nir_options(sscreen);
+   si_init_shader_caps(sscreen);
+   si_init_compute_caps(sscreen);
+   si_init_gfx_caps(sscreen);
+   if (sscreen->b.caps.mesh_shader)
+      si_init_mesh_caps(sscreen);
+
+   sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
+   if (sscreen->force_aniso == -1) {
+      sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1));
+   }
+
+   if (sscreen->force_aniso >= 0) {
+      printf("radeonsi: Forcing anisotropy filter to %ix\n",
+             /* round down to a power of two */
+             1 << util_logbase2(sscreen->force_aniso));
+   }
+
+   (void)simple_mtx_init(&sscreen->async_compute_context_lock, mtx_plain);
+   (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
+   (void)simple_mtx_init(&sscreen->gds_mutex, mtx_plain);
+   (void)simple_mtx_init(&sscreen->tess_ring_lock, mtx_plain);
+
+   si_init_gs_info(sscreen);
+   if (!si_init_shader_cache(sscreen)) {
+      FREE(sscreen->nir_options);
+      return false;
+   }
+
+   if (sscreen->info.gfx_level < GFX10_3)
+      sscreen->options.vrs2x2 = false;
+
+   /* Determine the number of shader compiler threads. */
+   const struct util_cpu_caps_t *caps = util_get_cpu_caps();
+   hw_threads = caps->nr_cpus;
+
+   if (hw_threads >= 12) {
+      num_comp_hi_threads = hw_threads * 3 / 4;
+      num_comp_lo_threads = hw_threads / 3;
+   } else if (hw_threads >= 6) {
+      num_comp_hi_threads = hw_threads - 2;
+      num_comp_lo_threads = hw_threads / 2;
+   } else if (hw_threads >= 2) {
+      num_comp_hi_threads = hw_threads - 1;
+      num_comp_lo_threads = hw_threads / 2;
+   } else {
+      num_comp_hi_threads = 1;
+      num_comp_lo_threads = 1;
+   }
+
+#if !defined(NDEBUG)
+   nir_process_debug_variable();
+
+   /* Use a single compilation thread if NIR printing is enabled to avoid
+    * multiple shaders being printed at the same time.
+    */
+   if (NIR_DEBUG(PRINT)) {
+      num_comp_hi_threads = 1;
+      num_comp_lo_threads = 1;
+   }
+#endif
+
+   num_comp_hi_threads = MIN2(num_comp_hi_threads, ARRAY_SIZE(sscreen->compiler));
+   num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp));
+
+   /* Take a reference on the glsl types for the compiler threads. */
+   glsl_type_singleton_init_or_ref();
+
+   /* Start with a single thread and a single slot.
+    * Each time we'll hit the "all slots are in use" case, the number of threads and
+    * slots will be increased.
+    */
+   int num_slots = num_comp_hi_threads == 1 ? 64 : 1;
+   if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", num_slots,
+                        num_comp_hi_threads,
+                        UTIL_QUEUE_INIT_RESIZE_IF_FULL |
+                        UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
+      si_destroy_shader_cache(sscreen);
+      FREE(sscreen->nir_options);
+      glsl_type_singleton_decref();
+      return false;
+   }
+
+   if (!util_queue_init(&sscreen->shader_compiler_queue_opt_variants, "sh_opt", num_slots,
+                        num_comp_lo_threads,
+                        UTIL_QUEUE_INIT_RESIZE_IF_FULL |
+                        UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
+      /* The main queue was created just above; don't leak its threads. */
+      util_queue_destroy(&sscreen->shader_compiler_queue);
+      si_destroy_shader_cache(sscreen);
+      FREE(sscreen->nir_options);
+      glsl_type_singleton_decref();
+      return false;
+   }
+
+   if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
+      si_init_perfcounters(sscreen);
+
+   if (sscreen->debug_flags & DBG(NO_OUT_OF_ORDER))
+      sscreen->info.has_out_of_order_rast = false;
+
+   /* Only set this for the cases that are known to work, which are:
+    * - GFX9 if bpp >= 4 (in bytes)
+    */
+   if (sscreen->info.gfx_level >= GFX10) {
+      memset(sscreen->allow_dcc_msaa_clear_to_reg_for_bpp, true,
+             sizeof(sscreen->allow_dcc_msaa_clear_to_reg_for_bpp));
+   } else if (sscreen->info.gfx_level == GFX9) {
+      for (unsigned bpp_log2 = util_logbase2(1); bpp_log2 <= util_logbase2(16); bpp_log2++)
+         sscreen->allow_dcc_msaa_clear_to_reg_for_bpp[bpp_log2] = true;
+   }
+
+   /* DCC stores have 50% performance of uncompressed stores and sometimes
+    * even less than that. It's risky to enable on dGPUs.
+    */
+   sscreen->always_allow_dcc_stores = !(sscreen->debug_flags & DBG(NO_DCC_STORE)) &&
+                                      (sscreen->debug_flags & DBG(DCC_STORE) ||
+                                       sscreen->info.gfx_level >= GFX11 || /* always enabled on gfx11 */
+                                       (sscreen->info.gfx_level >= GFX10_3 &&
+                                        !sscreen->info.has_dedicated_vram));
+
+   sscreen->dpbb_allowed = !(sscreen->debug_flags & DBG(NO_DPBB)) &&
+                           (sscreen->info.gfx_level >= GFX10 ||
+                            /* Only enable primitive binning on gfx9 APUs by default. */
+                            (sscreen->info.gfx_level == GFX9 && !sscreen->info.has_dedicated_vram) ||
+                            sscreen->debug_flags & DBG(DPBB));
+
+   if (sscreen->dpbb_allowed) {
+      if ((sscreen->info.has_dedicated_vram && sscreen->info.max_render_backends > 4) ||
+          sscreen->info.gfx_level >= GFX10) {
+         /* Only bin draws that have no CONTEXT and SH register changes between
+          * them because higher settings cause hangs. We've only been able to
+          * reproduce hangs on smaller chips (e.g. Navi24, Phoenix), though all
+          * chips might have them. What we see may be due to a driver bug.
+          */
+         sscreen->pbb_context_states_per_bin = 1;
+         sscreen->pbb_persistent_states_per_bin = 1;
+      } else {
+         /* This is a workaround for:
+          *    https://bugs.freedesktop.org/show_bug.cgi?id=110214
+          * (an alternative is to insert manual BATCH_BREAK event when
+          *  a context_roll is detected). */
+         sscreen->pbb_context_states_per_bin = sscreen->info.has_gfx9_scissor_bug ? 1 : 3;
+         sscreen->pbb_persistent_states_per_bin = 8;
+      }
+
+      if (!sscreen->info.has_gfx9_scissor_bug)
+         sscreen->pbb_context_states_per_bin =
+            debug_get_num_option("AMD_DEBUG_DPBB_CS", sscreen->pbb_context_states_per_bin);
+      sscreen->pbb_persistent_states_per_bin =
+         debug_get_num_option("AMD_DEBUG_DPBB_PS", sscreen->pbb_persistent_states_per_bin);
+
+      assert(sscreen->pbb_context_states_per_bin >= 1 &&
+             sscreen->pbb_context_states_per_bin <= 6);
+      assert(sscreen->pbb_persistent_states_per_bin >= 1 &&
+             sscreen->pbb_persistent_states_per_bin <= 32);
+   }
+
+   (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
+   sscreen->use_monolithic_shaders =
+      (sscreen->shader_debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
+
+   if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
+      sscreen->shader_debug_flags |= DBG_ALL_SHADERS;
+
+   /* Syntax:
+    *     EQAA=s,z,c
+    * Example:
+    *     EQAA=8,4,2
+    *
+    * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
+    * Constraints:
+    *     s >= z >= c (ignoring this only wastes memory)
+    *     s = [2..16]
+    *     z = [2..8]
+    *     c = [2..8]
+    *
+    * Only MSAA color and depth buffers are overridden.
+    */
+   if (sscreen->info.has_eqaa_surface_allocator) {
+      const char *eqaa = debug_get_option("EQAA", NULL);
+      unsigned s, z, f;
+
+      if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
+         sscreen->eqaa_force_coverage_samples = s;
+         sscreen->eqaa_force_z_samples = z;
+         sscreen->eqaa_force_color_samples = f;
+      }
+   }
+
+   if (sscreen->info.gfx_level >= GFX11) {
+      sscreen->attribute_pos_prim_ring =
+         si_aligned_buffer_create(&sscreen->b,
+                                  PIPE_RESOURCE_FLAG_UNMAPPABLE |
+                                  SI_RESOURCE_FLAG_32BIT |
+                                  SI_RESOURCE_FLAG_DRIVER_INTERNAL |
+                                  SI_RESOURCE_FLAG_DISCARDABLE,
+                                  PIPE_USAGE_DEFAULT,
+                                  sscreen->info.total_attribute_pos_prim_ring_size,
+                                  2 * 1024 * 1024);
+   }
+
+   ac_print_nonshadowed_regs(sscreen->info.gfx_level, sscreen->info.family);
+
+   return true;
+}
+
+/* Release the gfx/compute state owned by the screen.
+ * Does nothing when si_screen::has_gfx_compute is false.
+ */
+void si_fini_gfx_screen(struct si_screen *sscreen) {
+   struct si_shader_part *parts[] = {sscreen->ps_prologs, sscreen->ps_epilogs};
+   unsigned i;
+
+   if (!sscreen->has_gfx_compute)
+      return;
+
+   if (sscreen->debug_flags & DBG(CACHE_STATS)) {
+      printf("live shader cache:   hits = %u, misses = %u\n", sscreen->live_shader_cache.hits,
+             sscreen->live_shader_cache.misses);
+      printf("memory shader cache: hits = %u, misses = %u\n", sscreen->num_memory_shader_cache_hits,
+             sscreen->num_memory_shader_cache_misses);
+      printf("disk shader cache:   hits = %u, misses = %u\n", sscreen->num_disk_shader_cache_hits,
+             sscreen->num_disk_shader_cache_misses);
+   }
+
+   si_resource_reference(&sscreen->attribute_pos_prim_ring, NULL);
+   si_resource_reference(&sscreen->attribute_pos_prim_ring_tmz, NULL);
+   pipe_resource_reference(&sscreen->tess_rings, NULL);
+   pipe_resource_reference(&sscreen->tess_rings_tmz, NULL);
+
+   util_queue_destroy(&sscreen->shader_compiler_queue);
+   util_queue_destroy(&sscreen->shader_compiler_queue_opt_variants);
+
+   simple_mtx_destroy(&sscreen->async_compute_context_lock);
+   if (sscreen->async_compute_context)
+      sscreen->async_compute_context->destroy(sscreen->async_compute_context);
+
+   /* Release the reference on glsl types of the compiler threads. */
+   glsl_type_singleton_decref();
+
+   for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) {
+      if (sscreen->compiler[i])
+         si_destroy_llvm_compiler(sscreen->compiler[i]);
+   }
+
+   for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++) {
+      if (sscreen->compiler_lowp[i])
+         si_destroy_llvm_compiler(sscreen->compiler_lowp[i]);
+   }
+
+   /* Free shader parts. */
+   for (i = 0; i < ARRAY_SIZE(parts); i++) {
+      while (parts[i]) {
+         struct si_shader_part *part = parts[i];
+
+         parts[i] = part->next;
+         si_shader_binary_clean(&part->binary);
+         FREE(part);
+      }
+   }
+   simple_mtx_destroy(&sscreen->shader_parts_mutex);
+   si_destroy_shader_cache(sscreen);
+
+   si_destroy_perfcounters(sscreen);
+   si_gpu_load_kill_thread(sscreen);
+
+   simple_mtx_destroy(&sscreen->gpu_load_mutex);
+   simple_mtx_destroy(&sscreen->gds_mutex);
+   simple_mtx_destroy(&sscreen->tess_ring_lock);
+
+   radeon_bo_reference(sscreen->ws, &sscreen->gds_oa, NULL);
+
+   disk_cache_destroy(sscreen->disk_shader_cache);
+   util_vertex_state_cache_deinit(&sscreen->vertex_state_cache);
+
+   util_live_shader_cache_deinit(&sscreen->live_shader_cache);
+
+   FREE(sscreen->use_aco_shader_blakes);
+   FREE(sscreen->nir_options);
+}
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 0d36bfc72d8..1eca71c3480 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -119,6 +119,7 @@ endif
 radeonsi_gfx_libs = []
 if with_gfx_compute
   files_libradeonsi+= files(
+    'gfx/si_gfx_screen.c',
     'si_blit.c',
     'si_cp_reg_shadowing.c',
     'si_compute_blit.c',
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 6a2ede35bad..b413218c1d8 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c @@ -25,21 +25,6 @@ static const char *si_get_device_vendor(struct pipe_screen *pscreen) return "AMD"; } -static bool -si_is_compute_copy_faster(struct pipe_screen *pscreen, - enum pipe_format src_format, - enum pipe_format dst_format, - unsigned width, - unsigned height, - unsigned depth, - bool cpu) -{ - if (cpu) - /* very basic for now */ - return (uint64_t)width * height * depth > 64 * 64; - return false; -} - static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid) { ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); @@ -103,14 +88,7 @@ static void si_query_memory_info(struct pipe_screen *screen, struct pipe_memory_ info->nr_device_memory_evictions = info->device_memory_evicted / 64; } -static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen) -{ - struct si_screen *sscreen = (struct si_screen *)pscreen; - - return sscreen->disk_shader_cache; -} - -static void si_init_renderer_string(struct si_screen *sscreen) +void si_init_renderer_string(struct si_screen *sscreen) { char first_name[256], second_name[32] = {}, kernel_version[128] = {}; struct utsname uname_data; @@ -160,18 +138,6 @@ static unsigned si_varying_expression_max_cost(nir_shader *producer, nir_shader return ac_nir_varying_expression_max_cost(producer, consumer); } - -static void -si_driver_thread_add_job(struct pipe_screen *screen, void *data, - struct util_queue_fence *fence, - pipe_driver_thread_func execute, - pipe_driver_thread_func cleanup, - const size_t job_size) -{ - struct si_screen *sscreen = (struct si_screen *)screen; - util_queue_add_job(&sscreen->shader_compiler_queue, data, fence, execute, cleanup, job_size); -} - static bool enable_mesh_shader(struct si_screen *sscreen) { return sscreen->use_ngg && @@ -208,15 +174,10 @@ void si_init_screen_get_functions(struct si_screen *sscreen) sscreen->b.get_vendor = si_get_vendor; sscreen->b.get_device_vendor = si_get_device_vendor; sscreen->b.get_screen_fd = 
si_get_screen_fd; - sscreen->b.is_compute_copy_faster = si_is_compute_copy_faster; - sscreen->b.driver_thread_add_job = si_driver_thread_add_job; sscreen->b.get_timestamp = si_get_timestamp; sscreen->b.get_device_uuid = si_get_device_uuid; sscreen->b.get_driver_uuid = si_get_driver_uuid; sscreen->b.query_memory_info = si_query_memory_info; - sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache; - - si_init_renderer_string(sscreen); } void si_init_screen_nir_options(struct si_screen *sscreen) @@ -408,7 +369,7 @@ void si_init_compute_caps(struct si_screen *sscreen) sscreen->info.compiler_info.has_cs_regalloc_hang_bug ? 256 : SI_MAX_VARIABLE_THREADS_PER_BLOCK; } -static void si_init_mesh_caps(struct si_screen *sscreen) +void si_init_mesh_caps(struct si_screen *sscreen) { struct pipe_mesh_caps *caps = (struct pipe_mesh_caps *)&sscreen->b.caps.mesh; @@ -459,7 +420,7 @@ static void si_init_mesh_caps(struct si_screen *sscreen) caps->pipeline_statistic_queries = sscreen->info.gfx_level >= GFX11; } -static void si_init_gfx_caps(struct si_screen *sscreen) +void si_init_gfx_caps(struct si_screen *sscreen) { struct pipe_caps *caps = (struct pipe_caps *)&sscreen->b.caps; @@ -581,7 +542,7 @@ static void si_init_gfx_caps(struct si_screen *sscreen) caps->fbfetch = 1; caps->graphics = sscreen->info.has_graphics; - caps->mesh_shader = enable_mesh_shader(sscreen); + caps->mesh_shader = sscreen->b.nir_options[MESA_SHADER_MESH]; caps->compute = sscreen->has_gfx_compute; /* Tahiti and Verde only: reduction mode is unsupported due to a bug @@ -609,14 +570,6 @@ static void si_init_gfx_caps(struct si_screen *sscreen) caps->post_depth_coverage = sscreen->info.gfx_level >= GFX10; -#ifdef HAVE_GFX_COMPUTE - caps->graphics = sscreen->info.has_graphics; - caps->mesh_shader = sscreen->b.nir_options[MESA_SHADER_MESH]; - caps->compute = true; -#else - caps->graphics = caps->mesh_shader = caps->compute = false; -#endif - caps->max_vertex_buffers = SI_MAX_ATTRIBS; 
caps->constant_buffer_offset_alignment = @@ -758,6 +711,11 @@ static void si_init_gfx_caps(struct si_screen *sscreen) * KHR-GL46.texture_lod_bias.texture_lod_bias_all */ caps->max_texture_lod_bias = 16; + + /* Override the value set by u_init_pipe_screen_caps because it was called + * before shader caps are set. + */ + caps->hardware_gl_select = debug_get_bool_option("MESA_HW_ACCEL_SELECT", true); } void si_init_screen_caps(struct si_screen *sscreen) @@ -771,11 +729,7 @@ void si_init_screen_caps(struct si_screen *sscreen) if (sscreen->info.is_virtio) caps->dmabuf |= DRM_PRIME_CAP_EXPORT | DRM_PRIME_CAP_IMPORT; -#ifdef HAVE_GFX_COMPUTE - si_init_gfx_caps(sscreen); -#else caps->graphics = caps->mesh_shader = caps->compute = false; -#endif caps->resource_from_user_memory = !UTIL_ARCH_BIG_ENDIAN && sscreen->info.has_userptr; @@ -808,9 +762,6 @@ void si_init_screen_caps(struct si_screen *sscreen) /* Conversion to nanos from cycles per millisecond */ caps->timer_resolution = DIV_ROUND_UP(1000000, sscreen->info.clock_crystal_freq); - if (caps->mesh_shader) - si_init_mesh_caps(sscreen); - if (sscreen->ws->va_range) sscreen->ws->va_range(sscreen->ws, &caps->min_vma, &caps->max_vma); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index f5e030468ab..0e01219c033 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -95,43 +95,6 @@ static const struct debug_named_value radeonsi_debug_options[] = { DEBUG_NAMED_VALUE_END /* must be last */ }; -static const struct debug_named_value radeonsi_shader_debug_options[] = { - /* Shader logging options: */ - {"vs", DBG(VS), "Print vertex shaders"}, - {"ps", DBG(PS), "Print pixel shaders"}, - {"gs", DBG(GS), "Print geometry shaders"}, - {"tcs", DBG(TCS), "Print tessellation control shaders"}, - {"tes", DBG(TES), "Print tessellation evaluation shaders"}, - {"cs", DBG(CS), "Print compute shaders"}, - {"ts", DBG(TS), "Print task shaders"}, - {"ms", 
DBG(MS), "Print mesh shaders"}, - - {"initnir", DBG(INIT_NIR), "Print initial input NIR when shaders are created"}, - {"nir", DBG(NIR), "Print final NIR after lowering when shader variants are created"}, - {"initllvm", DBG(INIT_LLVM), "Print initial LLVM IR before optimizations"}, - {"llvm", DBG(LLVM), "Print final LLVM IR"}, - {"initaco", DBG(INIT_ACO), "Print initial ACO IR before optimizations"}, - {"aco", DBG(ACO), "Print final ACO IR"}, - {"asm", DBG(ASM), "Print final shaders in asm"}, - {"stats", DBG(STATS), "Print shader-db stats to stderr"}, - - /* Shader compiler options the shader cache should be aware of: */ - {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."}, - {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."}, - {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."}, - {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."}, - {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."}, - {"w64cs", DBG(W64_CS), "Use Wave64 for computes shaders."}, - - /* Shader compiler options (with no effect on the shader cache): */ - {"checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR"}, - {"mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand"}, - {"nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants."}, - {"usellvm", DBG(USE_LLVM), "Use LLVM as shader compiler when possible"}, - - DEBUG_NAMED_VALUE_END /* must be last */ -}; - static const struct debug_named_value test_options[] = { /* Tests: */ {"clearbuffer", DBG(TEST_CLEAR_BUFFER), "Test correctness of the clear_buffer compute shader"}, @@ -179,7 +142,7 @@ void si_init_aux_async_compute_ctx(struct si_screen *sscreen) ((struct si_context*)sscreen->async_compute_context)->cs_max_waves_per_sh = 2; } -static void si_destroy_llvm_compiler(struct ac_llvm_compiler *compiler) +void si_destroy_llvm_compiler(struct ac_llvm_compiler *compiler) { #if AMD_LLVM_AVAILABLE 
ac_destroy_llvm_compiler(compiler); @@ -1026,26 +989,10 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v void si_destroy_screen(struct pipe_screen *pscreen) { struct si_screen *sscreen = (struct si_screen *)pscreen; - struct si_shader_part *parts[] = {sscreen->ps_prologs, sscreen->ps_epilogs}; - unsigned i; if (!sscreen->ws->unref(sscreen->ws)) return; - if (sscreen->debug_flags & DBG(CACHE_STATS)) { - printf("live shader cache: hits = %u, misses = %u\n", sscreen->live_shader_cache.hits, - sscreen->live_shader_cache.misses); - printf("memory shader cache: hits = %u, misses = %u\n", sscreen->num_memory_shader_cache_hits, - sscreen->num_memory_shader_cache_misses); - printf("disk shader cache: hits = %u, misses = %u\n", sscreen->num_disk_shader_cache_hits, - sscreen->num_disk_shader_cache_misses); - } - - si_resource_reference(&sscreen->attribute_pos_prim_ring, NULL); - si_resource_reference(&sscreen->attribute_pos_prim_ring_tmz, NULL); - pipe_resource_reference(&sscreen->tess_rings, NULL); - pipe_resource_reference(&sscreen->tess_rings_tmz, NULL); - for (unsigned i = 0; i < ARRAY_SIZE(sscreen->aux_contexts); i++) { if (!sscreen->aux_contexts[i].ctx) continue; @@ -1063,68 +1010,18 @@ void si_destroy_screen(struct pipe_screen *pscreen) mtx_destroy(&sscreen->aux_contexts[i].lock); } - util_queue_destroy(&sscreen->shader_compiler_queue); - util_queue_destroy(&sscreen->shader_compiler_queue_opt_variants); + si_fini_gfx_screen(sscreen); - simple_mtx_destroy(&sscreen->async_compute_context_lock); - if (sscreen->async_compute_context) { - sscreen->async_compute_context->destroy(sscreen->async_compute_context); - } - - /* Release the reference on glsl types of the compiler threads. 
*/ - glsl_type_singleton_decref(); - - for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) { - if (sscreen->compiler[i]) - si_destroy_llvm_compiler(sscreen->compiler[i]); - } - - for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++) { - if (sscreen->compiler_lowp[i]) - si_destroy_llvm_compiler(sscreen->compiler_lowp[i]); - } - - /* Free shader parts. */ - for (i = 0; i < ARRAY_SIZE(parts); i++) { - while (parts[i]) { - struct si_shader_part *part = parts[i]; - - parts[i] = part->next; - si_shader_binary_clean(&part->binary); - FREE(part); - } - } - simple_mtx_destroy(&sscreen->shader_parts_mutex); - si_destroy_shader_cache(sscreen); - - si_destroy_perfcounters(sscreen); - si_gpu_load_kill_thread(sscreen); - - simple_mtx_destroy(&sscreen->gpu_load_mutex); - simple_mtx_destroy(&sscreen->gds_mutex); - simple_mtx_destroy(&sscreen->tess_ring_lock); simple_mtx_destroy(&sscreen->print_ib_mutex); - radeon_bo_reference(sscreen->ws, &sscreen->gds_oa, NULL); - slab_destroy_parent(&sscreen->pool_transfers); - disk_cache_destroy(sscreen->disk_shader_cache); - util_live_shader_cache_deinit(&sscreen->live_shader_cache); util_idalloc_mt_fini(&sscreen->buffer_ids); - util_vertex_state_cache_deinit(&sscreen->vertex_state_cache); sscreen->ws->destroy(sscreen->ws); - FREE(sscreen->use_aco_shader_blakes); - FREE(sscreen->nir_options); FREE(sscreen); } -static void si_init_gs_info(struct si_screen *sscreen) -{ - sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.gfx_level, sscreen->info.family); -} - static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags) { struct pipe_context *ctx = sscreen->aux_context.general.ctx; @@ -1150,163 +1047,10 @@ static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags) exit(0); } -static void -parse_hex(char *out, const char *in, unsigned length) -{ - for (unsigned i = 0; i < length; ++i) - out[i] = 0; - - for (unsigned i = 0; i < 2 * length; ++i) { - unsigned v = in[i] <= '9' ? 
in[i] - '0' : (in[i] >= 'a' ? (in[i] - 'a' + 10) : (in[i] - 'A' + 10)); - out[i / 2] |= v << (4 * (1 - i % 2)); - } -} - -static void si_disk_cache_create(struct si_screen *sscreen) -{ - /* Don't use the cache if shader dumping is enabled. */ - if (sscreen->shader_debug_flags & DBG_ALL_SHADERS) - return; - - blake3_hasher ctx; - unsigned char blake3[BLAKE3_KEY_LEN]; - char cache_id[BLAKE3_HEX_LEN]; - - _mesa_blake3_init(&ctx); - -#ifdef RADEONSI_BUILD_ID_OVERRIDE - { - unsigned size = strlen(RADEONSI_BUILD_ID_OVERRIDE) / 2; - char *data = alloca(size); - parse_hex(data, RADEONSI_BUILD_ID_OVERRIDE, size); - _mesa_blake3_update(&ctx, data, size); - } -#else - if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx)) - return; -#endif - -#if AMD_LLVM_AVAILABLE - if (!disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx)) - return; -#endif - - /* NIR options depend on si_screen::use_aco, which affects all shaders, including GLSL - * compilation. - */ - _mesa_blake3_update(&ctx, &sscreen->use_aco, sizeof(sscreen->use_aco)); - - _mesa_blake3_final(&ctx, blake3); - mesa_bytes_to_hex(cache_id, blake3, BLAKE3_KEY_LEN); - - sscreen->disk_shader_cache = disk_cache_create(ac_get_family_name(sscreen->info.family), - cache_id, sscreen->info.address32_hi); -} - -static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, unsigned max_threads) -{ - struct si_screen *sscreen = (struct si_screen *)screen; - - /* This function doesn't allow a greater number of threads than - * the queue had at its creation. */ - util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, max_threads, false); - /* Don't change the number of threads on the low priority queue. 
*/ -} - -static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader, - mesa_shader_stage shader_type) -{ - struct si_shader_selector *sel = (struct si_shader_selector *)shader; - - return util_queue_fence_is_signalled(&sel->ready); -} - -static void si_setup_force_shader_use_aco(struct si_screen *sscreen, bool support_aco) -{ - /* Usage: - * 1. shader type: vs|tcs|tes|gs|ps|cs, specify a class of shaders to use aco - * 2. shader blake: specify a single shader blake directly to use aco - * 3. filename: specify a file which contains shader blakes in lines - */ - - sscreen->use_aco_shader_type = MESA_SHADER_NONE; - - if (sscreen->use_aco || !support_aco) - return; - - const char *option = debug_get_option("AMD_FORCE_SHADER_USE_ACO", NULL); - if (!option) - return; - - if (!strcmp("vs", option)) { - sscreen->use_aco_shader_type = MESA_SHADER_VERTEX; - return; - } else if (!strcmp("tcs", option)) { - sscreen->use_aco_shader_type = MESA_SHADER_TESS_CTRL; - return; - } else if (!strcmp("tes", option)) { - sscreen->use_aco_shader_type = MESA_SHADER_TESS_EVAL; - return; - } else if (!strcmp("gs", option)) { - sscreen->use_aco_shader_type = MESA_SHADER_GEOMETRY; - return; - } else if (!strcmp("ps", option)) { - sscreen->use_aco_shader_type = MESA_SHADER_FRAGMENT; - return; - } else if (!strcmp("cs", option)) { - sscreen->use_aco_shader_type = MESA_SHADER_COMPUTE; - return; - } - - blake3_hash blake; - if (_mesa_blake3_from_printed_string(blake, option)) { - sscreen->use_aco_shader_blakes = MALLOC(sizeof(blake)); - memcpy(sscreen->use_aco_shader_blakes[0], blake, sizeof(blake)); - sscreen->num_use_aco_shader_blakes = 1; - return; - } - - FILE *f = fopen(option, "r"); - if (!f) { - mesa_loge("invalid AMD_FORCE_SHADER_USE_ACO value"); - return; - } - - unsigned max_size = 16 * sizeof(blake3_hash); - sscreen->use_aco_shader_blakes = MALLOC(max_size); - - char line[1024]; - while (fgets(line, sizeof(line), f)) { - if 
(sscreen->num_use_aco_shader_blakes * sizeof(blake3_hash) >= max_size) { - sscreen->use_aco_shader_blakes = REALLOC( - sscreen->use_aco_shader_blakes, max_size, max_size * 2); - max_size *= 2; - } - - if (line[BLAKE3_PRINTED_LEN] == '\n') - line[BLAKE3_PRINTED_LEN] = 0; - - if (_mesa_blake3_from_printed_string( - sscreen->use_aco_shader_blakes[sscreen->num_use_aco_shader_blakes], line)) - sscreen->num_use_aco_shader_blakes++; - } - - fclose(f); -} - -static bool -is_pro_graphics(struct si_screen *sscreen) -{ - return strstr(sscreen->info.marketing_name, "Pro") || - strstr(sscreen->info.marketing_name, "PRO") || - strstr(sscreen->info.marketing_name, "Frontier"); -} - static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, const struct pipe_screen_config *config) { struct si_screen *sscreen = CALLOC_STRUCT(si_screen); - unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads; uint64_t test_flags; if (!sscreen) { @@ -1324,41 +1068,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->ws = ws; ws->query_info(ws, &sscreen->info); - sscreen->context_roll_log_filename = debug_get_option("AMD_ROLLS", NULL); sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", radeonsi_debug_options, 0); sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", radeonsi_debug_options, 0); - sscreen->shader_debug_flags = debug_get_flags_option("AMD_DEBUG", radeonsi_shader_debug_options, 0); test_flags = debug_get_flags_option("AMD_TEST", test_options, 0); - if (sscreen->debug_flags & DBG(NO_DISPLAY_DCC)) { - sscreen->info.use_display_dcc_unaligned = false; - sscreen->info.use_display_dcc_with_retile_blit = false; - } - - /* Using the environment variable doesn't enable PAIRS packets for simplicity. 
*/ - if ((sscreen->debug_flags & DBG(SHADOW_REGS)) && - !(sscreen->info.userq_ip_mask & (1 << AMD_IP_GFX))) - sscreen->info.has_kernelq_reg_shadowing = true; - -#ifdef HAVE_GFX_COMPUTE - bool support_aco = aco_is_gpu_supported(&sscreen->info); - -#if AMD_LLVM_AVAILABLE - sscreen->use_aco = support_aco && sscreen->info.has_image_opcodes && - !(sscreen->shader_debug_flags & DBG(USE_LLVM)); -#else - sscreen->use_aco = true; -#endif - - if (sscreen->use_aco && !support_aco) { - mesa_loge("ACO does not support this chip yet"); - FREE(sscreen); - return NULL; - } - - si_setup_force_shader_use_aco(sscreen, support_aco); -#endif - if ((sscreen->debug_flags & DBG(TMZ)) && !sscreen->info.has_tmz_support) { fprintf(stderr, "radeonsi: requesting TMZ features but TMZ is not supported\n"); @@ -1366,71 +1079,18 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, return NULL; } - if (!sscreen->use_aco) { - /* Initialize just one compiler instance to check for errors. The other compiler instances - * are initialized on demand. - */ - sscreen->compiler[0] = si_create_llvm_compiler(sscreen); - if (!sscreen->compiler[0]) { - /* The callee prints the error message. */ - FREE(sscreen); - return NULL; - } - } - -#ifdef HAVE_GFX_COMPUTE - sscreen->has_gfx_compute = true; -#endif - util_idalloc_mt_init_tc(&sscreen->buffer_ids); /* Set functions first. 
*/ sscreen->b.context_create = si_pipe_create_context; sscreen->b.destroy = si_destroy_screen; - sscreen->b.set_max_shader_compiler_threads = si_set_max_shader_compiler_threads; - sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished; -#ifdef HAVE_GFX_COMPUTE - sscreen->b.finalize_nir = si_finalize_nir; -#endif - - sscreen->nir_options = CALLOC_STRUCT(nir_shader_compiler_options); si_init_screen_buffer_functions(sscreen); si_init_screen_fence_functions(sscreen); si_init_screen_state_functions(sscreen); si_init_screen_texture_functions(sscreen); - si_init_screen_query_functions(sscreen); - si_init_screen_live_shader_cache(sscreen); - - if (sscreen->info.gfx_level >= GFX11) { - sscreen->use_ngg = true; - sscreen->use_ngg_culling = sscreen->info.max_render_backends >= 2 && - !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); - } else { - sscreen->use_ngg = !(sscreen->debug_flags & DBG(NO_NGG)) && - sscreen->info.gfx_level >= GFX10 && - (sscreen->info.family != CHIP_NAVI14 || - is_pro_graphics(sscreen)); - sscreen->use_ngg_culling = sscreen->use_ngg && - sscreen->info.max_render_backends >= 2 && - !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); - } - - sscreen->has_draw_indirect_multi = - (sscreen->info.family >= CHIP_POLARIS10) || - (sscreen->info.gfx_level == GFX8 && sscreen->info.pfp_fw_version >= 121 && - sscreen->info.me_fw_version >= 87) || - (sscreen->info.gfx_level == GFX7 && sscreen->info.pfp_fw_version >= 211 && - sscreen->info.me_fw_version >= 173) || - (sscreen->info.gfx_level == GFX6 && sscreen->info.pfp_fw_version >= 79 && - sscreen->info.me_fw_version >= 142); si_init_screen_get_functions(sscreen); - si_init_screen_nir_options(sscreen); - si_init_shader_caps(sscreen); - si_init_compute_caps(sscreen); - - /* si_init_screen_caps depends on shader caps. 
*/ si_init_screen_caps(sscreen); if (sscreen->debug_flags & DBG(INFO)) @@ -1438,209 +1098,16 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, slab_create_parent(&sscreen->pool_transfers, sizeof(struct si_transfer), 64); - sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); - if (sscreen->force_aniso == -1) { - sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1)); - } - - if (sscreen->force_aniso >= 0) { - printf("radeonsi: Forcing anisotropy filter to %ix\n", - /* round down to a power of two */ - 1 << util_logbase2(sscreen->force_aniso)); - } - - (void)simple_mtx_init(&sscreen->async_compute_context_lock, mtx_plain); - (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); - (void)simple_mtx_init(&sscreen->gds_mutex, mtx_plain); - (void)simple_mtx_init(&sscreen->tess_ring_lock, mtx_plain); (void)simple_mtx_init(&sscreen->print_ib_mutex, mtx_plain); - si_init_gs_info(sscreen); - if (!si_init_shader_cache(sscreen)) { - FREE(sscreen->nir_options); + if (!si_init_gfx_screen(sscreen)) { FREE(sscreen); return NULL; } - - if (sscreen->info.gfx_level < GFX10_3) - sscreen->options.vrs2x2 = false; - - si_disk_cache_create(sscreen); - - /* Determine the number of shader compiler threads. */ - const struct util_cpu_caps_t *caps = util_get_cpu_caps(); - hw_threads = caps->nr_cpus; - - if (hw_threads >= 12) { - num_comp_hi_threads = hw_threads * 3 / 4; - num_comp_lo_threads = hw_threads / 3; - } else if (hw_threads >= 6) { - num_comp_hi_threads = hw_threads - 2; - num_comp_lo_threads = hw_threads / 2; - } else if (hw_threads >= 2) { - num_comp_hi_threads = hw_threads - 1; - num_comp_lo_threads = hw_threads / 2; - } else { - num_comp_hi_threads = 1; - num_comp_lo_threads = 1; - } - -#if !defined(NDEBUG) && defined(HAVE_GFX_COMPUTE) - nir_process_debug_variable(); - - /* Use a single compilation thread if NIR printing is enabled to avoid - * multiple shaders being printed at the same time. 
- */ - if (NIR_DEBUG(PRINT)) { - num_comp_hi_threads = 1; - num_comp_lo_threads = 1; - } -#endif - - num_comp_hi_threads = MIN2(num_comp_hi_threads, ARRAY_SIZE(sscreen->compiler)); - num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp)); - - /* Take a reference on the glsl types for the compiler threads. */ - glsl_type_singleton_init_or_ref(); - - /* Start with a single thread and a single slot. - * Each time we'll hit the "all slots are in use" case, the number of threads and - * slots will be increased. - */ - int num_slots = num_comp_hi_threads == 1 ? 64 : 1; - if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", num_slots, - num_comp_hi_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) { - si_destroy_shader_cache(sscreen); - FREE(sscreen->nir_options); - FREE(sscreen); - glsl_type_singleton_decref(); - return NULL; - } - - if (!util_queue_init(&sscreen->shader_compiler_queue_opt_variants, "sh_opt", num_slots, - num_comp_lo_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) { - si_destroy_shader_cache(sscreen); - FREE(sscreen->nir_options); - FREE(sscreen); - glsl_type_singleton_decref(); - return NULL; - } - /* Don't fail if the multimedia support is missing. 
*/ si_init_mm_screen(sscreen); - if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) - si_init_perfcounters(sscreen); - - ac_get_task_info(&sscreen->info, &sscreen->task_info); - - if (sscreen->debug_flags & DBG(NO_OUT_OF_ORDER)) - sscreen->info.has_out_of_order_rast = false; - - /* Only set this for the cases that are known to work, which are: - * - GFX9 if bpp >= 4 (in bytes) - */ - if (sscreen->info.gfx_level >= GFX10) { - memset(sscreen->allow_dcc_msaa_clear_to_reg_for_bpp, true, - sizeof(sscreen->allow_dcc_msaa_clear_to_reg_for_bpp)); - } else if (sscreen->info.gfx_level == GFX9) { - for (unsigned bpp_log2 = util_logbase2(1); bpp_log2 <= util_logbase2(16); bpp_log2++) - sscreen->allow_dcc_msaa_clear_to_reg_for_bpp[bpp_log2] = true; - } - - /* DCC stores have 50% performance of uncompressed stores and sometimes - * even less than that. It's risky to enable on dGPUs. - */ - sscreen->always_allow_dcc_stores = !(sscreen->debug_flags & DBG(NO_DCC_STORE)) && - (sscreen->debug_flags & DBG(DCC_STORE) || - sscreen->info.gfx_level >= GFX11 || /* always enabled on gfx11 */ - (sscreen->info.gfx_level >= GFX10_3 && - !sscreen->info.has_dedicated_vram)); - - sscreen->dpbb_allowed = !(sscreen->debug_flags & DBG(NO_DPBB)) && - (sscreen->info.gfx_level >= GFX10 || - /* Only enable primitive binning on gfx9 APUs by default. */ - (sscreen->info.gfx_level == GFX9 && !sscreen->info.has_dedicated_vram) || - sscreen->debug_flags & DBG(DPBB)); - - if (sscreen->dpbb_allowed) { - if ((sscreen->info.has_dedicated_vram && sscreen->info.max_render_backends > 4) || - sscreen->info.gfx_level >= GFX10) { - /* Only bin draws that have no CONTEXT and SH register changes between - * them because higher settings cause hangs. We've only been able to - * reproduce hangs on smaller chips (e.g. Navi24, Phoenix), though all - * chips might have them. What we see may be due to a driver bug. 
- */ - sscreen->pbb_context_states_per_bin = 1; - sscreen->pbb_persistent_states_per_bin = 1; - } else { - /* This is a workaround for: - * https://bugs.freedesktop.org/show_bug.cgi?id=110214 - * (an alternative is to insert manual BATCH_BREAK event when - * a context_roll is detected). */ - sscreen->pbb_context_states_per_bin = sscreen->info.has_gfx9_scissor_bug ? 1 : 3; - sscreen->pbb_persistent_states_per_bin = 8; - } - - if (!sscreen->info.has_gfx9_scissor_bug) - sscreen->pbb_context_states_per_bin = - debug_get_num_option("AMD_DEBUG_DPBB_CS", sscreen->pbb_context_states_per_bin); - sscreen->pbb_persistent_states_per_bin = - debug_get_num_option("AMD_DEBUG_DPBB_PS", sscreen->pbb_persistent_states_per_bin); - - assert(sscreen->pbb_context_states_per_bin >= 1 && - sscreen->pbb_context_states_per_bin <= 6); - assert(sscreen->pbb_persistent_states_per_bin >= 1 && - sscreen->pbb_persistent_states_per_bin <= 32); - } - - (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain); - sscreen->use_monolithic_shaders = - (sscreen->shader_debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; - - if (debug_get_bool_option("RADEON_DUMP_SHADERS", false)) - sscreen->shader_debug_flags |= DBG_ALL_SHADERS; - - /* Syntax: - * EQAA=s,z,c - * Example: - * EQAA=8,4,2 - - * That means 8 coverage samples, 4 Z/S samples, and 2 color samples. - * Constraints: - * s >= z >= c (ignoring this only wastes memory) - * s = [2..16] - * z = [2..8] - * c = [2..8] - * - * Only MSAA color and depth buffers are overridden. 
- */ - if (sscreen->info.has_eqaa_surface_allocator) { - const char *eqaa = debug_get_option("EQAA", NULL); - unsigned s, z, f; - - if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) { - sscreen->eqaa_force_coverage_samples = s; - sscreen->eqaa_force_z_samples = z; - sscreen->eqaa_force_color_samples = f; - } - } - - if (sscreen->info.gfx_level >= GFX11) { - sscreen->attribute_pos_prim_ring = - si_aligned_buffer_create(&sscreen->b, - PIPE_RESOURCE_FLAG_UNMAPPABLE | - SI_RESOURCE_FLAG_32BIT | - SI_RESOURCE_FLAG_DRIVER_INTERNAL | - SI_RESOURCE_FLAG_DISCARDABLE, - PIPE_USAGE_DEFAULT, - sscreen->info.total_attribute_pos_prim_ring_size, - 2 * 1024 * 1024); - } + si_init_renderer_string(sscreen); for (unsigned i = 0; i < ARRAY_SIZE(sscreen->aux_contexts); i++) (void)mtx_init(&sscreen->aux_contexts[i].lock, mtx_plain | mtx_recursive); @@ -1666,8 +1133,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER))) si_test_vmfault(sscreen, test_flags); - ac_print_nonshadowed_regs(sscreen->info.gfx_level, sscreen->info.family); - return &sscreen->b; } @@ -1711,8 +1176,6 @@ struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_conf } } - si_driver_ds_init(); - drmFreeVersion(version); return rw ? 
rw->screen : NULL; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 73737381657..156c0424325 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1596,6 +1596,9 @@ void si_init_screen_nir_options(struct si_screen *sscreen); void si_init_shader_caps(struct si_screen *sscreen); void si_init_compute_caps(struct si_screen *sscreen); void si_init_screen_caps(struct si_screen *sscreen); +void si_init_mesh_caps(struct si_screen *screen); +void si_init_gfx_caps(struct si_screen *sscreen); +void si_init_renderer_string(struct si_screen *sscreen); bool si_sdma_copy_image(struct si_context *ctx, struct si_texture *dst, struct si_texture *src); @@ -1623,6 +1626,7 @@ MESAPROC void si_init_compute_functions(struct si_context *sctx) TAILV; /* si_pipe.c */ struct ac_llvm_compiler *si_create_llvm_compiler(struct si_screen *sscreen); +void si_destroy_llvm_compiler(struct ac_llvm_compiler *compiler); void si_init_aux_async_compute_ctx(struct si_screen *sscreen); struct si_context *si_get_aux_context(struct si_screen *sscreen, struct si_aux_context *ctx); void si_put_aux_context_flush(struct si_aux_context *ctx);