diff --git a/src/amd/common/ac_cmdbuf.c b/src/amd/common/ac_cmdbuf.c
new file mode 100644
index 00000000000..60b816a8966
--- /dev/null
+++ b/src/amd/common/ac_cmdbuf.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2012 Advanced Micro Devices, Inc.
+ * Copyright 2024 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "ac_cmdbuf.h"
+#include "ac_pm4.h"
+
+#include "sid.h"
+
+/* Set the compute preamble registers for chips older than GFX10. */
+static void
+gfx6_init_compute_preamble_state(const struct ac_preamble_state *state,
+                                 struct ac_pm4_state *pm4)
+{
+   const struct radeon_info *info = pm4->info;
+   const uint32_t compute_cu_en = S_00B858_SH0_CU_EN(info->spi_cu_en) |
+                                  S_00B858_SH1_CU_EN(info->spi_cu_en);
+
+   ac_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(info->address32_hi >> 8));
+
+   /* Apply the CU mask to the shader engines the chip has; the others get 0. */
+   for (unsigned i = 0; i < 2; ++i)
+      ac_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 + i * 4,
+                     i < info->max_se ? compute_cu_en : 0x0);
+
+   if (info->gfx_level >= GFX7) {
+      for (unsigned i = 2; i < 4; ++i)
+         ac_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2 + (i - 2) * 4,
+                        i < info->max_se ? compute_cu_en : 0x0);
+   }
+
+   if (info->gfx_level >= GFX9)
+      ac_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0);
+
+   /* Set the pointer to border colors. MI200 doesn't support border colors, so on GFX7+ the
+    * registers are only written when the caller passes a non-zero address.
+    */
+   if (info->gfx_level >= GFX7 && state->border_color_va) {
+      ac_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, state->border_color_va >> 8);
+      ac_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI,
+                     S_030E04_ADDRESS(state->border_color_va >> 40));
+   } else if (info->gfx_level == GFX6) {
+      ac_pm4_set_reg(pm4, R_00950C_TA_CS_BC_BASE_ADDR, state->border_color_va >> 8);
+   }
+}
+
+/* Set the compute preamble registers for GFX10 up to (not including) GFX12. */
+static void
+gfx10_init_compute_preamble_state(const struct ac_preamble_state *state,
+                                  struct ac_pm4_state *pm4)
+{
+   const struct radeon_info *info = pm4->info;
+   const uint32_t compute_cu_en = S_00B858_SH0_CU_EN(info->spi_cu_en) |
+                                  S_00B858_SH1_CU_EN(info->spi_cu_en);
+
+   if (info->gfx_level < GFX11)
+      ac_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0x20);
+   ac_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, state->border_color_va >> 8);
+   ac_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(state->border_color_va >> 40));
+
+   ac_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(info->address32_hi >> 8));
+
+   /* SE0-1 (0xB858, 0xB85C) and SE2-3 (0xB864, 0xB868) are not contiguous, so SE2-3 can't be
+    * addressed as SE0 + i * 4.
+    */
+   for (unsigned i = 0; i < 2; ++i)
+      ac_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 + i * 4,
+                     i < info->max_se ? compute_cu_en : 0x0);
+
+   for (unsigned i = 2; i < 4; ++i)
+      ac_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2 + (i - 2) * 4,
+                     i < info->max_se ? compute_cu_en : 0x0);
+
+   ac_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0);
+   ac_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0);
+   ac_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0);
+   ac_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
+
+   if (info->gfx_level >= GFX11) {
+      for (unsigned i = 4; i < 8; ++i)
+         ac_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4 + (i - 4) * 4,
+                        i < info->max_se ? compute_cu_en : 0x0);
+
+      /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
+       * Only these values are valid: 0 (disabled), 64, 128, 256, 512
+       * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
+       */
+      ac_pm4_set_reg(pm4, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE,
+                     S_00B8BC_INTERLEAVE(state->gfx11.compute_dispatch_interleave));
+   }
+
+   ac_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
+}
+
+/* Set the compute preamble registers for GFX12 and newer. */
+static void
+gfx12_init_compute_preamble_state(const struct ac_preamble_state *state,
+                                  struct ac_pm4_state *pm4)
+{
+   const struct radeon_info *info = pm4->info;
+   const uint32_t compute_cu_en = S_00B858_SH0_CU_EN(info->spi_cu_en) |
+                                  S_00B858_SH1_CU_EN(info->spi_cu_en);
+   const uint32_t num_se = info->max_se;
+
+   ac_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, state->border_color_va >> 8);
+   ac_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(state->border_color_va >> 40));
+
+   ac_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
+   ac_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(info->address32_hi >> 8));
+   ac_pm4_set_reg(pm4, R_00B838_COMPUTE_DISPATCH_PKT_ADDR_LO, 0);
+   ac_pm4_set_reg(pm4, R_00B83C_COMPUTE_DISPATCH_PKT_ADDR_HI, 0);
+   ac_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
+   ac_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, num_se > 1 ? compute_cu_en : 0);
+   ac_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, num_se > 2 ? compute_cu_en : 0);
+   ac_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, num_se > 3 ? compute_cu_en : 0);
+   ac_pm4_set_reg(pm4, R_00B88C_COMPUTE_STATIC_THREAD_MGMT_SE8, num_se > 8 ? compute_cu_en : 0);
+   ac_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0);
+   ac_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0);
+   ac_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0);
+   ac_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
+   ac_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, num_se > 4 ? compute_cu_en : 0);
+   ac_pm4_set_reg(pm4, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, num_se > 5 ? compute_cu_en : 0);
+   ac_pm4_set_reg(pm4, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, num_se > 6 ? compute_cu_en : 0);
+   ac_pm4_set_reg(pm4, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, num_se > 7 ? compute_cu_en : 0);
+   ac_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
+}
+
+/*
+ * Add the compute preamble registers matching pm4->info->gfx_level to pm4.
+ *
+ * state provides the border color buffer address and the GFX11 dispatch interleave value.
+ */
+void
+ac_init_compute_preamble_state(const struct ac_preamble_state *state,
+                               struct ac_pm4_state *pm4)
+{
+   const struct radeon_info *info = pm4->info;
+
+   if (info->gfx_level >= GFX12) {
+      gfx12_init_compute_preamble_state(state, pm4);
+   } else if (info->gfx_level >= GFX10) {
+      gfx10_init_compute_preamble_state(state, pm4);
+   } else {
+      gfx6_init_compute_preamble_state(state, pm4);
+   }
+}
diff --git a/src/amd/common/ac_cmdbuf.h b/src/amd/common/ac_cmdbuf.h
new file mode 100644
index 00000000000..bb00706641e
--- /dev/null
+++ b/src/amd/common/ac_cmdbuf.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2012 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef AC_CMDBUF_H
+#define AC_CMDBUF_H
+
+#include <inttypes.h>
+
+#include "ac_pm4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Driver-provided inputs for the preamble registers. */
+struct ac_preamble_state {
+   /* GPU address of the border color buffer, or 0 if there is none. */
+   uint64_t border_color_va;
+
+   struct {
+      /* Value for COMPUTE_DISPATCH_INTERLEAVE: 0 (disabled), 64, 128, 256 or 512. */
+      uint32_t compute_dispatch_interleave;
+   } gfx11;
+};
+
+/* Add the compute preamble registers matching pm4->info->gfx_level to pm4. */
+void
+ac_init_compute_preamble_state(const struct ac_preamble_state *state,
+                               struct ac_pm4_state *pm4);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build
index cebeae803d4..72a5660b840 100644
--- a/src/amd/common/meson.build
+++ b/src/amd/common/meson.build
@@ -71,6 +71,8 @@ gfx10_format_table_c = custom_target(
 amd_common_files = files(
   'ac_binary.c',
   'ac_binary.h',
+  'ac_cmdbuf.c',
+  'ac_cmdbuf.h',
   'ac_shader_args.c',
   'ac_shader_args.h',
   'ac_shader_util.c',
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index ea165771c24..9d620480b85 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -18,6 +18,7 @@
 #include "util/u_upload_mgr.h"
 #include "util/u_blend.h"
 
+#include "ac_cmdbuf.h"
 #include "ac_descriptors.h"
 #include "ac_formats.h"
 #include "gfx10_format_table.h"
@@ -5136,13 +5137,32 @@ unsigned gfx103_get_cu_mask_ps(struct si_screen *sscreen)
    return u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa);
 }
 
+/* Add the compute preamble registers to pm4 through the common AMD helper. */
+static void si_init_compute_preamble_state(struct si_context *sctx,
+                                           struct si_pm4_state *pm4)
+{
+   uint64_t border_color_va =
+      sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
+
+   const struct ac_preamble_state preamble_state = {
+      .border_color_va = border_color_va,
+      .gfx11 = {
+         .compute_dispatch_interleave = 256,
+      },
+   };
+
+   ac_init_compute_preamble_state(&preamble_state, &pm4->base);
+
+   /* radeonsi additionally resets COMPUTE_PGM_RSRC3 on GFX10 and GFX10.3. */
+   if (sctx->gfx_level == GFX10 || sctx->gfx_level == GFX10_3)
+      ac_pm4_set_reg(&pm4->base, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
+}
+
 static void gfx6_init_gfx_preamble_state(struct si_context *sctx)
 {
    struct si_screen *sscreen = sctx->screen;
    uint64_t border_color_va =
       sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
-   uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) |
-                            S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en);
    bool has_clear_state = sscreen->info.has_clear_state;
 
    /* We need more space because the preamble is large. */
@@ -5166,27 +5186,7 @@ static void gfx6_init_gfx_preamble_state(struct si_context *sctx)
       }
    }
 
-   /* Compute registers. */
-   ac_pm4_set_reg(&pm4->base, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
-   ac_pm4_set_reg(&pm4->base, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
-   ac_pm4_set_reg(&pm4->base, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en);
-
-   if (sctx->gfx_level >= GFX7) {
-      ac_pm4_set_reg(&pm4->base, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en);
-      ac_pm4_set_reg(&pm4->base, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en);
-   }
-
-   if (sctx->gfx_level >= GFX9)
-      ac_pm4_set_reg(&pm4->base, R_0301EC_CP_COHER_START_DELAY, 0);
-
-   /* Set the pointer to border colors. MI200 doesn't support border colors. */
-   if (sctx->gfx_level >= GFX7 && sctx->border_color_buffer) {
-      ac_pm4_set_reg(&pm4->base, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
-      ac_pm4_set_reg(&pm4->base, R_030E04_TA_CS_BC_BASE_ADDR_HI,
-                     S_030E04_ADDRESS(border_color_va >> 40));
-   } else if (sctx->gfx_level == GFX6) {
-      ac_pm4_set_reg(&pm4->base, R_00950C_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
-   }
+   si_init_compute_preamble_state(sctx, pm4);
 
    if (!sctx->has_graphics)
       goto done;
@@ -5414,8 +5414,6 @@ static void gfx10_init_gfx_preamble_state(struct si_context *sctx)
    struct si_screen *sscreen = sctx->screen;
    uint64_t border_color_va =
       sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
-   uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) |
-                            S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en);
    unsigned meta_write_policy, meta_read_policy, color_write_policy, color_read_policy;
    unsigned zs_write_policy, zs_read_policy;
    unsigned cache_no_alloc = sctx->gfx_level >= GFX11 ? V_02807C_CACHE_NOA_GFX11:
@@ -5463,39 +5461,7 @@ static void gfx10_init_gfx_preamble_state(struct si_context *sctx)
       ac_pm4_cmd_add(&pm4->base, 0);
    }
 
-   /* Non-graphics uconfig registers. */
-   if (sctx->gfx_level < GFX11)
-      ac_pm4_set_reg(&pm4->base, R_0301EC_CP_COHER_START_DELAY, 0x20);
-   ac_pm4_set_reg(&pm4->base, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
-   ac_pm4_set_reg(&pm4->base, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(border_color_va >> 40));
-
-   /* Compute registers. */
-   ac_pm4_set_reg(&pm4->base, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sscreen->info.address32_hi >> 8));
-
-   for (unsigned i = 0; i < 4; ++i)
-      ac_pm4_set_reg(&pm4->base, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 + i * 4,
-                     i < sscreen->info.max_se ? compute_cu_en : 0x0);
-
-   ac_pm4_set_reg(&pm4->base, R_00B890_COMPUTE_USER_ACCUM_0, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B894_COMPUTE_USER_ACCUM_1, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B898_COMPUTE_USER_ACCUM_2, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
-
-   if (sctx->gfx_level >= GFX11) {
-      for (unsigned i = 4; i < 8; ++i)
-         ac_pm4_set_reg(&pm4->base, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4 + (i - 4) * 4,
-                        i < sscreen->info.max_se ? compute_cu_en : 0x0);
-
-      /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
-       * Only these values are valid: 0 (disabled), 64, 128, 256, 512
-       * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
-       */
-      ac_pm4_set_reg(&pm4->base, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256));
-   } else {
-      ac_pm4_set_reg(&pm4->base, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
-   }
-
-   ac_pm4_set_reg(&pm4->base, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
+   si_init_compute_preamble_state(sctx, pm4);
 
    if (!sctx->has_graphics)
       goto done;
@@ -5692,9 +5658,6 @@ static void gfx12_init_gfx_preamble_state(struct si_context *sctx)
 {
    struct si_screen *sscreen = sctx->screen;
    uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
-   uint32_t compute_cu_en = S_00B88C_SA0_CU_EN(sscreen->info.spi_cu_en) |
-                            S_00B88C_SA1_CU_EN(sscreen->info.spi_cu_en);
-   unsigned num_se = sscreen->info.max_se;
    unsigned color_write_policy, color_read_policy;
    enum gfx12_store_temporal_hint color_write_temporal_hint, zs_write_temporal_hint;
    enum gfx12_load_temporal_hint color_read_temporal_hint, zs_read_temporal_hint;
@@ -5730,29 +5693,7 @@ static void gfx12_init_gfx_preamble_state(struct si_context *sctx)
       ac_pm4_cmd_add(&pm4->base, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
    }
 
-   /* Non-graphics uconfig registers. */
-   ac_pm4_set_reg(&pm4->base, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
-   ac_pm4_set_reg(&pm4->base, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(border_color_va >> 40));
-
-   /* Compute registers. */
-   ac_pm4_set_reg(&pm4->base, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
-   ac_pm4_set_reg(&pm4->base, R_00B838_COMPUTE_DISPATCH_PKT_ADDR_LO, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B83C_COMPUTE_DISPATCH_PKT_ADDR_HI, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
-   ac_pm4_set_reg(&pm4->base, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, num_se > 1 ? compute_cu_en : 0);
-   ac_pm4_set_reg(&pm4->base, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, num_se > 2 ? compute_cu_en : 0);
-   ac_pm4_set_reg(&pm4->base, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, num_se > 3 ? compute_cu_en : 0);
-   ac_pm4_set_reg(&pm4->base, R_00B88C_COMPUTE_STATIC_THREAD_MGMT_SE8, num_se > 8 ? compute_cu_en : 0);
-   ac_pm4_set_reg(&pm4->base, R_00B890_COMPUTE_USER_ACCUM_0, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B894_COMPUTE_USER_ACCUM_1, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B898_COMPUTE_USER_ACCUM_2, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
-   ac_pm4_set_reg(&pm4->base, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, num_se > 4 ? compute_cu_en : 0);
-   ac_pm4_set_reg(&pm4->base, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, num_se > 5 ? compute_cu_en : 0);
-   ac_pm4_set_reg(&pm4->base, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, num_se > 6 ? compute_cu_en : 0);
-   ac_pm4_set_reg(&pm4->base, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, num_se > 7 ? compute_cu_en : 0);
-   ac_pm4_set_reg(&pm4->base, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
+   si_init_compute_preamble_state(sctx, pm4);
 
    if (!sctx->has_graphics)
       goto done;