radeonsi: add gfx12
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29007>
commit f703dfd1bb
parent c8ad0f0715
35 changed files with 2738 additions and 727 deletions
@@ -345,6 +345,10 @@ static void gfx11_sh_query_get_result_resource(struct si_context *sctx, struct s
    grid.grid[1] = 1;
    grid.grid[2] = 1;

+   /* TODO: Range-invalidate GL2 */
+   if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope)
+      sctx->flags |= SI_CONTEXT_INV_L2;
+
    struct gfx11_sh_query_buffer *qbuf = query->first;
    for (;;) {
       unsigned begin = qbuf == query->first ? query->first_begin : 0;
@@ -144,7 +144,7 @@ if with_llvm
endif

radeonsi_gfx_libs = []
-foreach ver : ['6', '7', '8', '9', '10', '103', '11', '115']
+foreach ver : ['6', '7', '8', '9', '10', '103', '11', '115', '12']
   radeonsi_gfx_libs += static_library(
     'radeonsi_gfx@0@'.format(ver),
     ['si_state_draw.cpp'],
@@ -193,6 +193,8 @@ static void si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
    unsigned layer, max_layer, checked_last_layer;
    unsigned fully_decompressed_mask = 0;

+   assert(sctx->gfx_level < GFX12);
+
    if (!level_mask)
       return;

@@ -287,6 +289,8 @@ static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex,
    unsigned levels_z = 0;
    unsigned levels_s = 0;

+   assert(sctx->gfx_level < GFX12);
+
    if (required_planes & PIPE_MASK_Z) {
       levels_z = level_mask & tex->dirty_level_mask;

@@ -401,6 +405,8 @@ static bool si_decompress_sampler_depth_textures(struct si_context *sctx,
    unsigned mask = textures->needs_depth_decompress_mask;
    bool need_flush = false;

+   assert(sctx->gfx_level < GFX12);
+
    while (mask) {
       struct pipe_sampler_view *view;
       struct si_sampler_view *sview;
@@ -437,6 +443,9 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture
    unsigned layer, checked_last_layer, max_layer;
    unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1);

+   /* No decompression is ever needed on Gfx12. */
+   assert(sctx->gfx_level < GFX12);
+
    if (!need_dcc_decompress)
       level_mask &= tex->dirty_level_mask;
    if (!level_mask)
@@ -624,6 +633,8 @@ static void si_check_render_feedback_texture(struct si_context *sctx, struct si_
 {
    bool render_feedback = false;

+   assert(sctx->gfx_level < GFX12);
+
    if (!vi_dcc_enabled(tex, first_level))
       return;

@@ -652,6 +663,8 @@ static void si_check_render_feedback_textures(struct si_context *sctx, struct si
 {
    uint32_t mask = textures->enabled_mask & in_use_mask;

+   assert(sctx->gfx_level < GFX12);
+
    while (mask) {
       const struct pipe_sampler_view *view;
       struct si_texture *tex;
@@ -674,6 +687,8 @@ static void si_check_render_feedback_images(struct si_context *sctx, struct si_i
 {
    uint32_t mask = images->enabled_mask & in_use_mask;

+   assert(sctx->gfx_level < GFX12);
+
    while (mask) {
       const struct pipe_image_view *view;
       struct si_texture *tex;
@@ -693,6 +708,8 @@ static void si_check_render_feedback_images(struct si_context *sctx, struct si_i

 static void si_check_render_feedback_resident_textures(struct si_context *sctx)
 {
+   assert(sctx->gfx_level < GFX12);
+
    util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
       struct pipe_sampler_view *view;
       struct si_texture *tex;
@@ -710,6 +727,8 @@ static void si_check_render_feedback_resident_textures(struct si_context *sctx)

 static void si_check_render_feedback_resident_images(struct si_context *sctx)
 {
+   assert(sctx->gfx_level < GFX12);
+
    util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
       struct pipe_image_view *view;
       struct si_texture *tex;
@@ -727,6 +746,8 @@ static void si_check_render_feedback_resident_images(struct si_context *sctx)

 static void si_check_render_feedback(struct si_context *sctx)
 {
+   assert(sctx->gfx_level < GFX12);
+
    if (!sctx->need_check_render_feedback)
       return;

@@ -896,6 +917,9 @@ void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *t
    struct si_context *sctx = (struct si_context *)ctx;
    struct si_texture *stex = (struct si_texture *)tex;

+   if (sctx->gfx_level >= GFX12)
+      return;
+
    if (stex->db_compatible) {
       planes &= PIPE_MASK_Z | PIPE_MASK_S;

@@ -1324,7 +1348,7 @@ static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *re
    struct si_context *sctx = (struct si_context *)ctx;
    struct si_texture *tex = (struct si_texture *)res;

-   if (res->target == PIPE_BUFFER)
+   if (sctx->gfx_level >= GFX12 || res->target == PIPE_BUFFER)
       return;

    if (!tex->is_depth && (tex->cmask_buffer || vi_dcc_enabled(tex, 0))) {
@@ -1340,6 +1364,8 @@ static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *re

 void si_flush_implicit_resources(struct si_context *sctx)
 {
+   assert(sctx->gfx_level < GFX12);
+
    hash_table_foreach(sctx->dirty_implicit_resources, entry) {
       si_flush_resource(&sctx->b, entry->data);
       pipe_resource_reference((struct pipe_resource **)&entry->data, NULL);
@@ -1349,6 +1375,7 @@ void si_flush_implicit_resources(struct si_context *sctx)

 void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex)
 {
+   assert(sctx->gfx_level < GFX12);
    assert(!tex->is_depth);

    /* If graphics is disabled, we can't decompress DCC, but it shouldn't
@@ -397,8 +397,106 @@
    } \
 } while (0)

+/* GFX12 generic packet building helpers for PAIRS packets. Don't use these directly. */
+#define gfx12_begin_regs(header) unsigned header = __cs_num++
+
+#define gfx12_set_reg(reg, value, base_offset) do { \
+   radeon_emit(((reg) - (base_offset)) >> 2); \
+   radeon_emit(value); \
+} while (0)
+
+#define gfx12_opt_set_reg(reg, reg_enum, value, base_offset) do { \
+   unsigned __value = value; \
+   if (!BITSET_TEST(sctx->tracked_regs.reg_saved_mask, (reg_enum)) || \
+       sctx->tracked_regs.reg_value[reg_enum] != __value) { \
+      gfx12_set_reg(reg, __value, base_offset); \
+      BITSET_SET(sctx->tracked_regs.reg_saved_mask, (reg_enum)); \
+      sctx->tracked_regs.reg_value[reg_enum] = __value; \
+   } \
+} while (0)
+
+#define gfx12_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, base_offset) do { \
+   unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4); \
+   if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                      (reg_enum), (reg_enum) + 3, 0xf) || \
+       sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \
+       sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4) { \
+      gfx12_set_reg((reg), __v1, (base_offset)); \
+      gfx12_set_reg((reg) + 4, __v2, (base_offset)); \
+      gfx12_set_reg((reg) + 8, __v3, (base_offset)); \
+      gfx12_set_reg((reg) + 12, __v4, (base_offset)); \
+      BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \
+                                   (reg_enum), (reg_enum) + 3); \
+      sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \
+      sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \
+   } \
+} while (0)
+
+#define gfx12_end_regs(header, packet) do { \
+   if ((header) + 1 == __cs_num) { \
+      __cs_num--; /* no registers have been set, back off */ \
+   } else { \
+      unsigned __dw_count = __cs_num - (header) - 2; \
+      __cs_buf[(header)] = PKT3((packet), __dw_count, 0) | PKT3_RESET_FILTER_CAM_S(1); \
+   } \
+} while (0)
+
+/* GFX12 generic packet building helpers for buffered registers. Don't use these directly. */
+#define gfx12_push_reg(reg, value, base_offset, type) do { \
+   unsigned __i = sctx->num_buffered_##type##_regs++; \
+   assert(__i < ARRAY_SIZE(sctx->gfx12.buffered_##type##_regs)); \
+   sctx->gfx12.buffered_##type##_regs[__i].reg_offset = ((reg) - (base_offset)) >> 2; \
+   sctx->gfx12.buffered_##type##_regs[__i].reg_value = value; \
+} while (0)
+
+#define gfx12_opt_push_reg(reg, reg_enum, value, type) do { \
+   unsigned __value = value; \
+   unsigned __reg_enum = reg_enum; \
+   if (!BITSET_TEST(sctx->tracked_regs.reg_saved_mask, (reg_enum)) || \
+       sctx->tracked_regs.reg_value[__reg_enum] != __value) { \
+      gfx12_push_##type##_reg(reg, __value); \
+      BITSET_SET(sctx->tracked_regs.reg_saved_mask, (reg_enum)); \
+      sctx->tracked_regs.reg_value[__reg_enum] = __value; \
+   } \
+} while (0)
+
+/* GFX12 packet building helpers for PAIRS packets. */
+#define gfx12_begin_context_regs() \
+   gfx12_begin_regs(__cs_context_reg_header)
+
+#define gfx12_set_context_reg(reg, value) \
+   gfx12_set_reg(reg, value, SI_CONTEXT_REG_OFFSET)
+
+#define gfx12_opt_set_context_reg(reg, reg_enum, value) \
+   gfx12_opt_set_reg(reg, reg_enum, value, SI_CONTEXT_REG_OFFSET)
+
+#define gfx12_opt_set_context_reg4(reg, reg_enum, v1, v2, v3, v4) \
+   gfx12_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT_REG_OFFSET)
+
+#define gfx12_end_context_regs() \
+   gfx12_end_regs(__cs_context_reg_header, PKT3_SET_CONTEXT_REG_PAIRS)
+
+/* GFX12 packet building helpers for buffered registers. */
+#define gfx12_push_gfx_sh_reg(reg, value) \
+   gfx12_push_reg(reg, value, SI_SH_REG_OFFSET, gfx_sh)
+
+#define gfx12_push_compute_sh_reg(reg, value) \
+   gfx12_push_reg(reg, value, SI_SH_REG_OFFSET, compute_sh)
+
+#define gfx12_opt_push_gfx_sh_reg(reg, reg_enum, value) \
+   gfx12_opt_push_reg(reg, reg_enum, value, gfx_sh)
+
+#define gfx12_opt_push_compute_sh_reg(reg, reg_enum, value) \
+   gfx12_opt_push_reg(reg, reg_enum, value, compute_sh)
+
 #define radeon_set_or_push_gfx_sh_reg(reg, value) do { \
-   if (GFX_VERSION >= GFX11 && HAS_SH_PAIRS_PACKED) { \
+   if (GFX_VERSION >= GFX12) { \
+      gfx12_push_gfx_sh_reg(reg, value); \
+   } else if (GFX_VERSION >= GFX11 && HAS_SH_PAIRS_PACKED) { \
       gfx11_push_gfx_sh_reg(reg, value); \
    } else { \
       radeon_set_sh_reg_seq(reg, 1); \
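For orientation, here is a minimal usage sketch of the PAIRS helpers above, assuming the __cs_num/__cs_buf emit context that radeon_begin() establishes in this header; the register offset and tracked-register enum below are hypothetical placeholders, not names from this commit:

/* Hypothetical emit function; R_028XXX_EXAMPLE and SI_TRACKED_EXAMPLE are
 * placeholder names used only for illustration. */
static void gfx12_emit_example_regs(struct si_context *sctx)
{
   radeon_begin(&sctx->gfx_cs);
   gfx12_begin_regs(__cs_context_reg_header);
   /* Each opt_set emits an (offset, value) pair only when the value shadowed
    * in sctx->tracked_regs differs. */
   gfx12_opt_set_reg(R_028XXX_EXAMPLE, SI_TRACKED_EXAMPLE, 0, SI_CONTEXT_REG_OFFSET);
   /* Patches in the PKT3_SET_CONTEXT_REG_PAIRS header, or backs the packet
    * off entirely if no pair was emitted. */
   gfx12_end_regs(__cs_context_reg_header, PKT3_SET_CONTEXT_REG_PAIRS);
   radeon_end();
}

In practice the gfx12_begin_context_regs()/gfx12_opt_set_context_reg()/gfx12_end_context_regs() wrappers hide the header variable and base offset, as defined above.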
@@ -707,6 +707,8 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
    unsigned clear_types = 0;
    unsigned num_pixels = fb->width * fb->height;

+   assert(sctx->gfx_level < GFX12);
+
    /* This function is broken in BE, so just disable this path for now */
 #if UTIL_ARCH_BIG_ENDIAN
    return;
@@ -1149,7 +1151,7 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
    si_execute_clears(sctx, info, num_clears, clear_types, sctx->render_cond_enabled);
 }

-static void si_clear(struct pipe_context *ctx, unsigned buffers,
+static void gfx6_clear(struct pipe_context *ctx, unsigned buffers,
                      const struct pipe_scissor_state *scissor_state,
                      const union pipe_color_union *color, double depth, unsigned stencil)
 {
@@ -1271,6 +1273,44 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
    }
 }

+static void gfx12_clear(struct pipe_context *ctx, unsigned buffers,
+                        const struct pipe_scissor_state *scissor_state,
+                        const union pipe_color_union *color, double depth, unsigned stencil)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+   struct pipe_surface *zsbuf = fb->zsbuf;
+   struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
+
+   /* Unset clear flags for non-existent buffers. */
+   for (unsigned i = 0; i < 8; i++) {
+      if (i >= fb->nr_cbufs || !fb->cbufs[i])
+         buffers &= ~(PIPE_CLEAR_COLOR0 << i);
+   }
+   if (!zsbuf)
+      buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
+   else if (!util_format_has_stencil(util_format_description(zsbuf->format)))
+      buffers &= ~PIPE_CLEAR_STENCIL;
+
+   if (unlikely(sctx->sqtt_enabled)) {
+      if (buffers & PIPE_CLEAR_COLOR)
+         sctx->sqtt_next_event = EventCmdClearColorImage;
+      else if (buffers & PIPE_CLEAR_DEPTHSTENCIL)
+         sctx->sqtt_next_event = EventCmdClearDepthStencilImage;
+   }
+
+   si_blitter_begin(sctx, SI_CLEAR);
+   util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb),
+                      buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1);
+   si_blitter_end(sctx);
+
+   /* This is only used by the driver, not the hw. */
+   if (buffers & PIPE_CLEAR_DEPTH) {
+      zstex->depth_cleared_level_mask |= BITFIELD_BIT(zsbuf->u.tex.level);
+      zstex->depth_clear_value[zsbuf->u.tex.level] = depth;
+   }
+}
+
 static bool si_try_normal_clear(struct si_context *sctx, struct pipe_surface *dst,
                                 unsigned dstx, unsigned dsty, unsigned width, unsigned height,
                                 bool render_condition_enabled, unsigned buffers,
@@ -1459,7 +1499,11 @@ void si_init_clear_functions(struct si_context *sctx)
    sctx->b.clear_texture = u_default_clear_texture;

    if (sctx->has_graphics) {
-      sctx->b.clear = si_clear;
+      if (sctx->gfx_level >= GFX12)
+         sctx->b.clear = gfx12_clear;
+      else
+         sctx->b.clear = gfx6_clear;
+
       sctx->b.clear_depth_stencil = si_clear_depth_stencil;
    }
 }
@@ -178,7 +178,7 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind
    shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) /
                                          ((shader->wave_size == 32 ||
                                            sscreen->info.wave64_vgpr_alloc_granularity == 8) ? 8 : 4)) |
-                          S_00B848_DX10_CLAMP(1) |
+                          S_00B848_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
                           S_00B848_MEM_ORDERED(si_shader_mem_ordered(shader)) |
                           S_00B848_FLOAT_MODE(shader->config.float_mode);

@@ -499,15 +499,36 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->bo,
                              RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);

-   if (sctx->screen->info.has_set_sh_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      unsigned rsrc3 = S_00B8A0_INST_PREF_SIZE_GFX12(si_get_shader_prefetch_size(shader));
+
+      gfx12_push_compute_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
+      gfx12_opt_push_compute_sh_reg(R_00B848_COMPUTE_PGM_RSRC1,
+                                    SI_TRACKED_COMPUTE_PGM_RSRC1, config->rsrc1);
+      gfx12_opt_push_compute_sh_reg(R_00B84C_COMPUTE_PGM_RSRC2,
+                                    SI_TRACKED_COMPUTE_PGM_RSRC2, rsrc2);
+      gfx12_opt_push_compute_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3,
+                                    SI_TRACKED_COMPUTE_PGM_RSRC3, rsrc3);
+      gfx12_opt_push_compute_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
+                                    SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size);
+      if (shader->scratch_bo) {
+         gfx12_opt_push_compute_sh_reg(R_00B840_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
+                                       SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
+                                       sctx->compute_scratch_buffer->gpu_address >> 8);
+         gfx12_opt_push_compute_sh_reg(R_00B844_COMPUTE_DISPATCH_SCRATCH_BASE_HI,
+                                       SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_HI,
+                                       sctx->compute_scratch_buffer->gpu_address >> 40);
+      }
+   } else if (sctx->screen->info.has_set_sh_pairs_packed) {
+      unsigned rsrc3 = S_00B8A0_INST_PREF_SIZE_GFX11(si_get_shader_prefetch_size(shader));
+
       gfx11_push_compute_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
       gfx11_opt_push_compute_sh_reg(R_00B848_COMPUTE_PGM_RSRC1,
                                     SI_TRACKED_COMPUTE_PGM_RSRC1, config->rsrc1);
       gfx11_opt_push_compute_sh_reg(R_00B84C_COMPUTE_PGM_RSRC2,
                                     SI_TRACKED_COMPUTE_PGM_RSRC2, rsrc2);
       gfx11_opt_push_compute_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3,
-                                    SI_TRACKED_COMPUTE_PGM_RSRC3,
-                                    S_00B8A0_INST_PREF_SIZE_GFX11(si_get_shader_prefetch_size(shader)));
+                                    SI_TRACKED_COMPUTE_PGM_RSRC3, rsrc3);
       gfx11_opt_push_compute_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
                                     SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size);
       if (shader->scratch_bo) {
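Note that both the gfx11 and gfx12 push helpers above only buffer the (offset, value) pairs; they are written to the command stream by si_emit_buffered_compute_sh_regs(), which si_emit_dispatch_packets() calls before emitting the dispatch packet (see the @@ -862,7 hunk below).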
@@ -735,7 +756,24 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
       }
    }

-   if (sctx->screen->info.has_set_sh_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      if (sel->info.uses_grid_size && !info->indirect) {
+         gfx12_push_compute_sh_reg(grid_size_reg, info->grid[0]);
+         gfx12_push_compute_sh_reg(grid_size_reg + 4, info->grid[1]);
+         gfx12_push_compute_sh_reg(grid_size_reg + 8, info->grid[2]);
+      }
+
+      if (sel->info.uses_variable_block_size) {
+         uint32_t value = info->block[0] | (info->block[1] << 10) | (info->block[2] << 20);
+         gfx12_push_compute_sh_reg(block_size_reg, value);
+      }
+
+      if (sel->info.base.cs.user_data_components_amd) {
+         unsigned num = sel->info.base.cs.user_data_components_amd;
+         for (unsigned i = 0; i < num; i++)
+            gfx12_push_compute_sh_reg(cs_user_data_reg + i * 4, sctx->cs_user_data[i]);
+      }
+   } else if (sctx->screen->info.has_set_sh_pairs_packed) {
       if (sel->info.uses_grid_size && !info->indirect) {
          gfx11_push_compute_sh_reg(grid_size_reg, info->grid[0]);
          gfx11_push_compute_sh_reg(grid_size_reg + 4, info->grid[1]);
@@ -776,6 +814,76 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
    }
 }

+static bool si_get_2d_interleave_size(const struct pipe_grid_info *info,
+                                      unsigned *log_x, unsigned *log_y)
+{
+   /* The following code produces this behavior:
+    *
+    *    WG size    |  WG block/SE  | Thread block/SE
+    *    ( 1, 32) =  32 | (16,  1) = 16 | ( 16, 32) =  512
+    *    ( 2, 16) =  32 | ( 8,  2) = 16 | ( 16, 32) =  512
+    *    ( 2, 32) =  64 | (16,  1) = 16 | ( 32, 32) = 1024
+    *    ( 4,  8) =  32 | ( 4,  4) = 16 | ( 16, 32) =  512
+    *    ( 4, 16) =  64 | ( 8,  2) = 16 | ( 32, 32) = 1024
+    *    ( 4, 32) = 128 | ( 8,  1) =  8 | ( 32, 32) = 1024
+    *    ( 8,  4) =  32 | ( 2,  8) = 16 | ( 16, 32) =  512
+    *    ( 8,  8) =  64 | ( 4,  4) = 16 | ( 32, 32) = 1024
+    *    ( 8, 16) = 128 | ( 4,  2) =  8 | ( 32, 32) = 1024
+    *    ( 8, 32) = 256 | ( 4,  1) =  4 | ( 32, 32) = 1024
+    *    (16,  2) =  32 | ( 1, 16) = 16 | ( 16, 32) =  512
+    *    (16,  4) =  64 | ( 2,  8) = 16 | ( 32, 32) = 1024
+    *    (16,  8) = 128 | ( 2,  4) =  8 | ( 32, 32) = 1024
+    *    (16, 16) = 256 | ( 2,  2) =  4 | ( 32, 32) = 1024
+    *    (16, 32) = 512 | ( 2,  1) =  2 | ( 32, 32) = 1024
+    *    (32,  1) =  32 | ( 1, 16) = 16 | ( 32, 16) =  512
+    *    (32,  2) =  64 | ( 1, 16) = 16 | ( 32, 32) = 1024
+    *    (32,  4) = 128 | ( 1,  8) =  8 | ( 32, 32) = 1024
+    *    (32,  8) = 256 | ( 1,  4) =  4 | ( 32, 32) = 1024
+    *    (32, 16) = 512 | ( 1,  2) =  2 | ( 32, 32) = 1024
+    *
+    * For 3D workgroups, the total 2D thread count is divided by Z.
+    * Example with Z=8, showing only a 2D slice of the grid:
+    *
+    *    WG size    |  WG block/SE  | Thread block/SE
+    *    ( 1, 32) =  32 | ( 4,  1) =  4 | (  4, 32) =  128
+    *    ( 2, 16) =  32 | ( 4,  1) =  4 | (  8, 16) =  128
+    *    ( 2, 32) =  64 | ( 2,  1) =  2 | (  4, 32) =  128
+    *    ( 4,  8) =  32 | ( 2,  2) =  4 | (  8, 16) =  128
+    *    ( 4, 16) =  64 | ( 2,  1) =  2 | (  8, 16) =  128
+    *    ( 8,  4) =  32 | ( 1,  4) =  4 | (  8, 16) =  128
+    *    ( 8,  8) =  64 | ( 1,  2) =  2 | (  8, 16) =  128
+    *    (16,  2) =  32 | ( 1,  4) =  4 | ( 16,  8) =  128
+    *    (16,  4) =  64 | ( 1,  2) =  2 | ( 16,  8) =  128
+    *    (32,  1) =  32 | ( 1,  4) =  4 | ( 32,  4) =  128
+    *    (32,  2) =  64 | ( 1,  2) =  2 | ( 32,  4) =  128
+    *
+    * It tries to find a WG block size that corresponds to (N, N) or (N, 2*N) threads,
+    * but it's limited by the maximum WGs/SE, which is 16, and the number of threads/SE,
+    * which we set to 1024.
+    */
+   unsigned max_threads_per_se = 1024;
+   unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2];
+   unsigned workgroups_per_se = MIN2(max_threads_per_se / threads_per_threadgroup, 16);
+   unsigned log_workgroups_per_se = util_logbase2(workgroups_per_se);
+
+   if (!log_workgroups_per_se)
+      return false;
+
+   assert(log_workgroups_per_se <= 4);
+
+   *log_x = MIN2(log_workgroups_per_se, 4);
+   *log_y = log_workgroups_per_se - *log_x;
+
+   while (*log_x > 0 && *log_y < 4 &&
+          info->block[0] * (1 << *log_x) > info->block[1] * (1 << *log_y)) {
+      (*log_x)--;
+      (*log_y)++;
+   }
+
+   assert(*log_x + *log_y <= 4);
+   return true;
+}
+
 static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info)
 {
    struct si_screen *sscreen = sctx->screen;
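As a worked example of si_get_2d_interleave_size(): for an (8, 8, 1) workgroup, threads_per_threadgroup = 64, so workgroups_per_se = MIN2(1024 / 64, 16) = 16 and log_workgroups_per_se = 4. The initial split (log_x, log_y) = (4, 0) is then rebalanced by the while loop as long as block[0] << log_x exceeds block[1] << log_y: (4, 0) -> (3, 1) -> (2, 2), where 8*4 = 8*4 stops the loop. The result is a (4, 4) block of workgroups per SE, i.e. a (32, 32) = 1024 thread block, matching the "( 8, 8) = 64" row of the first table. The #if 0 test harness added to si_init_compute_functions() later in this commit prints exactly these tables.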
@@ -807,7 +915,11 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
                                                      sctx->cs_max_waves_per_sh,
                                                      threadgroups_per_cu);

-   if (sctx->screen->info.has_set_sh_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      gfx12_opt_push_compute_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS,
+                                    SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
+                                    compute_resource_limits);
+   } else if (sctx->screen->info.has_set_sh_pairs_packed) {
       gfx11_opt_push_compute_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS,
                                     SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
                                     compute_resource_limits);
@@ -830,8 +942,13 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
    bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
    uint32_t num_threads[3];

+   if (sctx->gfx_level >= GFX12) {
+      num_threads[0] = S_00B81C_NUM_THREAD_FULL_GFX12(info->block[0]);
+      num_threads[1] = S_00B820_NUM_THREAD_FULL_GFX12(info->block[1]);
+   } else {
       num_threads[0] = S_00B81C_NUM_THREAD_FULL_GFX6(info->block[0]);
       num_threads[1] = S_00B820_NUM_THREAD_FULL_GFX6(info->block[1]);
+   }
    num_threads[2] = S_00B824_NUM_THREAD_FULL(info->block[2]);

    if (partial_block_en) {
@@ -849,7 +966,82 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
       dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
    }

-   if (sctx->screen->info.has_set_sh_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      /* Set PING_PONG_EN for every other dispatch.
+       * Only allowed on a gfx queue, and PARTIAL_TG_EN and USE_THREAD_DIMENSIONS must be 0.
+       */
+      if (sctx->has_graphics && !partial_block_en) {
+         dispatch_initiator |= S_00B800_PING_PONG_EN(sctx->compute_ping_pong_launch);
+         sctx->compute_ping_pong_launch ^= 1;
+      }
+
+      /* Thread tiling within a workgroup. */
+      switch (sctx->cs_shader_state.program->shader.selector->info.base.cs.derivative_group) {
+      case DERIVATIVE_GROUP_LINEAR:
+         break;
+      case DERIVATIVE_GROUP_QUADS:
+         num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(1); /* 2x2 */
+         num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(1);
+         break;
+      case DERIVATIVE_GROUP_NONE:
+         /* These are the only legal combinations. */
+         if (info->block[0] % 8 == 0 && info->block[1] % 8 == 0) {
+            num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(3); /* 8x8 */
+            num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(3);
+         } else if (info->block[0] % 4 == 0 && info->block[1] % 8 == 0) {
+            num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(2); /* 4x8 */
+            num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(3);
+         } else if (info->block[0] % 4 == 0 && info->block[1] % 4 == 0) {
+            num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(2); /* 4x4 */
+            num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(2);
+         } else if (info->block[0] % 2 == 0 && info->block[1] % 2 == 0) {
+            num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(1); /* 2x2 */
+            num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(1);
+         }
+         break;
+      }
+
+      /* How many threads should go to 1 SE before moving onto the next if INTERLEAVE_2D_EN == 0.
+       * Only these values are valid: 0 (disabled), 64, 128, 256, 512
+       * 64 = RT, 256 = non-RT (run benchmarks to be sure)
+       */
+      unsigned dispatch_interleave = S_00B8BC_INTERLEAVE_1D(256);
+      unsigned log_x, log_y;
+
+      /* Launch a 2D subgrid on each SE instead of a 1D subgrid. If enabled, INTERLEAVE_1D is
+       * ignored and each SE gets 1 subgrid up to a certain number of threads.
+       *
+       * Constraints:
+       * - Only supported by the gfx queue.
+       * - Max 16 workgroups per SE can be launched, max 4 in each dimension.
+       * - PARTIAL_TG_EN, USE_THREAD_DIMENSIONS, and ORDERED_APPEND_ENBL must be 0.
+       */
+      if (sctx->has_graphics && !partial_block_en &&
+          (info->indirect || info->grid[1] >= 4) && MIN2(info->block[0], info->block[1]) >= 4 &&
+          si_get_2d_interleave_size(info, &log_x, &log_y)) {
+         dispatch_interleave = S_00B8BC_INTERLEAVE_1D(1) | /* 1D is disabled */
+                               S_00B8BC_INTERLEAVE_2D_X_SIZE(log_x) |
+                               S_00B8BC_INTERLEAVE_2D_Y_SIZE(log_y);
+         dispatch_initiator |= S_00B800_INTERLEAVE_2D_EN(1);
+      }
+
+      if (sctx->has_graphics) {
+         radeon_opt_set_sh_reg_idx(sctx, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE,
+                                   SI_TRACKED_COMPUTE_DISPATCH_INTERLEAVE, 2, dispatch_interleave);
+      } else {
+         gfx12_opt_push_compute_sh_reg(R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE,
+                                       SI_TRACKED_COMPUTE_DISPATCH_INTERLEAVE, dispatch_interleave);
+      }
+   }
+
+   if (sctx->gfx_level >= GFX12) {
+      gfx12_opt_push_compute_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X,
+                                    SI_TRACKED_COMPUTE_NUM_THREAD_X, num_threads[0]);
+      gfx12_opt_push_compute_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y,
+                                    SI_TRACKED_COMPUTE_NUM_THREAD_Y, num_threads[1]);
+      gfx12_opt_push_compute_sh_reg(R_00B824_COMPUTE_NUM_THREAD_Z,
+                                    SI_TRACKED_COMPUTE_NUM_THREAD_Z, num_threads[2]);
+   } else if (sctx->screen->info.has_set_sh_pairs_packed) {
       gfx11_opt_push_compute_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X,
                                     SI_TRACKED_COMPUTE_NUM_THREAD_X, num_threads[0]);
       gfx11_opt_push_compute_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y,
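A note on the thread-tiling switch above: with DERIVATIVE_GROUP_NONE the largest legal tile whose dimensions divide the block size is chosen, so for example a 16x16 block gets INTERLEAVE_BITS_X/Y = 3 (an 8x8 tile), while DERIVATIVE_GROUP_QUADS always uses a 2x2 tile, presumably so each derivative quad maps to adjacent threads; DERIVATIVE_GROUP_LINEAR keeps the default linear thread order. (The INTERLEAVE_1D value of 1 in the 2D path is what the "1D is disabled" comment refers to; the extraction showed it with a doubled "||", corrected here to a bitwise "|" consistent with the surrounding field composition.)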
@@ -862,7 +1054,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
                                     num_threads[0], num_threads[1], num_threads[2]);
    }

-   if (sctx->gfx_level >= GFX11) {
+   if (sctx->gfx_level >= GFX12 || sctx->screen->info.has_set_sh_pairs_packed) {
       radeon_end();
       si_emit_buffered_compute_sh_regs(sctx);
       radeon_begin_again(cs);
@@ -879,11 +1071,21 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
       radeon_emit(base_va);
       radeon_emit(base_va >> 32);

-      radeon_emit(PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+      unsigned pkt = PKT3_DISPATCH_INDIRECT;
+
+      if (sctx->gfx_level >= GFX12 && G_00B800_INTERLEAVE_2D_EN(dispatch_initiator))
+         pkt = PKT3_DISPATCH_INDIRECT_INTERLEAVED;
+
+      radeon_emit(PKT3(pkt, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
       radeon_emit(info->indirect_offset);
       radeon_emit(dispatch_initiator);
    } else {
-      radeon_emit(PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+      unsigned pkt = PKT3_DISPATCH_DIRECT;
+
+      if (sctx->gfx_level >= GFX12 && G_00B800_INTERLEAVE_2D_EN(dispatch_initiator))
+         pkt = PKT3_DISPATCH_DIRECT_INTERLEAVED;
+
+      radeon_emit(PKT3(pkt, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
       radeon_emit(info->grid[0]);
       radeon_emit(info->grid[1]);
       radeon_emit(info->grid[2]);
@@ -968,17 +1170,23 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
          si_make_CB_shader_coherent(sctx, 0,
                                     sctx->framebuffer.CB_has_shader_readable_metadata,
                                     sctx->framebuffer.all_DCC_pipe_aligned);
+
+         if (sctx->gfx_level == GFX12 &&
+             (sctx->force_shader_coherency.with_db ||
+              si_check_needs_implicit_sync(sctx, RADEON_USAGE_DB_NEEDS_IMPLICIT_SYNC)))
+            si_make_DB_shader_coherent(sctx, 0, false, false);
       }

       if (sctx->gfx_level < GFX11)
          gfx6_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
-      else
+      else if (sctx->gfx_level < GFX12)
          gfx11_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
    }

    if (info->indirect) {
-      /* Indirect buffers use TC L2 on GFX9, but not older hw. */
-      if (sctx->gfx_level <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
+      /* Indirect buffers use TC L2 on GFX9-GFX11, but not other hw. */
+      if ((sctx->gfx_level <= GFX8 || sctx->gfx_level == GFX12) &&
+          si_resource(info->indirect)->TC_L2_dirty) {
         sctx->flags |= SI_CONTEXT_WB_L2;
         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
         si_resource(info->indirect)->TC_L2_dirty = false;
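The updated "GFX9-GFX11" comment ties in with the cp_sdma_ge_use_system_memory_scope handling elsewhere in this commit: on gfx12 the CP/GE apparently no longer reads indirect arguments through GL2, so a dirty GL2 range has to be written back (SI_CONTEXT_WB_L2) before an indirect dispatch, just as on GFX8 and older.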
@@ -1054,6 +1262,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
       si_log_compute_state(sctx, sctx->log);
    }

+   if (sctx->gfx_level < GFX12) {
       /* Mark displayable DCC as dirty for bound images. */
       unsigned display_dcc_store_mask = sctx->images[PIPE_SHADER_COMPUTE].display_dcc_store_mask &
                                         BITFIELD_MASK(program->sel.info.base.num_images);
@@ -1065,6 +1274,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
       }

       /* TODO: Bindless images don't set displayable_dcc_dirty after image stores. */
+   }

    sctx->compute_is_busy = true;
    sctx->num_compute_calls++;
@@ -1127,4 +1337,40 @@ void si_init_compute_functions(struct si_context *sctx)
    sctx->b.set_compute_resources = si_set_compute_resources;
    sctx->b.set_global_binding = si_set_global_binding;
    sctx->b.launch_grid = si_launch_grid;
+
+#if 0 /* test for si_get_2d_interleave_size */
+   static bool visited = false;
+   if (visited)
+      return;
+
+   visited = true;
+   struct pipe_grid_info info = {};
+   info.grid[0] = info.grid[1] = info.grid[2] = 1024;
+   info.block[2] = 1;
+
+   for (unsigned block_3d = 0; block_3d < 2; block_3d++) {
+      printf("   WG size    |  WG block/SE  | Thread block/SE\n");
+
+      for (unsigned x = 1; x <= 32; x *= 2) {
+         for (unsigned y = 1; y <= 32; y *= 2) {
+            info.block[0] = x;
+            info.block[1] = y;
+            info.block[2] = block_3d ? 8 : 1;
+
+            if ((x * y) % 32)
+               continue;
+
+            unsigned log_x, log_y;
+            if (!si_get_2d_interleave_size(&info, &log_x, &log_y))
+               continue;
+
+            printf("   (%2u, %2u) = %3u | (%2u, %2u) = %2u | (%3u,%3u) = %u\n",
+                   info.block[0], info.block[1], info.block[0] * info.block[1],
+                   1 << log_x, 1 << log_y, (1 << log_x) * (1 << log_y),
+                   info.block[0] * (1 << log_x), info.block[1] * (1 << log_y),
+                   info.block[0] * (1 << log_x) * info.block[1] * (1 << log_y));
+         }
+      }
+   }
+#endif
 }
@@ -422,13 +422,19 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
       compute_min_size = 4 * 1024;
    }

-   /* TODO: use compute for unaligned big sizes */
+   /* TODO: use compute for 8-bit and 16-bit clear values */
    if (method == SI_AUTO_SELECT_CLEAR_METHOD &&
+       /* CP DMA doesn't support the render condition. */
       (flags & SI_OP_CS_RENDER_COND_ENABLE ||
+        /* CP DMA doesn't support large clear value sizes. */
        clear_value_size > 4 ||
-        (clear_value_size == 4 && offset % 4 == 0 && size > compute_min_size))) {
+        /* Use compute if CP DMA is non-coherent. */
+        (sctx->screen->info.cp_sdma_ge_use_system_memory_scope &&
+         clear_value_size >= 4) ||
+        /* Use compute if the size is large enough. */
+        (clear_value_size == 4 && offset % 4 == 0 && size > compute_min_size)))
       method = SI_COMPUTE_CLEAR_METHOD;
-   }
+
    if (method == SI_COMPUTE_CLEAR_METHOD) {
       si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
                                   clear_value_size, flags, coher);
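Concretely, with the new conditions: on a part where cp_sdma_ge_use_system_memory_scope is set, any dword-sized clear now takes the compute path regardless of size, since CP DMA there would bypass GL2; 8-bit and 16-bit clear values still fall through to CP DMA, which is what the updated TODO refers to.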
@@ -484,8 +490,9 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
    /* Only use compute for VRAM copies on dGPUs. */
    /* TODO: use compute for unaligned big sizes */
    if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
-       si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size &&
-       dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
+       si_resource(src)->domains & RADEON_DOMAIN_VRAM &&
+       dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0 &&
+       (size > compute_min_size || sctx->screen->info.cp_sdma_ge_use_system_memory_scope)) {
       si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0,
                                   flags, coher);
    } else {
@@ -810,6 +817,8 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u

 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
 {
+   assert(sctx->gfx_level < GFX12);
+
    /* Set the DCC buffer. */
    assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX);
    assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);
@@ -74,6 +74,9 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
       /* GDS increments the address, not CP. */
       command |= S_415_DAS(V_415_REGISTER) | S_415_DAIC(V_415_NO_INCREMENT);
    } else if (sctx->gfx_level >= GFX7 && cache_policy != L2_BYPASS) {
+      /* GFX12: DST_CACHE_POLICY is changed to DST_TEMPORAL, but the behavior is the same
+       * for values of 0 and 1.
+       */
       header |=
          S_501_DST_SEL(V_501_DST_ADDR_TC_L2) | S_501_DST_CACHE_POLICY(cache_policy == L2_STREAM);
    }
@@ -85,6 +88,9 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
       /* Both of these are required for GDS. It does increment the address. */
       command |= S_415_SAS(V_415_REGISTER) | S_415_SAIC(V_415_NO_INCREMENT);
    } else if (sctx->gfx_level >= GFX7 && cache_policy != L2_BYPASS) {
+      /* GFX12: SRC_CACHE_POLICY is changed to SRC_TEMPORAL, but the behavior is the same
+       * for values of 0 and 1.
+       */
       header |=
          S_501_SRC_SEL(V_501_SRC_ADDR_TC_L2) | S_501_SRC_CACHE_POLICY(cache_policy == L2_STREAM);
    }
@@ -191,6 +197,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    if (user_flags & SI_OP_SYNC_PS_BEFORE)
       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

+   /* TODO: Range-invalidate GL2 or always use compute shaders */
+   if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope)
+      sctx->flags |= SI_CONTEXT_INV_L2;
+
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
@@ -353,6 +363,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);

+   /* TODO: Range-flush GL2 for src and range-invalidate GL2 for dst, or always use compute shaders */
+   if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope)
+      sctx->flags |= SI_CONTEXT_INV_L2;
+
    if (sctx->flags)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

@@ -75,6 +75,8 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
       radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->shadowing.csa,
                                 RADEON_USAGE_READWRITE | RADEON_PRIO_DESCRIPTORS);
       si_pm4_emit_commands(sctx, shadowing_preamble);
+
+      if (sctx->gfx_level < GFX12)
         ac_emulate_clear_state(&sctx->screen->info, &sctx->gfx_cs, si_set_context_reg_array);

       /* TODO: Gfx11 fails GLCTS if we don't re-emit the preamble at the beginning of every IB. */
@@ -87,6 +89,7 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
       sctx->cs_preamble_state = NULL;
    }

+   if (sctx->gfx_level < GFX12)
       si_set_tracked_regs_to_clear_state(sctx);

    /* Setup preemption. The shadowing preamble will be executed as a preamble IB,
@@ -299,7 +299,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture
    state[0] = va >> 8;
    state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);

-   if (sscreen->info.gfx_level >= GFX8) {
+   if (sscreen->info.gfx_level >= GFX8 && sscreen->info.gfx_level < GFX12) {
       if (!(access & SI_IMAGE_ACCESS_DCC_OFF) && vi_dcc_enabled(tex, first_level)) {
          meta_va = tex->buffer.gpu_address + tex->surface.meta_offset;

@@ -330,7 +330,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture
        * of 256B.
        */
       if (sscreen->info.gfx_level >= GFX10_3 && tex->surface.u.gfx9.uses_custom_pitch) {
-         ASSERTED unsigned min_alignment = 256;
+         ASSERTED unsigned min_alignment = sscreen->info.gfx_level >= GFX12 ? 128 : 256;
         assert((tex->surface.u.gfx9.surf_pitch * tex->surface.bpe) % min_alignment == 0);
         assert(tex->buffer.b.b.target == PIPE_TEXTURE_2D ||
                tex->buffer.b.b.target == PIPE_TEXTURE_RECT);
@@ -341,11 +341,17 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture
         if (tex->surface.blk_w == 2)
            pitch *= 2;

+         if (sscreen->info.gfx_level >= GFX12) {
+            state[4] |= S_00A010_DEPTH_GFX12(pitch - 1) | /* DEPTH contains low bits of PITCH. */
+                        S_00A010_PITCH_MSB_GFX12((pitch - 1) >> 14);
+         } else {
            state[4] |= S_00A010_DEPTH_GFX10(pitch - 1) | /* DEPTH contains low bits of PITCH. */
                        S_00A010_PITCH_MSB_GFX103((pitch - 1) >> 13);
         }
+      }

       if (meta_va) {
+         /* Gfx10-11. */
         struct gfx9_surf_meta_flags meta = {
            .rb_aligned = 1,
            .pipe_aligned = 1,
@@ -573,6 +579,11 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader,

       si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc);

+      if (sctx->gfx_level >= GFX12) {
+         /* Gfx12 doesn't do any decompression. */
+         if (tex->buffer.b.b.target == PIPE_BUFFER)
+            tex->buffer.bind_history |= SI_BIND_SAMPLER_BUFFER(shader);
+      } else {
          if (tex->buffer.b.b.target == PIPE_BUFFER) {
             tex->buffer.bind_history |= SI_BIND_SAMPLER_BUFFER(shader);
             samplers->needs_depth_decompress_mask &= ~(1u << slot);
@@ -602,6 +613,7 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader,
                p_atomic_read(&tex->framebuffers_bound))
               sctx->need_check_render_feedback = true;
         }
+      }

       if (take_ownership) {
          pipe_sampler_view_reference(&samplers->views[slot], NULL);
@@ -647,6 +659,9 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader,

 static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsigned shader)
 {
+   if (sctx->gfx_level >= GFX12)
+      return;
+
    struct si_samplers *samplers = &sctx->samplers[shader];
    unsigned shader_bit = 1 << shader;

@@ -677,8 +692,11 @@ static void si_pipe_set_sampler_views(struct pipe_context *ctx, enum pipe_shader
    si_update_shader_needs_decompress_mask(sctx, shader);
 }

-static void si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers)
+static void si_samplers_update_needs_color_decompress_mask(struct si_context *sctx,
+                                                           struct si_samplers *samplers)
 {
+   assert(sctx->gfx_level < GFX12);
+
    unsigned mask = samplers->enabled_mask;

    while (mask) {
@@ -878,6 +896,11 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne
    if (&images->views[slot] != view)
       util_copy_image_view(&images->views[slot], view);

+   if (ctx->gfx_level >= GFX12) {
+      /* Gfx12 doesn't do any decompression. */
+      if (res->b.b.target == PIPE_BUFFER)
+         res->bind_history |= SI_BIND_IMAGE_BUFFER(shader);
+   } else {
       if (res->b.b.target == PIPE_BUFFER) {
          images->needs_color_decompress_mask &= ~(1 << slot);
          images->display_dcc_store_mask &= ~(1u << slot);
@@ -905,6 +928,7 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne
          if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound))
             ctx->need_check_render_feedback = true;
       }
+   }

    images->enabled_mask |= 1u << slot;
    ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
@ -951,8 +975,11 @@ static void si_set_shader_images(struct pipe_context *pipe, enum pipe_shader_typ
|
||||||
si_update_shader_needs_decompress_mask(ctx, shader);
|
si_update_shader_needs_decompress_mask(ctx, shader);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void si_images_update_needs_color_decompress_mask(struct si_images *images)
|
static void si_images_update_needs_color_decompress_mask(struct si_context *sctx,
|
||||||
|
struct si_images *images)
|
||||||
{
|
{
|
||||||
|
assert(sctx->gfx_level < GFX12);
|
||||||
|
|
||||||
unsigned mask = images->enabled_mask;
|
unsigned mask = images->enabled_mask;
|
||||||
|
|
||||||
while (mask) {
|
while (mask) {
|
||||||
|
|
@ -1132,7 +1159,10 @@ static void si_init_buffer_resources(struct si_context *sctx,
|
||||||
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
|
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
|
||||||
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
|
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
|
||||||
|
|
||||||
if (sctx->gfx_level >= GFX11) {
|
if (sctx->gfx_level >= GFX12) {
|
||||||
|
desc[3] |= S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
||||||
|
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
||||||
|
} else if (sctx->gfx_level >= GFX11) {
|
||||||
desc[3] |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
desc[3] |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
||||||
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
||||||
} else if (sctx->gfx_level >= GFX10) {
|
} else if (sctx->gfx_level >= GFX10) {
|
||||||
|
|
@ -1584,7 +1614,10 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource
|
||||||
desc[3] |= S_008F0C_ELEMENT_SIZE(element_size);
|
desc[3] |= S_008F0C_ELEMENT_SIZE(element_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sctx->gfx_level >= GFX11) {
|
if (sctx->gfx_level >= GFX12) {
|
||||||
|
desc[3] |= S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
||||||
|
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED);
|
||||||
|
} else if (sctx->gfx_level >= GFX11) {
|
||||||
desc[3] |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
desc[3] |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
||||||
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED);
|
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED);
|
||||||
} else if (sctx->gfx_level >= GFX10) {
|
} else if (sctx->gfx_level >= GFX10) {
|
||||||
|
|
@ -1631,6 +1664,8 @@ static void si_set_polygon_stipple(struct pipe_context *ctx, const struct pipe_p
|
||||||
|
|
||||||
static void si_resident_handles_update_needs_color_decompress(struct si_context *sctx)
|
static void si_resident_handles_update_needs_color_decompress(struct si_context *sctx)
|
||||||
{
|
{
|
||||||
|
assert(sctx->gfx_level < GFX12);
|
||||||
|
|
||||||
util_dynarray_clear(&sctx->resident_tex_needs_color_decompress);
|
util_dynarray_clear(&sctx->resident_tex_needs_color_decompress);
|
||||||
util_dynarray_clear(&sctx->resident_img_needs_color_decompress);
|
util_dynarray_clear(&sctx->resident_img_needs_color_decompress);
|
||||||
|
|
||||||
|
|
@ -1675,8 +1710,8 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx)
|
||||||
assert(sctx->gfx_level < GFX11);
|
assert(sctx->gfx_level < GFX11);
|
||||||
|
|
||||||
for (int i = 0; i < SI_NUM_SHADERS; ++i) {
|
for (int i = 0; i < SI_NUM_SHADERS; ++i) {
|
||||||
si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]);
|
si_samplers_update_needs_color_decompress_mask(sctx, &sctx->samplers[i]);
|
||||||
si_images_update_needs_color_decompress_mask(&sctx->images[i]);
|
si_images_update_needs_color_decompress_mask(sctx, &sctx->images[i]);
|
||||||
si_update_shader_needs_decompress_mask(sctx, i);
|
si_update_shader_needs_decompress_mask(sctx, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2002,6 +2037,11 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
|
||||||
|
|
||||||
/* Invalidate scalar L0 because the cache doesn't know that L2 changed. */
|
/* Invalidate scalar L0 because the cache doesn't know that L2 changed. */
|
||||||
sctx->flags |= SI_CONTEXT_INV_SCACHE;
|
sctx->flags |= SI_CONTEXT_INV_SCACHE;
|
||||||
|
|
||||||
|
/* TODO: Range-invalidate GL2 */
|
||||||
|
if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope)
|
||||||
|
sctx->flags |= SI_CONTEXT_INV_L2;
|
||||||
|
|
||||||
sctx->bindless_descriptors_dirty = false;
|
sctx->bindless_descriptors_dirty = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2234,9 +2274,23 @@ void si_shader_change_notify(struct si_context *sctx)
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#define gfx12_push_consecutive_shader_pointers(sctx, pointer_mask, sh_base, type) do { \
|
||||||
|
unsigned sh_reg_base = (sh_base); \
|
||||||
|
if (sh_reg_base) { \
|
||||||
|
unsigned mask = shader_pointers_dirty & (pointer_mask); \
|
||||||
|
\
|
||||||
|
u_foreach_bit(i, mask) { \
|
||||||
|
struct si_descriptors *descs = &sctx->descriptors[i]; \
|
||||||
|
unsigned sh_reg = sh_reg_base + descs->shader_userdata_offset; \
|
||||||
|
\
|
||||||
|
gfx12_push_##type##_sh_reg(sh_reg, descs->gpu_address); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs)
|
static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs)
|
||||||
{
|
{
|
||||||
assert(!sctx->screen->info.has_set_sh_pairs_packed);
|
assert(sctx->gfx_level < GFX12 && !sctx->screen->info.has_set_sh_pairs_packed);
|
||||||
|
|
||||||
radeon_begin(&sctx->gfx_cs);
|
radeon_begin(&sctx->gfx_cs);
|
||||||
|
|
||||||
|
|
@ -2280,6 +2334,16 @@ static void gfx11_push_global_shader_pointers(struct si_context *sctx, struct si
|
||||||
descs->gpu_address);
|
descs->gpu_address);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void gfx12_push_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs)
|
||||||
|
{
|
||||||
|
gfx12_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + descs->shader_userdata_offset,
|
||||||
|
descs->gpu_address);
|
||||||
|
gfx12_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + descs->shader_userdata_offset,
|
||||||
|
descs->gpu_address);
|
||||||
|
gfx12_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + descs->shader_userdata_offset,
|
||||||
|
descs->gpu_address);
|
||||||
|
}
|
||||||
|
|
||||||
void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index)
|
void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index)
|
||||||
{
|
{
|
||||||
uint32_t *sh_base = sctx->shader_pointers.sh_base;
|
uint32_t *sh_base = sctx->shader_pointers.sh_base;
|
||||||
|
|
@ -2303,7 +2367,33 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index)
|
||||||
si_upload_bindless_descriptors(sctx);
|
si_upload_bindless_descriptors(sctx);
|
||||||
|
|
||||||
/* Set shader pointers. */
|
/* Set shader pointers. */
|
||||||
if (sctx->screen->info.has_set_sh_pairs_packed) {
|
if (sctx->gfx_level >= GFX12) {
|
||||||
|
gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
|
||||||
|
sh_base[PIPE_SHADER_VERTEX], gfx);
|
||||||
|
gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
|
||||||
|
sh_base[PIPE_SHADER_TESS_EVAL], gfx);
|
||||||
|
gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT),
|
||||||
|
sh_base[PIPE_SHADER_FRAGMENT], gfx);
|
||||||
|
gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL),
|
||||||
|
sh_base[PIPE_SHADER_TESS_CTRL], gfx);
|
||||||
|
gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
|
||||||
|
sh_base[PIPE_SHADER_GEOMETRY], gfx);
|
||||||
|
|
||||||
|
if (sctx->gs_attribute_ring_pointer_dirty) {
|
||||||
|
gfx12_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
|
||||||
|
GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
|
||||||
|
sctx->screen->attribute_pos_prim_ring->gpu_address);
|
||||||
|
sctx->gs_attribute_ring_pointer_dirty = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (shader_pointers_dirty & (1 << SI_DESCS_INTERNAL))
|
||||||
|
gfx12_push_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_INTERNAL]);
|
||||||
|
|
||||||
|
if (sctx->graphics_bindless_pointer_dirty) {
|
||||||
|
gfx12_push_global_shader_pointers(sctx, &sctx->bindless_descriptors);
|
||||||
|
sctx->graphics_bindless_pointer_dirty = false;
|
||||||
|
}
|
||||||
|
} else if (sctx->screen->info.has_set_sh_pairs_packed) {
|
||||||
gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
|
gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
|
||||||
sh_base[PIPE_SHADER_VERTEX], gfx);
|
sh_base[PIPE_SHADER_VERTEX], gfx);
|
||||||
gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
|
gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
|
||||||
|
|
@ -2318,7 +2408,7 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index)
|
||||||
if (sctx->gs_attribute_ring_pointer_dirty) {
|
if (sctx->gs_attribute_ring_pointer_dirty) {
|
||||||
gfx11_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
|
gfx11_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
|
||||||
GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
|
GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
|
||||||
sctx->screen->attribute_ring->gpu_address);
|
sctx->screen->attribute_pos_prim_ring->gpu_address);
|
||||||
sctx->gs_attribute_ring_pointer_dirty = false;
|
sctx->gs_attribute_ring_pointer_dirty = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2346,7 +2436,7 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index)
|
||||||
assert(sctx->gfx_level >= GFX11);
|
assert(sctx->gfx_level >= GFX11);
|
||||||
radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
|
radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
|
||||||
GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
|
GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
|
||||||
sctx->screen->attribute_ring->gpu_address);
|
sctx->screen->attribute_pos_prim_ring->gpu_address);
|
||||||
sctx->gs_attribute_ring_pointer_dirty = false;
|
sctx->gs_attribute_ring_pointer_dirty = false;
|
||||||
}
|
}
|
||||||
radeon_end();
|
radeon_end();
|
||||||
|
|
@ -2383,7 +2473,17 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
|
||||||
radeon_begin(&sctx->gfx_cs);
|
radeon_begin(&sctx->gfx_cs);
|
||||||
|
|
||||||
/* Set shader pointers. */
|
/* Set shader pointers. */
|
||||||
if (sctx->screen->info.has_set_sh_pairs_packed) {
|
if (sctx->gfx_level >= GFX12) {
|
||||||
|
gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
|
||||||
|
R_00B900_COMPUTE_USER_DATA_0, compute);
|
||||||
|
|
||||||
|
if (sctx->compute_bindless_pointer_dirty) {
|
||||||
|
gfx12_push_compute_sh_reg(R_00B900_COMPUTE_USER_DATA_0 +
|
||||||
|
sctx->bindless_descriptors.shader_userdata_offset,
|
||||||
|
sctx->bindless_descriptors.gpu_address);
|
||||||
|
sctx->compute_bindless_pointer_dirty = false;
|
||||||
|
}
|
||||||
|
} else if (sctx->screen->info.has_set_sh_pairs_packed) {
|
||||||
gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
|
gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
|
||||||
R_00B900_COMPUTE_USER_DATA_0, compute);
|
R_00B900_COMPUTE_USER_DATA_0, compute);
|
||||||
|
|
||||||
|
|
@ -2641,6 +2741,7 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t h
|
||||||
if (sview->base.texture->target != PIPE_BUFFER) {
|
if (sview->base.texture->target != PIPE_BUFFER) {
|
||||||
struct si_texture *tex = (struct si_texture *)sview->base.texture;
|
struct si_texture *tex = (struct si_texture *)sview->base.texture;
|
||||||
|
|
||||||
|
if (sctx->gfx_level < GFX12) {
|
||||||
if (depth_needs_decompression(tex, sview->is_stencil_sampler)) {
|
if (depth_needs_decompression(tex, sview->is_stencil_sampler)) {
|
||||||
util_dynarray_append(&sctx->resident_tex_needs_depth_decompress,
|
util_dynarray_append(&sctx->resident_tex_needs_depth_decompress,
|
||||||
struct si_texture_handle *, tex_handle);
|
struct si_texture_handle *, tex_handle);
|
||||||
|
|
@ -2654,6 +2755,7 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t h
|
||||||
if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) &&
|
if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) &&
|
||||||
p_atomic_read(&tex->framebuffers_bound))
|
p_atomic_read(&tex->framebuffers_bound))
|
||||||
sctx->need_check_render_feedback = true;
|
sctx->need_check_render_feedback = true;
|
||||||
|
}
|
||||||
|
|
||||||
si_update_bindless_texture_descriptor(sctx, tex_handle);
|
si_update_bindless_texture_descriptor(sctx, tex_handle);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -2680,7 +2782,7 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t h
|
||||||
util_dynarray_delete_unordered(&sctx->resident_tex_handles, struct si_texture_handle *,
|
util_dynarray_delete_unordered(&sctx->resident_tex_handles, struct si_texture_handle *,
|
||||||
tex_handle);
|
tex_handle);
|
||||||
|
|
||||||
if (sview->base.texture->target != PIPE_BUFFER) {
|
if (sctx->gfx_level < GFX12 && sview->base.texture->target != PIPE_BUFFER) {
|
||||||
util_dynarray_delete_unordered(&sctx->resident_tex_needs_depth_decompress,
|
util_dynarray_delete_unordered(&sctx->resident_tex_needs_depth_decompress,
|
||||||
struct si_texture_handle *, tex_handle);
|
struct si_texture_handle *, tex_handle);
|
||||||
|
|
||||||
|
|
@ -2768,6 +2870,7 @@ static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t han
|
||||||
struct si_texture *tex = (struct si_texture *)res;
|
struct si_texture *tex = (struct si_texture *)res;
|
||||||
unsigned level = view->u.tex.level;
|
unsigned level = view->u.tex.level;
|
||||||
|
|
||||||
|
if (sctx->gfx_level < GFX12) {
|
||||||
if (color_needs_decompression(tex)) {
|
if (color_needs_decompression(tex)) {
|
||||||
util_dynarray_append(&sctx->resident_img_needs_color_decompress,
|
util_dynarray_append(&sctx->resident_img_needs_color_decompress,
|
||||||
struct si_image_handle *, img_handle);
|
struct si_image_handle *, img_handle);
|
||||||
|
|
@ -2775,6 +2878,7 @@ static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t han
|
||||||
|
|
||||||
if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound))
|
if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound))
|
||||||
sctx->need_check_render_feedback = true;
|
sctx->need_check_render_feedback = true;
|
||||||
|
}
|
||||||
|
|
||||||
si_update_bindless_image_descriptor(sctx, img_handle);
|
si_update_bindless_image_descriptor(sctx, img_handle);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -2802,7 +2906,7 @@ static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t han
|
||||||
util_dynarray_delete_unordered(&sctx->resident_img_handles, struct si_image_handle *,
|
util_dynarray_delete_unordered(&sctx->resident_img_handles, struct si_image_handle *,
|
||||||
img_handle);
|
img_handle);
|
||||||
|
|
||||||
if (res->b.b.target != PIPE_BUFFER) {
|
if (sctx->gfx_level < GFX12 && res->b.b.target != PIPE_BUFFER) {
|
||||||
util_dynarray_delete_unordered(&sctx->resident_img_needs_color_decompress,
|
util_dynarray_delete_unordered(&sctx->resident_img_needs_color_decompress,
|
||||||
struct si_image_handle *, img_handle);
|
struct si_image_handle *, img_handle);
|
||||||
}
|
}
|
||||||
|
|
@ -2846,7 +2950,10 @@ void si_init_all_descriptors(struct si_context *sctx)
|
||||||
unsigned first_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
|
unsigned first_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
|
||||||
unsigned hs_sgpr0, gs_sgpr0;
|
unsigned hs_sgpr0, gs_sgpr0;
|
||||||
|
|
||||||
if (sctx->gfx_level >= GFX11) {
|
if (sctx->gfx_level >= GFX12) {
|
||||||
|
hs_sgpr0 = R_00B410_SPI_SHADER_PGM_LO_HS;
|
||||||
|
gs_sgpr0 = R_00B210_SPI_SHADER_PGM_LO_GS;
|
||||||
|
} else if (sctx->gfx_level >= GFX11) {
|
||||||
hs_sgpr0 = R_00B420_SPI_SHADER_PGM_LO_HS;
|
hs_sgpr0 = R_00B420_SPI_SHADER_PGM_LO_HS;
|
||||||
gs_sgpr0 = R_00B220_SPI_SHADER_PGM_LO_GS;
|
gs_sgpr0 = R_00B220_SPI_SHADER_PGM_LO_GS;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -434,7 +434,7 @@ static void si_flush_all_queues(struct pipe_context *ctx,
|
||||||
struct si_fine_fence fine = {};
|
struct si_fine_fence fine = {};
|
||||||
unsigned rflags = PIPE_FLUSH_ASYNC;
|
unsigned rflags = PIPE_FLUSH_ASYNC;
|
||||||
|
|
||||||
if (!(flags & PIPE_FLUSH_DEFERRED)) {
|
if (sctx->gfx_level < GFX12 && !(flags & PIPE_FLUSH_DEFERRED)) {
|
||||||
si_flush_implicit_resources(sctx);
|
si_flush_implicit_resources(sctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||||
struct si_screen *sscreen = (struct si_screen *)pscreen;
|
struct si_screen *sscreen = (struct si_screen *)pscreen;
|
||||||
|
|
||||||
/* Gfx8 (Polaris11) hangs, so don't enable this on Gfx8 and older chips. */
|
/* Gfx8 (Polaris11) hangs, so don't enable this on Gfx8 and older chips. */
|
||||||
bool enable_sparse = sscreen->info.gfx_level >= GFX9 &&
|
bool enable_sparse = sscreen->info.gfx_level >= GFX9 && sscreen->info.gfx_level < GFX12 &&
|
||||||
sscreen->info.has_sparse_vm_mappings;
|
sscreen->info.has_sparse_vm_mappings;
|
||||||
|
|
||||||
switch (param) {
|
switch (param) {
|
||||||
|
|
@ -324,23 +324,21 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||||
|
|
||||||
/* Texturing. */
|
/* Texturing. */
|
||||||
case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
|
case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
|
||||||
return 16384;
|
/* TODO: Gfx12 supports 64K textures, but Gallium can't represent them at the moment. */
|
||||||
|
return sscreen->info.gfx_level >= GFX12 ? 32768 : 16384;
|
||||||
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
|
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
|
||||||
if (!sscreen->info.has_3d_cube_border_color_mipmap)
|
if (!sscreen->info.has_3d_cube_border_color_mipmap)
|
||||||
return 0;
|
return 0;
|
||||||
return 15; /* 16384 */
|
return sscreen->info.gfx_level >= GFX12 ? 16 : 15; /* 32K : 16K */
|
||||||
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
|
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
|
||||||
if (!sscreen->info.has_3d_cube_border_color_mipmap)
|
if (!sscreen->info.has_3d_cube_border_color_mipmap)
|
||||||
return 0;
|
return 0;
|
||||||
if (sscreen->info.gfx_level >= GFX10)
|
/* This is limited by maximums that both the texture unit and layered rendering support. */
|
||||||
return 14;
|
return sscreen->info.gfx_level >= GFX12 ? 15 : /* 16K */
|
||||||
/* textures support 8192, but layered rendering supports 2048 */
|
sscreen->info.gfx_level >= GFX10 ? 14 : 12; /* 8K : 2K */
|
||||||
return 12;
|
|
||||||
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
|
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
|
||||||
if (sscreen->info.gfx_level >= GFX10)
|
/* This is limited by maximums that both the texture unit and layered rendering support. */
|
||||||
return 8192;
|
return sscreen->info.gfx_level >= GFX10 ? 8192 : 2048;
|
||||||
/* textures support 8192, but layered rendering supports 2048 */
|
|
||||||
return 2048;
|
|
||||||
|
|
||||||
/* Sparse texture */
|
/* Sparse texture */
|
||||||
case PIPE_CAP_MAX_SPARSE_TEXTURE_SIZE:
|
case PIPE_CAP_MAX_SPARSE_TEXTURE_SIZE:
|
||||||
|
|
|
||||||
|
|
@ -140,6 +140,13 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
|
||||||
if (ctx->streamout.begin_emitted) {
|
if (ctx->streamout.begin_emitted) {
|
||||||
si_emit_streamout_end(ctx);
|
si_emit_streamout_end(ctx);
|
||||||
ctx->streamout.suspended = true;
|
ctx->streamout.suspended = true;
|
||||||
|
|
||||||
|
/* Make sure streamout is idle because the next process might change
|
||||||
|
* GE_GS_ORDERED_ID_BASE (which must not be changed when streamout is busy)
|
||||||
|
* and make this process guilty of hanging.
|
||||||
|
*/
|
||||||
|
if (ctx->gfx_level >= GFX12)
|
||||||
|
wait_flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -268,6 +275,7 @@ static void si_add_gds_to_buffer_list(struct si_context *sctx)
|
||||||
|
|
||||||
void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
|
void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
|
||||||
{
|
{
|
||||||
|
assert(ctx->gfx_level < GFX12);
|
||||||
STATIC_ASSERT(SI_NUM_ALL_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved_mask) * 8);
|
STATIC_ASSERT(SI_NUM_ALL_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved_mask) * 8);
|
||||||
|
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0;
|
||||||
|
|
@ -311,6 +319,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0;
|
||||||
|
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0;
|
||||||
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0;
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0;
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
|
ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
|
ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
|
||||||
|
|
@ -359,7 +368,6 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0;
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0;
|
||||||
|
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0;
|
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0;
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0;
|
||||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0;
|
ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0;
|
||||||
|
|
@ -477,8 +485,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
|
||||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush);
|
si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush);
|
||||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_ge_ring_state);
|
si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_ge_ring_state);
|
||||||
|
|
||||||
if (ctx->screen->attribute_ring) {
|
if (ctx->screen->attribute_pos_prim_ring) {
|
||||||
radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring,
|
radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_pos_prim_ring,
|
||||||
RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS);
|
RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS);
|
||||||
}
|
}
|
||||||
if (ctx->border_color_buffer) {
|
if (ctx->border_color_buffer) {
|
||||||
|
|
@ -816,23 +824,27 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
|
||||||
*/
|
*/
|
||||||
if (flags & SI_CONTEXT_INV_L2) {
|
if (flags & SI_CONTEXT_INV_L2) {
|
||||||
/* Writeback and invalidate everything in L2. */
|
/* Writeback and invalidate everything in L2. */
|
||||||
gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
|
gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) |
|
||||||
|
(ctx->gfx_level < GFX12 ? S_586_GLM_INV(1) | S_586_GLM_WB(1) : 0);
|
||||||
ctx->num_L2_invalidates++;
|
ctx->num_L2_invalidates++;
|
||||||
} else if (flags & SI_CONTEXT_WB_L2) {
|
} else if (flags & SI_CONTEXT_WB_L2) {
|
||||||
gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
|
gcr_cntl |= S_586_GL2_WB(1) |
|
||||||
|
(ctx->gfx_level < GFX12 ? S_586_GLM_WB(1) | S_586_GLM_INV(1) : 0);
|
||||||
} else if (flags & SI_CONTEXT_INV_L2_METADATA) {
|
} else if (flags & SI_CONTEXT_INV_L2_METADATA) {
|
||||||
|
assert(ctx->gfx_level < GFX12);
|
||||||
gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
|
gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
|
if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
|
||||||
if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
|
if (ctx->gfx_level < GFX12 && flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
|
||||||
/* Flush CMASK/FMASK/DCC. Will wait for idle later. */
|
/* Flush CMASK/FMASK/DCC. Will wait for idle later. */
|
||||||
radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||||
radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
|
radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Gfx11 can't flush DB_META and should use a TS event instead. */
|
/* Gfx11 can't flush DB_META and should use a TS event instead. */
|
||||||
if (ctx->gfx_level != GFX11 && flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
|
if (ctx->gfx_level < GFX12 && ctx->gfx_level != GFX11 &&
|
||||||
|
flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
|
||||||
/* Flush HTILE. Will wait for idle later. */
|
/* Flush HTILE. Will wait for idle later. */
|
||||||
radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||||
radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
|
radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,7 @@ static void si_update_mmio_counters(struct si_screen *sscreen, union si_mmio_cou
|
||||||
sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);
|
sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);
|
||||||
|
|
||||||
UPDATE_COUNTER(ta, TA_BUSY);
|
UPDATE_COUNTER(ta, TA_BUSY);
|
||||||
|
if (sscreen->info.gfx_level < GFX12)
|
||||||
UPDATE_COUNTER(gds, GDS_BUSY);
|
UPDATE_COUNTER(gds, GDS_BUSY);
|
||||||
UPDATE_COUNTER(vgt, VGT_BUSY);
|
UPDATE_COUNTER(vgt, VGT_BUSY);
|
||||||
UPDATE_COUNTER(ia, IA_BUSY);
|
UPDATE_COUNTER(ia, IA_BUSY);
|
||||||
|
|
|
||||||
|
|
@ -54,7 +54,9 @@ static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
|
||||||
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
|
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
|
||||||
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
|
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
|
||||||
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
|
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
|
||||||
S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) |
|
(sel->screen->info.gfx_level >= GFX12 ?
|
||||||
|
S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) :
|
||||||
|
S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT)) |
|
||||||
S_008F0C_INDEX_STRIDE(2) /* 32 elements */),
|
S_008F0C_INDEX_STRIDE(2) /* 32 elements */),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -138,7 +140,10 @@ static nir_def *build_tess_ring_desc(nir_builder *b, struct si_screen *screen,
|
||||||
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
|
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
|
||||||
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
|
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
|
||||||
|
|
||||||
if (screen->info.gfx_level >= GFX11) {
|
if (screen->info.gfx_level >= GFX12) {
|
||||||
|
rsrc3 |= S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
||||||
|
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
||||||
|
} else if (screen->info.gfx_level >= GFX11) {
|
||||||
rsrc3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
rsrc3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
||||||
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
||||||
} else if (screen->info.gfx_level >= GFX10) {
|
} else if (screen->info.gfx_level >= GFX10) {
|
||||||
|
|
@ -474,6 +479,12 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
|
||||||
replacement = si_nir_load_internal_binding(b, args, slot, 4);
|
replacement = si_nir_load_internal_binding(b, args, slot, 4);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case nir_intrinsic_load_xfb_state_address_gfx12_amd: {
|
||||||
|
nir_def *address = si_nir_load_internal_binding(b, args, SI_STREAMOUT_STATE_BUF, 1);
|
||||||
|
nir_def *address32_hi = nir_imm_int(b, s->shader->selector->screen->info.address32_hi);
|
||||||
|
replacement = nir_pack_64_2x32_split(b, address, address32_hi);
|
||||||
|
break;
|
||||||
|
}
|
||||||
case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
|
case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
|
||||||
case nir_intrinsic_atomic_add_shader_invocation_count_amd: {
|
case nir_intrinsic_atomic_add_shader_invocation_count_amd: {
|
||||||
enum pipe_statistics_query_index index =
|
enum pipe_statistics_query_index index =
|
||||||
|
|
@ -613,7 +624,8 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_intrinsic_load_layer_id:
|
case nir_intrinsic_load_layer_id:
|
||||||
replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 16, 13);
|
replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary,
|
||||||
|
16, sel->screen->info.gfx_level >= GFX12 ? 14 : 13);
|
||||||
break;
|
break;
|
||||||
case nir_intrinsic_load_color0:
|
case nir_intrinsic_load_color0:
|
||||||
case nir_intrinsic_load_color1: {
|
case nir_intrinsic_load_color1: {
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,10 @@ static nir_def *load_ubo_desc_fast_path(nir_builder *b, nir_def *addr_lo,
|
||||||
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
|
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
|
||||||
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
|
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
|
||||||
|
|
||||||
if (sel->screen->info.gfx_level >= GFX11)
|
if (sel->screen->info.gfx_level >= GFX12)
|
||||||
|
rsrc3 |= S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
||||||
|
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
||||||
|
else if (sel->screen->info.gfx_level >= GFX11)
|
||||||
rsrc3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
rsrc3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) |
|
||||||
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
|
||||||
else if (sel->screen->info.gfx_level >= GFX10)
|
else if (sel->screen->info.gfx_level >= GFX10)
|
||||||
|
|
|
||||||
|
|
@ -715,6 +715,9 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
|
||||||
case GFX11_5:
|
case GFX11_5:
|
||||||
si_init_draw_functions_GFX11_5(sctx);
|
si_init_draw_functions_GFX11_5(sctx);
|
||||||
break;
|
break;
|
||||||
|
case GFX12:
|
||||||
|
si_init_draw_functions_GFX12(sctx);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
unreachable("unhandled gfx level");
|
unreachable("unhandled gfx level");
|
||||||
}
|
}
|
||||||
|
|
@ -978,7 +981,7 @@ static void si_destroy_screen(struct pipe_screen *pscreen)
|
||||||
sscreen->num_disk_shader_cache_misses);
|
sscreen->num_disk_shader_cache_misses);
|
||||||
}
|
}
|
||||||
|
|
||||||
si_resource_reference(&sscreen->attribute_ring, NULL);
|
si_resource_reference(&sscreen->attribute_pos_prim_ring, NULL);
|
||||||
pipe_resource_reference(&sscreen->tess_rings, NULL);
|
pipe_resource_reference(&sscreen->tess_rings, NULL);
|
||||||
pipe_resource_reference(&sscreen->tess_rings_tmz, NULL);
|
pipe_resource_reference(&sscreen->tess_rings_tmz, NULL);
|
||||||
|
|
||||||
|
|
@ -1456,7 +1459,8 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
|
||||||
sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
|
sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
|
||||||
|
|
||||||
sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
|
sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
|
||||||
if (sscreen->info.gfx_level <= GFX8) {
|
|
||||||
|
if (sscreen->info.gfx_level <= GFX8 || sscreen->info.cp_sdma_ge_use_system_memory_scope) {
|
||||||
sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
|
sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
|
||||||
sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
|
sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
|
||||||
}
|
}
|
||||||
|
|
@ -1490,14 +1494,15 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sscreen->info.gfx_level >= GFX11) {
|
if (sscreen->info.gfx_level >= GFX11) {
|
||||||
unsigned attr_ring_size = sscreen->info.attribute_ring_size_per_se * sscreen->info.max_se;
|
sscreen->attribute_pos_prim_ring =
|
||||||
sscreen->attribute_ring = si_aligned_buffer_create(&sscreen->b,
|
si_aligned_buffer_create(&sscreen->b,
|
||||||
PIPE_RESOURCE_FLAG_UNMAPPABLE |
|
PIPE_RESOURCE_FLAG_UNMAPPABLE |
|
||||||
SI_RESOURCE_FLAG_32BIT |
|
SI_RESOURCE_FLAG_32BIT |
|
||||||
SI_RESOURCE_FLAG_DRIVER_INTERNAL |
|
SI_RESOURCE_FLAG_DRIVER_INTERNAL |
|
||||||
SI_RESOURCE_FLAG_DISCARDABLE,
|
SI_RESOURCE_FLAG_DISCARDABLE,
|
||||||
PIPE_USAGE_DEFAULT,
|
PIPE_USAGE_DEFAULT,
|
||||||
attr_ring_size, 2 * 1024 * 1024);
|
sscreen->info.total_attribute_pos_prim_ring_size,
|
||||||
|
2 * 1024 * 1024);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Create the auxiliary context. This must be done last. */
|
/* Create the auxiliary context. This must be done last. */
|
||||||
|
|
@ -1535,6 +1540,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
|
||||||
if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER)))
|
if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER)))
|
||||||
si_test_vmfault(sscreen, test_flags);
|
si_test_vmfault(sscreen, test_flags);
|
||||||
|
|
||||||
|
if (sscreen->info.gfx_level < GFX12) {
|
||||||
if (test_flags & DBG(TEST_GDS))
|
if (test_flags & DBG(TEST_GDS))
|
||||||
si_test_gds((struct si_context *)sscreen->aux_context.general.ctx);
|
si_test_gds((struct si_context *)sscreen->aux_context.general.ctx);
|
||||||
|
|
||||||
|
|
@ -1546,6 +1552,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
|
||||||
si_test_gds_memory_management((struct si_context *)sscreen->aux_context.general.ctx,
|
si_test_gds_memory_management((struct si_context *)sscreen->aux_context.general.ctx,
|
||||||
4, 1, RADEON_DOMAIN_OA);
|
4, 1, RADEON_DOMAIN_OA);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ac_print_nonshadowed_regs(sscreen->info.gfx_level, sscreen->info.family);
|
ac_print_nonshadowed_regs(sscreen->info.gfx_level, sscreen->info.family);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -464,6 +464,7 @@ struct si_surface {
|
||||||
/* Color registers. */
|
/* Color registers. */
|
||||||
unsigned cb_color_info;
|
unsigned cb_color_info;
|
||||||
unsigned cb_color_view;
|
unsigned cb_color_view;
|
||||||
|
unsigned cb_color_view2;
|
||||||
unsigned cb_color_attrib;
|
unsigned cb_color_attrib;
|
||||||
unsigned cb_color_attrib2; /* GFX9 and later */
|
unsigned cb_color_attrib2; /* GFX9 and later */
|
||||||
unsigned cb_color_attrib3; /* GFX10 and later */
|
unsigned cb_color_attrib3; /* GFX10 and later */
|
||||||
|
|
@ -474,18 +475,33 @@ struct si_surface {
|
||||||
unsigned spi_shader_col_format_blend_alpha : 8; /* blending with alpha. */
|
unsigned spi_shader_col_format_blend_alpha : 8; /* blending with alpha. */
|
||||||
|
|
||||||
/* DB registers. */
|
/* DB registers. */
|
||||||
uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */
|
|
||||||
uint64_t db_stencil_base;
|
|
||||||
uint64_t db_htile_data_base;
|
|
||||||
unsigned db_depth_info;
|
|
||||||
unsigned db_z_info;
|
|
||||||
unsigned db_z_info2; /* GFX9 only */
|
|
||||||
unsigned db_depth_view;
|
unsigned db_depth_view;
|
||||||
unsigned db_depth_size;
|
unsigned db_depth_size;
|
||||||
unsigned db_depth_slice;
|
unsigned db_z_info;
|
||||||
unsigned db_stencil_info;
|
unsigned db_stencil_info;
|
||||||
|
uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */
|
||||||
|
uint64_t db_stencil_base;
|
||||||
|
|
||||||
|
union {
|
||||||
|
struct {
|
||||||
|
uint64_t db_htile_data_base;
|
||||||
|
unsigned db_depth_info;
|
||||||
|
unsigned db_z_info2; /* GFX9 only */
|
||||||
|
unsigned db_depth_slice;
|
||||||
unsigned db_stencil_info2; /* GFX9 only */
|
unsigned db_stencil_info2; /* GFX9 only */
|
||||||
unsigned db_htile_surface;
|
unsigned db_htile_surface;
|
||||||
|
} gfx6;
|
||||||
|
|
||||||
|
struct {
|
||||||
|
uint64_t hiz_base;
|
||||||
|
unsigned hiz_info;
|
||||||
|
unsigned hiz_size_xy;
|
||||||
|
uint64_t his_base;
|
||||||
|
unsigned his_info;
|
||||||
|
unsigned his_size_xy;
|
||||||
|
unsigned db_depth_view1;
|
||||||
|
} gfx12;
|
||||||
|
} u;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct si_mmio_counter {
|
struct si_mmio_counter {
|
||||||
|
|
@ -712,7 +728,7 @@ struct si_screen {
|
||||||
struct util_idalloc_mt buffer_ids;
|
struct util_idalloc_mt buffer_ids;
|
||||||
struct util_vertex_state_cache vertex_state_cache;
|
struct util_vertex_state_cache vertex_state_cache;
|
||||||
|
|
||||||
struct si_resource *attribute_ring;
|
struct si_resource *attribute_pos_prim_ring;
|
||||||
|
|
||||||
simple_mtx_t tess_ring_lock;
|
simple_mtx_t tess_ring_lock;
|
||||||
struct pipe_resource *tess_rings;
|
struct pipe_resource *tess_rings;
|
||||||
|
|
@ -805,6 +821,7 @@ struct si_framebuffer {
|
||||||
bool has_dcc_msaa;
|
bool has_dcc_msaa;
|
||||||
bool disable_vrs_flat_shading;
|
bool disable_vrs_flat_shading;
|
||||||
bool has_stencil;
|
bool has_stencil;
|
||||||
|
bool has_hiz_his;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum si_quant_mode
|
enum si_quant_mode
|
||||||
|
|
@ -834,6 +851,7 @@ struct si_streamout_target {
|
||||||
/* The buffer where BUFFER_FILLED_SIZE is stored. */
|
/* The buffer where BUFFER_FILLED_SIZE is stored. */
|
||||||
struct si_resource *buf_filled_size;
|
struct si_resource *buf_filled_size;
|
||||||
unsigned buf_filled_size_offset;
|
unsigned buf_filled_size_offset;
|
||||||
|
unsigned buf_filled_size_draw_count_offset;
|
||||||
bool buf_filled_size_valid;
|
bool buf_filled_size_valid;
|
||||||
|
|
||||||
unsigned stride_in_dw;
|
unsigned stride_in_dw;
|
||||||
|
|
@ -968,6 +986,12 @@ struct gfx11_reg_pair {
|
||||||
uint32_t reg_value[2];
|
uint32_t reg_value[2];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* A pair of values for SET_*_REG_PAIRS. */
|
||||||
|
struct gfx12_reg {
|
||||||
|
uint32_t reg_offset;
|
||||||
|
uint32_t reg_value;
|
||||||
|
};
|
||||||
|
|
||||||
typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx,
|
typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx,
|
||||||
struct pipe_vertex_state *vstate,
|
struct pipe_vertex_state *vstate,
|
||||||
uint32_t partial_velem_mask,
|
uint32_t partial_velem_mask,
|
||||||
|
|
@ -1070,11 +1094,18 @@ struct si_context {
|
||||||
/* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_*. */
|
/* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_*. */
|
||||||
unsigned num_buffered_gfx_sh_regs;
|
unsigned num_buffered_gfx_sh_regs;
|
||||||
unsigned num_buffered_compute_sh_regs;
|
unsigned num_buffered_compute_sh_regs;
|
||||||
|
union {
|
||||||
struct {
|
struct {
|
||||||
struct gfx11_reg_pair buffered_gfx_sh_regs[32];
|
struct gfx11_reg_pair buffered_gfx_sh_regs[32];
|
||||||
struct gfx11_reg_pair buffered_compute_sh_regs[32];
|
struct gfx11_reg_pair buffered_compute_sh_regs[32];
|
||||||
} gfx11;
|
} gfx11;
|
||||||
|
|
||||||
|
struct {
|
||||||
|
struct gfx12_reg buffered_gfx_sh_regs[64];
|
||||||
|
struct gfx12_reg buffered_compute_sh_regs[64];
|
||||||
|
} gfx12;
|
||||||
|
};
|
||||||
|
|
||||||
/* Atom declarations. */
|
/* Atom declarations. */
|
||||||
struct si_framebuffer framebuffer;
|
struct si_framebuffer framebuffer;
|
||||||
unsigned sample_locs_num_samples;
|
unsigned sample_locs_num_samples;
|
||||||
|
|
@ -1117,6 +1148,7 @@ struct si_context {
|
||||||
struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS];
|
struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS];
|
||||||
};
|
};
|
||||||
struct si_cs_shader_state cs_shader_state;
|
struct si_cs_shader_state cs_shader_state;
|
||||||
|
bool compute_ping_pong_launch;
|
||||||
/* if current tcs set by user */
|
/* if current tcs set by user */
|
||||||
bool is_user_tcs;
|
bool is_user_tcs;
|
||||||
|
|
||||||
|
|
@ -1186,6 +1218,7 @@ struct si_context {
|
||||||
|
|
||||||
/* DB render state. */
|
/* DB render state. */
|
||||||
unsigned ps_db_shader_control;
|
unsigned ps_db_shader_control;
|
||||||
|
unsigned ps_pa_sc_hisz_control;
|
||||||
unsigned dbcb_copy_sample;
|
unsigned dbcb_copy_sample;
|
||||||
bool dbcb_depth_copy_enabled : 1;
|
bool dbcb_depth_copy_enabled : 1;
|
||||||
bool dbcb_stencil_copy_enabled : 1;
|
bool dbcb_stencil_copy_enabled : 1;
|
||||||
|
|
@ -1922,7 +1955,7 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned
|
||||||
sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VCACHE;
|
sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VCACHE;
|
||||||
sctx->force_shader_coherency.with_cb = false;
|
sctx->force_shader_coherency.with_cb = false;
|
||||||
|
|
||||||
if (sctx->gfx_level >= GFX10) {
|
if (sctx->gfx_level >= GFX10 && sctx->gfx_level < GFX12) {
|
||||||
if (sctx->screen->info.tcc_rb_non_coherent)
|
if (sctx->screen->info.tcc_rb_non_coherent)
|
||||||
sctx->flags |= SI_CONTEXT_INV_L2;
|
sctx->flags |= SI_CONTEXT_INV_L2;
|
||||||
else if (shaders_read_metadata)
|
else if (shaders_read_metadata)
|
||||||
|
|
@ -1936,7 +1969,7 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned
|
||||||
sctx->flags |= SI_CONTEXT_INV_L2;
|
sctx->flags |= SI_CONTEXT_INV_L2;
|
||||||
else if (shaders_read_metadata)
|
else if (shaders_read_metadata)
|
||||||
sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
|
sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
|
||||||
} else {
|
} else if (sctx->gfx_level <= GFX8) {
|
||||||
/* GFX6-GFX8 */
|
/* GFX6-GFX8 */
|
||||||
sctx->flags |= SI_CONTEXT_INV_L2;
|
sctx->flags |= SI_CONTEXT_INV_L2;
|
||||||
}
|
}
|
||||||
|
|
@ -1950,7 +1983,7 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned
|
||||||
sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_INV_VCACHE;
|
sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_INV_VCACHE;
|
||||||
sctx->force_shader_coherency.with_db = false;
|
sctx->force_shader_coherency.with_db = false;
|
||||||
|
|
||||||
if (sctx->gfx_level >= GFX10) {
|
if (sctx->gfx_level >= GFX10 && sctx->gfx_level < GFX12) {
|
||||||
if (sctx->screen->info.tcc_rb_non_coherent)
|
if (sctx->screen->info.tcc_rb_non_coherent)
|
||||||
sctx->flags |= SI_CONTEXT_INV_L2;
|
sctx->flags |= SI_CONTEXT_INV_L2;
|
||||||
else if (shaders_read_metadata)
|
else if (shaders_read_metadata)
|
||||||
|
|
@ -1964,7 +1997,7 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned
|
||||||
sctx->flags |= SI_CONTEXT_INV_L2;
|
sctx->flags |= SI_CONTEXT_INV_L2;
|
||||||
else if (shaders_read_metadata)
|
else if (shaders_read_metadata)
|
||||||
sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
|
sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
|
||||||
} else {
|
} else if (sctx->gfx_level <= GFX8) {
|
||||||
/* GFX6-GFX8 */
|
/* GFX6-GFX8 */
|
||||||
sctx->flags |= SI_CONTEXT_INV_L2;
|
sctx->flags |= SI_CONTEXT_INV_L2;
|
||||||
}
|
}
|
||||||
|
|
@ -1979,13 +2012,17 @@ static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler
|
||||||
|
|
||||||
static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask)
|
static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask)
|
||||||
{
|
{
|
||||||
|
struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen;
|
||||||
|
|
||||||
|
/* Gfx12 should never call this. */
|
||||||
|
assert(sscreen->info.gfx_level < GFX12);
|
||||||
|
|
||||||
if (zs_mask == PIPE_MASK_S && (tex->htile_stencil_disabled || !tex->surface.has_stencil))
|
if (zs_mask == PIPE_MASK_S && (tex->htile_stencil_disabled || !tex->surface.has_stencil))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (!tex->is_depth || !tex->surface.meta_offset)
|
if (!tex->is_depth || !tex->surface.meta_offset)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen;
|
|
||||||
if (sscreen->info.gfx_level >= GFX8) {
|
if (sscreen->info.gfx_level >= GFX8) {
|
||||||
return level < tex->surface.num_meta_levels;
|
return level < tex->surface.num_meta_levels;
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -2000,6 +2037,11 @@ static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsi
|
||||||
static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level,
|
static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level,
|
||||||
unsigned zs_mask)
|
unsigned zs_mask)
|
||||||
{
|
{
|
||||||
|
struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen;
|
||||||
|
|
||||||
|
/* Gfx12 should never call this. */
|
||||||
|
assert(sscreen->info.gfx_level < GFX12);
|
||||||
|
|
||||||
assert(!tex->tc_compatible_htile || tex->surface.meta_offset);
|
assert(!tex->tc_compatible_htile || tex->surface.meta_offset);
|
||||||
return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask);
|
return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,13 @@
|
||||||
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
|
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
|
||||||
unsigned opcode, unsigned idx);
|
unsigned opcode, unsigned idx);
|
||||||
|
|
||||||
|
static bool opcode_is_pairs(unsigned opcode)
|
||||||
|
{
|
||||||
|
return opcode == PKT3_SET_CONTEXT_REG_PAIRS ||
|
||||||
|
opcode == PKT3_SET_SH_REG_PAIRS ||
|
||||||
|
opcode == PKT3_SET_UCONFIG_REG_PAIRS;
|
||||||
|
}
|
||||||
|
|
||||||
static bool opcode_is_pairs_packed(unsigned opcode)
|
static bool opcode_is_pairs_packed(unsigned opcode)
|
||||||
{
|
{
|
||||||
return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
|
return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
|
||||||
|
|
@ -39,9 +46,13 @@ static unsigned regular_opcode_to_pairs(struct si_pm4_state *state, unsigned opc
|
||||||
|
|
||||||
switch (opcode) {
|
switch (opcode) {
|
||||||
case PKT3_SET_CONTEXT_REG:
|
case PKT3_SET_CONTEXT_REG:
|
||||||
return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED : opcode;
|
return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED :
|
||||||
|
info->has_set_context_pairs ? PKT3_SET_CONTEXT_REG_PAIRS : opcode;
|
||||||
case PKT3_SET_SH_REG:
|
case PKT3_SET_SH_REG:
|
||||||
return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED : opcode;
|
return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED :
|
||||||
|
info->has_set_sh_pairs ? PKT3_SET_SH_REG_PAIRS : opcode;
|
||||||
|
case PKT3_SET_UCONFIG_REG:
|
||||||
|
return info->has_set_uconfig_pairs ? PKT3_SET_UCONFIG_REG_PAIRS : opcode;
|
||||||
}
|
}
|
||||||
|
|
||||||
return opcode;
|
return opcode;
|
||||||
|
|
@ -192,7 +203,8 @@ static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
|
||||||
count = state->ndw - state->last_pm4 - 2;
|
count = state->ndw - state->last_pm4 - 2;
|
||||||
/* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
|
/* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
|
||||||
bool reset_filter_cam = !state->is_compute_queue &&
|
bool reset_filter_cam = !state->is_compute_queue &&
|
||||||
opcode_is_pairs_packed(state->last_opcode);
|
(opcode_is_pairs(state->last_opcode) ||
|
||||||
|
opcode_is_pairs_packed(state->last_opcode));
|
||||||
|
|
||||||
state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
|
state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
|
||||||
PKT3_RESET_FILTER_CAM_S(reset_filter_cam);
|
PKT3_RESET_FILTER_CAM_S(reset_filter_cam);
|
||||||
|
|
@ -226,6 +238,13 @@ static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint
|
||||||
si_pm4_cmd_begin(state, opcode); /* reserve space for the header */
|
si_pm4_cmd_begin(state, opcode); /* reserve space for the header */
|
||||||
state->ndw++; /* reserve space for the register count, it will be set at the end */
|
state->ndw++; /* reserve space for the register count, it will be set at the end */
|
||||||
}
|
}
|
||||||
|
} else if (opcode_is_pairs(opcode)) {
|
||||||
|
assert(idx == 0);
|
||||||
|
|
||||||
|
if (opcode != state->last_opcode)
|
||||||
|
si_pm4_cmd_begin(state, opcode);
|
||||||
|
|
||||||
|
state->pm4[state->ndw++] = reg;
|
||||||
} else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
|
} else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
|
||||||
idx != state->last_idx) {
|
idx != state->last_idx) {
|
||||||
si_pm4_cmd_begin(state, opcode);
|
si_pm4_cmd_begin(state, opcode);
|
||||||
|
|
|
||||||
|
|
@ -57,6 +57,7 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
|
||||||
{
|
{
|
||||||
bool is_v5 = sctx->gfx_level >= GFX10;
|
bool is_v5 = sctx->gfx_level >= GFX10;
|
||||||
bool is_v5_2 = sctx->gfx_level >= GFX10_3;
|
bool is_v5_2 = sctx->gfx_level >= GFX10_3;
|
||||||
|
bool is_v7 = sctx->gfx_level >= GFX12;
|
||||||
unsigned bpp = sdst->surface.bpe;
|
unsigned bpp = sdst->surface.bpe;
|
||||||
uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
|
uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
|
||||||
uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
|
uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
|
||||||
|
|
@ -111,17 +112,27 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
|
||||||
uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
|
uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
|
||||||
uint64_t linear_address = linear == ssrc ? src_address : dst_address;
|
uint64_t linear_address = linear == ssrc ? src_address : dst_address;
|
||||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||||
/* Only SDMA 5 supports DCC with SDMA */
|
|
||||||
bool dcc = vi_dcc_enabled(tiled, 0) && is_v5;
|
|
||||||
assert(tiled->buffer.b.b.depth0 == 1);
|
assert(tiled->buffer.b.b.depth0 == 1);
|
||||||
|
bool dcc = false;
|
||||||
|
|
||||||
linear_address += linear->surface.u.gfx9.offset[0];
|
if (is_v7) {
|
||||||
|
/* Check if everything fits into the bitfields */
|
||||||
|
if (!(tiled_width <= (1 << 16) && tiled_height <= (1 << 16) &&
|
||||||
|
linear_pitch <= (1 << 16) && linear_slice_pitch <= (1ull << 32) &&
|
||||||
|
copy_width <= (1 << 16) && copy_height <= (1 << 16)))
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
/* Only SDMA 5 supports DCC with SDMA */
|
||||||
|
dcc = is_v5 && vi_dcc_enabled(tiled, 0);
|
||||||
|
|
||||||
/* Check if everything fits into the bitfields */
|
/* Check if everything fits into the bitfields */
|
||||||
if (!(tiled_width <= (1 << 14) && tiled_height <= (1 << 14) &&
|
if (!(tiled_width <= (1 << 14) && tiled_height <= (1 << 14) &&
|
||||||
linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) &&
|
linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) &&
|
||||||
copy_width <= (1 << 14) && copy_height <= (1 << 14)))
|
copy_width <= (1 << 14) && copy_height <= (1 << 14)))
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
linear_address += linear->surface.u.gfx9.offset[0];
|
||||||
|
|
||||||
radeon_begin(cs);
|
radeon_begin(cs);
|
||||||
radeon_emit(
|
radeon_emit(
|
||||||
|
|
@ -138,7 +149,7 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
|
||||||
radeon_emit((tiled_height - 1));
|
radeon_emit((tiled_height - 1));
|
||||||
radeon_emit(util_logbase2(bpp) |
|
radeon_emit(util_logbase2(bpp) |
|
||||||
tiled->surface.u.gfx9.swizzle_mode << 3 |
|
tiled->surface.u.gfx9.swizzle_mode << 3 |
|
||||||
tiled->surface.u.gfx9.resource_type << 9 |
|
(is_v7 ? 0 : tiled->surface.u.gfx9.resource_type << 9) |
|
||||||
(is_v5 ? tiled->buffer.b.b.last_level : tiled->surface.u.gfx9.epitch) << 16);
|
(is_v5 ? tiled->buffer.b.b.last_level : tiled->surface.u.gfx9.epitch) << 16);
|
||||||
radeon_emit((uint32_t)linear_address);
|
radeon_emit((uint32_t)linear_address);
|
||||||
radeon_emit((uint32_t)(linear_address >> 32));
|
radeon_emit((uint32_t)(linear_address >> 32));
|
||||||
|
|
@ -409,6 +420,7 @@ bool si_sdma_copy_image(struct si_context *sctx, struct si_texture *dst, struct
|
||||||
case GFX10_3:
|
case GFX10_3:
|
||||||
case GFX11:
|
case GFX11:
|
||||||
case GFX11_5:
|
case GFX11_5:
|
||||||
|
case GFX12:
|
||||||
if (!si_sdma_v4_v5_copy_texture(sctx, dst, src))
|
if (!si_sdma_v4_v5_copy_texture(sctx, dst, src))
|
||||||
return false;
|
return false;
|
||||||
break;
|
break;
|
||||||
|
|
|
||||||
|
|
@ -254,7 +254,10 @@ static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
|
||||||
static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
|
static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
|
||||||
{
|
{
|
||||||
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);
|
||||||
if (shader->key.ge.as_ls) {
|
|
||||||
|
if (shader->selector->screen->info.gfx_level >= GFX12) {
|
||||||
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
|
||||||
|
} else if (shader->key.ge.as_ls) {
|
||||||
if (shader->selector->screen->info.gfx_level >= GFX11) {
|
if (shader->selector->screen->info.gfx_level >= GFX11) {
|
||||||
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
|
||||||
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
|
||||||
|
|
@ -520,11 +523,17 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* VGPRs (first GS, then VS/TES) */
|
/* VGPRs (first GS, then VS/TES) */
|
||||||
|
if (sel->screen->info.gfx_level >= GFX12) {
|
||||||
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
|
||||||
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
|
||||||
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
|
||||||
|
} else {
|
||||||
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
|
||||||
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
|
||||||
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
|
||||||
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
|
||||||
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
|
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
|
||||||
|
}
|
||||||
|
|
||||||
if (stage == MESA_SHADER_VERTEX) {
|
if (stage == MESA_SHADER_VERTEX) {
|
||||||
declare_vs_input_vgprs(args, shader);
|
declare_vs_input_vgprs(args, shader);
|
||||||
|
|
@@ -543,7 +552,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
       /* ES return values are inputs to GS. */
       for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
          ac_add_return(&args->ac, AC_ARG_SGPR);
-      for (i = 0; i < 5; i++)
+      for (i = 0; i < (sel->screen->info.gfx_level >= GFX12 ? 3 : 5); i++)
          ac_add_return(&args->ac, AC_ARG_VGPR);
       }
       break;
@@ -706,6 +715,12 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
       /* Hardware SGPRs. */
       for (i = 0; i < 3; i++) {
          if (shader->selector->info.uses_block_id[i]) {
+            /* GFX12 loads workgroup IDs into ttmp registers, so they are not input SGPRs, but we
+             * still need to set this to indicate that they are enabled (for ac_nir_to_llvm).
+             */
+            if (sel->screen->info.gfx_level >= GFX12)
+               args->ac.workgroup_ids[i].used = true;
+            else
                ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
          }
       }
@@ -854,8 +869,9 @@ unsigned si_get_shader_prefetch_size(struct si_shader *shader)

    /* INST_PREF_SIZE uses 128B granularity.
     * - GFX11: max 128 * 63 = 8064
+    * - GFX12: max 128 * 255 = 32640
     */
-   unsigned max_pref_size = 63;
+   unsigned max_pref_size = shader->selector->screen->info.gfx_level >= GFX12 ? 255 : 63;
    unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);

    return MIN2(max_pref_size, exec_size_gran128);
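To make the INST_PREF_SIZE arithmetic above concrete, here is a minimal standalone sketch of the clamp; DIV_ROUND_UP and MIN2 are local stand-ins for Mesa's helpers, and a raw exec_size parameter replaces the struct si_shader the real function takes.

/* Standalone sketch of the INST_PREF_SIZE clamp above; the helper macros are
 * local stand-ins for Mesa's, so the example compiles by itself. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define MIN2(a, b)         ((a) < (b) ? (a) : (b))

static unsigned prefetch_size_gran128(unsigned exec_size, int is_gfx12)
{
   unsigned max_pref_size = is_gfx12 ? 255 : 63; /* field limits from the comment above */
   unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);

   return MIN2(max_pref_size, exec_size_gran128);
}

int main(void)
{
   /* A 40 KiB shader needs 320 granules of 128B: Gfx11 clamps that to 63
    * (8064 bytes prefetched), Gfx12 to 255 (32640 bytes). */
   printf("gfx11: %u, gfx12: %u\n",
          prefetch_size_gran128(40960, 0), prefetch_size_gran128(40960, 1));
   return 0;
}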
@@ -1547,6 +1563,7 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
       fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
       fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
       fprintf(f, " opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout);
+      fprintf(f, " mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout);
    }

    if (stage <= MESA_SHADER_GEOMETRY)
@@ -1865,6 +1882,7 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
       .kill_pointsize = key->ge.opt.kill_pointsize,
       .kill_layer = key->ge.opt.kill_layer,
       .force_vrs = sel->screen->options.vrs2x2,
+      .use_gfx12_xfb_intrinsic = true,
    };

    if (nir->info.stage == MESA_SHADER_VERTEX ||
@@ -706,6 +706,9 @@ struct si_shader_key_ge {
          unsigned vs_export_prim_id : 1;    /* VS and TES only */
          unsigned gs_tri_strip_adj_fix : 1; /* GS only */
       } u;

+      /* Gfx12: When no streamout buffers are bound, streamout must be disabled. */
+      unsigned remove_streamout : 1;
    } mono;

    /* Optimization flags for asynchronous compilation only. */
@@ -953,6 +956,8 @@ struct si_shader {
       unsigned cb_shader_mask;
       unsigned db_shader_control;
       unsigned num_interp;
+      unsigned spi_gs_out_config_ps;
+      unsigned pa_sc_hisz_control;
       bool writes_samplemask;
    } ps;
 };
@@ -1069,7 +1074,8 @@ static inline bool si_shader_uses_streamout(const struct si_shader *shader)
 {
    return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
           shader->selector->info.enabled_streamout_buffer_mask &&
-          !shader->key.ge.opt.remove_streamout;
+          !shader->key.ge.opt.remove_streamout &&
+          !shader->key.ge.mono.remove_streamout;
 }

 static inline bool si_shader_uses_discard(struct si_shader *shader)
@@ -668,10 +668,12 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
    info->uses_base_instance = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
    info->uses_invocationid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INVOCATION_ID);
    info->uses_grid_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_WORKGROUPS);
-   info->uses_tg_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_SUBGROUPS) ||
-                        BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
+   info->uses_tg_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_SUBGROUPS);
+   if (sscreen->info.gfx_level < GFX12) {
+      info->uses_tg_size |= BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
                         BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SUBGROUP_ID) ||
                         si_should_clear_lds(sscreen, nir);
+   }
    info->uses_variable_block_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_WORKGROUP_SIZE);
    info->uses_drawid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
    info->uses_primid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID) ||
@@ -173,8 +173,8 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy
                                     ctx->screen->info.address32_hi);
    }

-   if (ctx->stage <= MESA_SHADER_GEOMETRY && ctx->shader->key.ge.as_ngg &&
-       si_shader_uses_streamout(ctx->shader))
+   if (ctx->screen->info.gfx_level < GFX12 && ctx->stage <= MESA_SHADER_GEOMETRY &&
+       ctx->shader->key.ge.as_ngg && si_shader_uses_streamout(ctx->shader))
       ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "amdgpu-gds-size", 256);

    ac_llvm_set_workgroup_size(ctx->main_fn.value, max_workgroup_size);
@@ -58,11 +58,17 @@ void si_llvm_es_build_end(struct si_shader_context *ctx)

    unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;

+   if (ctx->screen->info.gfx_level >= GFX12) {
+      ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[0], vgpr++);
+      ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_prim_id, vgpr++);
+      ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[1], vgpr++);
+   } else {
       ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[0], vgpr++);
       ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[1], vgpr++);
       ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_prim_id, vgpr++);
       ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_invocation_id, vgpr++);
       ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[2], vgpr++);
+   }
    ctx->return_value = ret;
 }
@@ -337,12 +337,21 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
       /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
        * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
        * incorrect with a non-linear thread order.
+       *
+       * On Gfx12, we always use a non-linear thread order if the workgroup X and Y size is
+       * divisible by 2.
        */
       options.lower_local_invocation_index =
-         nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS;
+         nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS ||
+         (sscreen->info.gfx_level >= GFX12 &&
+          nir->info.cs.derivative_group == DERIVATIVE_GROUP_NONE &&
+          (nir->info.workgroup_size_variable ||
+           (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0)));
       NIR_PASS_V(nir, nir_lower_compute_system_values, &options);

-      if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
+      /* Gfx12 supports this in hw. */
+      if (sscreen->info.gfx_level < GFX12 &&
+          nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_opt_cse(nir); /* CSE load_local_invocation_id */
         memset(&options, 0, sizeof(options));
         options.shuffle_local_ids_for_quad_derivatives = true;
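The condition added above can be read as a standalone predicate. The sketch below restates it with local stand-ins for the NIR enum and workgroup fields; it illustrates the logic, not the driver's code.

/* Standalone restatement of the lower_local_invocation_index condition above.
 * The derivative-group values and workgroup fields are local stand-ins for
 * the NIR equivalents. */
#include <stdbool.h>
#include <stdio.h>

enum derivative_group { GROUP_NONE, GROUP_QUADS, GROUP_LINEAR };

static bool must_lower_local_invocation_index(enum derivative_group dg, bool gfx12,
                                              bool size_variable, unsigned x, unsigned y)
{
   /* Quad derivatives always need the ID-derived index; on Gfx12 any workgroup
    * with even X and Y sizes (or a variable size) may get a non-linear thread order. */
   return dg == GROUP_QUADS ||
          (gfx12 && dg == GROUP_NONE && (size_variable || (x % 2 == 0 && y % 2 == 0)));
}

int main(void)
{
   printf("%d\n", must_lower_local_invocation_index(GROUP_NONE, true, false, 8, 8)); /* 1 */
   printf("%d\n", must_lower_local_invocation_index(GROUP_NONE, true, false, 7, 8)); /* 0 */
   return 0;
}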
(File diff suppressed because it is too large.)
@@ -128,6 +128,8 @@ struct si_state_dsa {
    unsigned db_depth_bounds_min;
    unsigned db_depth_bounds_max;
    unsigned spi_shader_user_data_ps_alpha_ref;
+   unsigned db_stencil_read_mask;
+   unsigned db_stencil_write_mask;

    /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */
    struct si_dsa_order_invariance order_invariance[2];
@@ -262,8 +264,8 @@ struct si_shader_data {
 enum si_tracked_reg
 {
    /* CONTEXT registers. */
-   /* 2 consecutive registers */
-   SI_TRACKED_DB_RENDER_CONTROL,
+   /* 2 consecutive registers (GFX6-11), or separate registers (GFX12) */
+   SI_TRACKED_DB_RENDER_CONTROL, /* GFX6-11 (not tracked on GFX12) */
    SI_TRACKED_DB_COUNT_CONTROL,

    SI_TRACKED_DB_DEPTH_CONTROL,
@@ -292,8 +294,9 @@ enum si_tracked_reg
    SI_TRACKED_PA_SC_LINE_CNTL,
    SI_TRACKED_PA_SC_AA_CONFIG,

-   /* 5 consecutive registers */
+   /* 5 consecutive registers (GFX6-11) */
    SI_TRACKED_PA_SU_VTX_CNTL,
+   /* 4 consecutive registers (GFX12) */
    SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
    SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ,
    SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ,
@@ -302,17 +305,17 @@ enum si_tracked_reg
    /* Non-consecutive register */
    SI_TRACKED_SPI_SHADER_POS_FORMAT,

-   /* 2 consecutive registers */
+   /* 5 consecutive registers (GFX12), or 2 consecutive registers (GFX6-11) */
    SI_TRACKED_SPI_SHADER_Z_FORMAT,
    SI_TRACKED_SPI_SHADER_COL_FORMAT,
+   /* Continuing consecutive registers (GFX12), or separate register (GFX6-11) */
    SI_TRACKED_SPI_BARYC_CNTL,
-   /* 2 consecutive registers */
+   /* Continuing consecutive registers (GFX12), or 2 consecutive registers (GFX6-11) */
    SI_TRACKED_SPI_PS_INPUT_ENA,
    SI_TRACKED_SPI_PS_INPUT_ADDR,

    SI_TRACKED_DB_EQAA,
+   SI_TRACKED_DB_RENDER_OVERRIDE2,
    SI_TRACKED_DB_SHADER_CONTROL,
    SI_TRACKED_CB_SHADER_MASK,
    SI_TRACKED_CB_TARGET_MASK,
@@ -365,19 +368,24 @@ enum si_tracked_reg
    SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2, /* GFX6-10 (GFX11+ can reuse this slot) */
    SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3, /* GFX6-10 (GFX11+ can reuse this slot) */

-   SI_TRACKED_DB_RENDER_OVERRIDE2,   /* GFX6-xx (TBD) */
-   SI_TRACKED_SPI_VS_OUT_CONFIG,     /* GFX6-xx (TBD) */
-   SI_TRACKED_VGT_PRIMITIVEID_EN,    /* GFX6-xx (TBD) */
-   SI_TRACKED_CB_DCC_CONTROL,        /* GFX8-xx (TBD) */
+   SI_TRACKED_SPI_VS_OUT_CONFIG,        /* GFX6-11 */
+   SI_TRACKED_VGT_PRIMITIVEID_EN,       /* GFX6-11 */
+   SI_TRACKED_CB_DCC_CONTROL,           /* GFX8-11 */
+   SI_TRACKED_DB_STENCIL_READ_MASK,     /* GFX12+ */
+   SI_TRACKED_DB_STENCIL_WRITE_MASK,    /* GFX12+ */
+   SI_TRACKED_PA_SC_HISZ_CONTROL,       /* GFX12+ */
+   SI_TRACKED_PA_SC_LINE_STIPPLE_RESET, /* GFX12+ */

    SI_NUM_TRACKED_CONTEXT_REGS,
    SI_FIRST_TRACKED_OTHER_REG = SI_NUM_TRACKED_CONTEXT_REGS,

    /* SH and UCONFIG registers. */
-   SI_TRACKED_GE_PC_ALLOC = SI_FIRST_TRACKED_OTHER_REG, /* GFX10+ */
-   SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,                  /* GFX7+ */
+   SI_TRACKED_GE_PC_ALLOC = SI_FIRST_TRACKED_OTHER_REG, /* GFX10-11 */
+   SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,                  /* GFX7-11 */
    SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,                  /* GFX10+ */
    SI_TRACKED_VGT_GS_OUT_PRIM_TYPE_UCONFIG,             /* GFX11+ */
+   SI_TRACKED_SPI_SHADER_GS_OUT_CONFIG_PS,              /* GFX12+ */
+   SI_TRACKED_VGT_PRIMITIVEID_EN_UCONFIG,               /* GFX12+ */

    SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG,                      /* GFX9 only */
    SI_TRACKED_GE_CNTL = SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG, /* GFX10+ */
@@ -404,6 +412,7 @@ enum si_tracked_reg
    SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF,

    SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
+   SI_TRACKED_COMPUTE_DISPATCH_INTERLEAVE, /* GFX12+ (not tracked on previous chips) */
    SI_TRACKED_COMPUTE_NUM_THREAD_X,
    SI_TRACKED_COMPUTE_NUM_THREAD_Y,
    SI_TRACKED_COMPUTE_NUM_THREAD_Z,
@@ -464,6 +473,7 @@ enum

    /* Aliases to reuse slots that are unused on other generations. */
    SI_GS_QUERY_BUF = SI_RING_ESGS,        /* gfx10+ */
+   SI_STREAMOUT_STATE_BUF = SI_RING_GSVS, /* gfx12+ */
 };

 /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines
@@ -691,6 +701,7 @@ void si_init_draw_functions_GFX10(struct si_context *sctx);
 void si_init_draw_functions_GFX10_3(struct si_context *sctx);
 void si_init_draw_functions_GFX11(struct si_context *sctx);
 void si_init_draw_functions_GFX11_5(struct si_context *sctx);
+void si_init_draw_functions_GFX12(struct si_context *sctx);

 /* si_state_msaa.c */
 extern unsigned si_msaa_max_distance[5];
@@ -302,7 +302,7 @@ static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enab
       ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes));

    const unsigned minBinSizeX = 128;
-   const unsigned minBinSizeY = 64;
+   const unsigned minBinSizeY = sctx->gfx_level >= GFX12 ? 128 : 64;

    const unsigned num_fragments = sctx->framebuffer.nr_color_samples;
    const unsigned num_samples = sctx->framebuffer.nr_samples;
@@ -390,7 +390,19 @@ static void si_emit_dpbb_disable(struct si_context *sctx)

    radeon_begin(&sctx->gfx_cs);

-   if (sctx->gfx_level >= GFX10) {
+   if (sctx->gfx_level >= GFX12) {
+      struct uvec2 bin_size = {128, 128};
+
+      radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
+                                 SI_TRACKED_PA_SC_BINNER_CNTL_0,
+                                 S_028C44_BINNING_MODE(V_028C44_BINNING_DISABLED) |
+                                 S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(bin_size.x) - 5) |
+                                 S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(bin_size.y) - 5) |
+                                 S_028C44_DISABLE_START_OF_PRIM(1) |
+                                 S_028C44_FPOVS_PER_BATCH(63) |
+                                 S_028C44_OPTIMAL_BIN_SELECTION(1) |
+                                 S_028C44_FLUSH_ON_BINNING_TRANSITION(1));
+   } else if (sctx->gfx_level >= GFX10) {
       struct uvec2 bin_size = {};
       struct uvec2 bin_size_extend = {};
       unsigned binning_disabled =
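The BIN_SIZE_X_EXTEND/BIN_SIZE_Y_EXTEND fields store the bin dimension as log2(size) - 5, so the 128x128 bins above encode as 2. A self-contained check of that encoding, with util_logbase2 reimplemented locally:

/* Minimal check of the BIN_SIZE_*_EXTEND encoding used above: the field
 * stores log2(size) - 5, so a 128x128 bin encodes as 2. util_logbase2 is
 * reimplemented locally so the example stands alone. */
#include <assert.h>

static unsigned util_logbase2(unsigned n)
{
   unsigned log2 = 0;
   while (n >>= 1)
      log2++;
   return log2;
}

int main(void)
{
   assert(util_logbase2(128) - 5 == 2); /* 128-pixel bins */
   assert(util_logbase2(64) - 5 == 1);  /* 64-pixel bins (the pre-Gfx12 minimum Y) */
   assert(util_logbase2(32) - 5 == 0);  /* 32 is the smallest encodable size */
   return 0;
}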
@@ -435,6 +447,7 @@ void si_emit_dpbb_state(struct si_context *sctx, unsigned index)
    struct si_state_dsa *dsa = sctx->queued.named.dsa;
    unsigned db_shader_control = sctx->ps_db_shader_control;
    unsigned optimal_bin_selection = !sctx->queued.named.rasterizer->bottom_edge_rule;
+   unsigned pa_sc_hisz_control = sctx->ps_pa_sc_hisz_control;

    assert(sctx->gfx_level >= GFX9);
@@ -449,8 +462,10 @@ void si_emit_dpbb_state(struct si_context *sctx, unsigned index)
       G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || blend->alpha_to_coverage;

    bool db_can_reject_z_trivially = !G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
-                                    G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) ||
-                                    G_02880C_DEPTH_BEFORE_SHADER(db_shader_control);
+                                    G_02880C_DEPTH_BEFORE_SHADER(db_shader_control) ||
+                                    (sctx->gfx_level >= GFX12 ?
+                                        G_028BBC_CONSERVATIVE_Z_EXPORT(pa_sc_hisz_control) :
+                                        G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control));

    /* Disable DPBB when it's believed to be inefficient. */
    if (sscreen->info.max_render_backends > 4 && ps_can_kill && db_can_reject_z_trivially &&
@@ -33,6 +33,8 @@
 #define GFX(name) name##GFX11
 #elif (GFX_VER == 115)
 #define GFX(name) name##GFX11_5
+#elif (GFX_VER == 12)
+#define GFX(name) name##GFX12
 #else
 #error "Unknown gfx level"
 #endif
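The #elif chain above token-pastes a per-generation suffix onto every symbol compiled from this file, which is how the build produces one static library per gfx level from a single source. A self-contained sketch of the same pattern (GFX_VER is hardcoded here; the real build sets it per library):

/* Self-contained sketch of the GFX(name) token-pasting pattern above: one
 * source file is compiled once per generation with a different GFX_VER. */
#include <stdio.h>

#define GFX_VER 12 /* normally defined by the build system for each library */

#if (GFX_VER == 11)
#define GFX(name) name##GFX11
#elif (GFX_VER == 12)
#define GFX(name) name##GFX12
#else
#error "Unknown gfx level"
#endif

static void GFX(si_init_draw_functions_)(void) /* expands to si_init_draw_functions_GFX12 */
{
   printf("initialized draw functions for gfx%d\n", GFX_VER);
}

int main(void)
{
   si_init_draw_functions_GFX12();
   return 0;
}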
@@ -159,12 +161,17 @@ static bool si_update_shaders(struct si_context *sctx)
    uint32_t vgt_stages = 0;

    if (HAS_TESS) {
+      if (GFX_VERSION >= GFX12) {
+         vgt_stages |= S_028A98_HS_EN(1) |
+                       S_028A98_HS_W32_EN(sctx->queued.named.hs->wave_size == 32);
+      } else {
       vgt_stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
                     S_028B54_HS_EN(1) |
                     S_028B54_DYNAMIC_HS(1) |
                     S_028B54_HS_W32_EN(GFX_VERSION >= GFX10 &&
                                        sctx->queued.named.hs->wave_size == 32);
       }
+   }

    if (NGG) {
       vgt_stages |= si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ngg.vgt_shader_stages_en;
@@ -206,6 +213,9 @@ static bool si_update_shaders(struct si_context *sctx)
       } else {
          ge_cntl = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ge_cntl;
       }

+      if (GFX_VERSION >= GFX12)
+         ge_cntl |= S_03096C_DIS_PG_SIZE_ADJUST_FOR_STRIP(1);
    } else {
       unsigned primgroup_size;
       unsigned vertgroup_size;
@@ -265,6 +275,13 @@ static bool si_update_shaders(struct si_context *sctx)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
    }

+   unsigned pa_sc_hisz_control = sctx->shader.ps.current->ps.pa_sc_hisz_control;
+   if (GFX_VERSION >= GFX12 && sctx->screen->dpbb_allowed &&
+       sctx->ps_pa_sc_hisz_control != pa_sc_hisz_control) {
+      sctx->ps_pa_sc_hisz_control = pa_sc_hisz_control;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+   }
+
    if (si_pm4_state_changed(sctx, ps) ||
        (!NGG && si_pm4_state_changed(sctx, vs)) ||
        (NGG && si_pm4_state_changed(sctx, gs))) {
@@ -560,6 +577,9 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
    case GFX11_5:
       si_cp_dma_prefetch_inline<GFX11_5>(sctx, address, size);
       break;
+   case GFX12:
+      si_cp_dma_prefetch_inline<GFX12>(sctx, address, size);
+      break;
    default:
       break;
    }
@@ -910,6 +930,11 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
       bool reset_per_prim = rast_prim == MESA_PRIM_LINES ||
                             rast_prim == MESA_PRIM_LINES_ADJACENCY;
       /* 0 = no reset, 1 = reset per prim, 2 = reset per packet */
+      if (GFX_VERSION >= GFX12) {
+         radeon_opt_set_context_reg(sctx, R_028A44_PA_SC_LINE_STIPPLE_RESET,
+                                    SI_TRACKED_PA_SC_LINE_STIPPLE_RESET,
+                                    S_028A44_AUTO_RESET_CNTL(reset_per_prim ? 1 : 2));
+      } else {
         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;

         radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE,
@@ -917,6 +942,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
                                    rs->pa_sc_line_stipple |
                                    S_028A0C_AUTO_RESET_CNTL(reset_per_prim ? 1 : 2));
       }
+   }

    if (NGG || HAS_GS) {
       if (GFX_VERSION >= GFX11) {
@@ -1077,6 +1103,9 @@ static void si_emit_draw_registers(struct si_context *sctx,
    if (prim != sctx->last_prim) {
       unsigned vgt_prim = HAS_TESS ? V_008958_DI_PT_PATCH : si_conv_pipe_prim(prim);

+      if (GFX_VERSION >= GFX12 && HAS_TESS)
+         vgt_prim |= S_030908_NUM_INPUT_CP(sctx->patch_vertices);
+
       if (GFX_VERSION >= GFX10)
          radeon_set_uconfig_reg(R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
       else if (GFX_VERSION >= GFX7)
@@ -1170,13 +1199,29 @@ gfx11_emit_buffered_sh_regs_inline(struct si_context *sctx, unsigned *num_regs,
    radeon_end();
 }

+#define gfx12_emit_buffered_sh_regs_inline(num_regs, regs) do { \
+   unsigned __reg_count = *(num_regs); \
+   if (__reg_count) { \
+      radeon_emit(PKT3(PKT3_SET_SH_REG_PAIRS, __reg_count * 2 - 1, 0) | PKT3_RESET_FILTER_CAM_S(1)); \
+      radeon_emit_array(regs, __reg_count * 2); \
+      *(num_regs) = 0; \
+   } \
+} while (0)
+
 #if GFX_VER == 6 /* declare this function only once because there is only one variant. */

 void si_emit_buffered_compute_sh_regs(struct si_context *sctx)
 {
+   if (sctx->gfx_level >= GFX12) {
+      radeon_begin(&sctx->gfx_cs);
+      gfx12_emit_buffered_sh_regs_inline(&sctx->num_buffered_compute_sh_regs,
+                                         sctx->gfx12.buffered_compute_sh_regs);
+      radeon_end();
+   } else {
      gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_compute_sh_regs,
                                         sctx->gfx11.buffered_compute_sh_regs);
   }
+}

 #endif
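The new macro flushes buffered SH registers as a single SET_SH_REG_PAIRS packet; the __reg_count * 2 sizing implies the buffer stores flat (offset, value) pairs. A standalone sketch of that buffering side, under that assumption (the buffer size and register offsets below are arbitrary illustrative values):

/* Standalone sketch of the register-pair buffering that the macro above
 * flushes: pairs are assumed to be stored flat as (offset, value), which is
 * why the flush emits __reg_count * 2 dwords. MAX_SH_REG_PAIRS is a local
 * illustrative choice, not the driver's limit. */
#include <stdint.h>
#include <stdio.h>

#define MAX_SH_REG_PAIRS 32

struct sh_reg_buffer {
   unsigned num_regs;
   uint32_t regs[MAX_SH_REG_PAIRS * 2];
};

static void push_sh_reg(struct sh_reg_buffer *buf, uint32_t offset, uint32_t value)
{
   buf->regs[buf->num_regs * 2 + 0] = offset; /* register offset */
   buf->regs[buf->num_regs * 2 + 1] = value;  /* value to write */
   buf->num_regs++;
}

int main(void)
{
   struct sh_reg_buffer buf = {0};
   push_sh_reg(&buf, 0x2c0c, 0x1234); /* hypothetical SH register offsets */
   push_sh_reg(&buf, 0x2c10, 0x5678);
   /* A flush would emit one packet header plus num_regs * 2 payload dwords. */
   printf("flush would emit %u payload dwords\n", buf.num_regs * 2);
   return 0;
}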
@@ -1218,10 +1263,11 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw

       if (GFX_VERSION >= GFX9) {
          /* Use PKT3_LOAD_CONTEXT_REG_INDEX instead of si_cp_copy_data to support state shadowing. */
-         uint64_t va = t->buf_filled_size->gpu_address + t->buf_filled_size_offset;
+         uint64_t va = t->buf_filled_size->gpu_address + t->buf_filled_size_draw_count_offset;

          radeon_begin(cs);

+         // TODO: GFX12: This may be discarded by PFP if the shadow base address is provided by the MQD.
          radeon_emit(PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
          radeon_emit(va);
          radeon_emit(va >> 32);
@@ -1232,7 +1278,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
       } else {
          si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL,
                          R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM,
-                         t->buf_filled_size, t->buf_filled_size_offset);
+                         t->buf_filled_size, t->buf_filled_size_draw_count_offset);
       }
       use_opaque = S_0287F0_USE_OPAQUE(1);
       indirect = NULL;
@@ -1340,7 +1386,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw

       assert(indirect_va % 8 == 0);

-      if (HAS_SH_PAIRS_PACKED) {
+      if (GFX_VERSION >= GFX12) {
+         gfx12_emit_buffered_sh_regs_inline(&sctx->num_buffered_gfx_sh_regs,
+                                            sctx->gfx12.buffered_gfx_sh_regs);
+      } else if (HAS_SH_PAIRS_PACKED) {
         radeon_end();
         gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_gfx_sh_regs,
                                            sctx->gfx11.buffered_gfx_sh_regs);
@@ -1424,7 +1473,18 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw

    if (!is_blit) {
       /* Prefer SET_SH_REG_PAIRS_PACKED* on Gfx11+. */
-      if (HAS_SH_PAIRS_PACKED) {
+      if (GFX_VERSION >= GFX12) {
+         gfx12_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
+                                   tracked_base_vertex_reg, base_vertex);
+         if (set_draw_id) {
+            gfx12_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4,
+                                      tracked_base_vertex_reg + 1, drawid_base);
+         }
+         if (set_base_instance) {
+            gfx12_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_START_INSTANCE * 4,
+                                      tracked_base_vertex_reg + 2, info->start_instance);
+         }
+      } else if (HAS_SH_PAIRS_PACKED) {
         gfx11_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
                                   tracked_base_vertex_reg, base_vertex);
         if (set_draw_id) {
@@ -1450,14 +1510,17 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
       }
    }

-   if (HAS_SH_PAIRS_PACKED) {
+   if (GFX_VERSION >= GFX12) {
+      gfx12_emit_buffered_sh_regs_inline(&sctx->num_buffered_gfx_sh_regs,
+                                         sctx->gfx12.buffered_gfx_sh_regs);
+   } else if (HAS_SH_PAIRS_PACKED) {
      radeon_end();
      gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_gfx_sh_regs,
                                         sctx->gfx11.buffered_gfx_sh_regs);
      radeon_begin_again(cs);
   }

-   /* Blit SGPRs must be set after gfx11_emit_buffered_sh_regs_inline because they can
+   /* Blit SGPRs must be set after gfx1X_emit_buffered_sh_regs_inline because they can
     * overwrite them.
     */
    if (is_blit) {
@@ -1575,12 +1638,26 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
             radeon_emit(va >> 32);
             radeon_emit(draws[i].count);
             radeon_emit(V_0287F0_DI_SRC_SEL_DMA |
-                        S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && i < num_draws - 1));
+                        S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && GFX_VERSION < GFX12 &&
+                                         i < num_draws - 1));
          }
       }
    }
    } else {
-      if (increment_draw_id) {
+      if (GFX_VERSION == GFX12 && !IS_DRAW_VERTEX_STATE &&
+          indirect && indirect->count_from_stream_output) {
+         /* DrawTransformFeedback requires 3 SQ_NON_EVENTs after the packet. */
+         assert(num_draws == 1);
+
+         radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
+         radeon_emit(0);
+         radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
+
+         for (unsigned i = 0; i < 3; i++) {
+            radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+            radeon_emit(EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
+         }
+      } else if (increment_draw_id) {
         for (unsigned i = 0; i < num_draws; i++) {
            if (i > 0) {
               unsigned draw_id = drawid_base + i;
@@ -1680,6 +1757,9 @@ void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex
    case GFX11_5:
       si_set_vb_descriptor<GFX11_5>(velems, vb, element_index, out);
       break;
+   case GFX12:
+      si_set_vb_descriptor<GFX12>(velems, vb, element_index, out);
+      break;
    default:
       unreachable("unhandled gfx level");
    }
@@ -1980,7 +2060,7 @@ static void si_draw(struct pipe_context *ctx,

       if (GFX_VERSION < GFX11)
          gfx6_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));
-      else
+      else if (GFX_VERSION < GFX12)
          gfx11_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));

    si_need_gfx_cs_space(sctx, num_draws);
@@ -2067,8 +2147,9 @@ static void si_draw(struct pipe_context *ctx,

          /* info->start will be added by the drawing code */
          index_offset -= start_offset;
-      } else if (GFX_VERSION <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {
-         /* GFX8 reads index buffers through TC L2, so it doesn't
+      } else if ((GFX_VERSION <= GFX7 || GFX_VERSION == GFX12) &&
+                 si_resource(indexbuf)->TC_L2_dirty) {
+         /* GFX8-GFX11 reads index buffers through TC L2, so it doesn't
           * need this. */
          sctx->flags |= SI_CONTEXT_WB_L2;
          si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
@@ -2080,8 +2161,8 @@ static void si_draw(struct pipe_context *ctx,
    unsigned total_direct_count = 0;

    if (!IS_DRAW_VERTEX_STATE && indirect) {
-      /* Indirect buffers use TC L2 on GFX9, but not older hw. */
-      if (GFX_VERSION <= GFX8) {
+      /* Indirect buffers use TC L2 on GFX9-GFX11, but not other hw. */
+      if (GFX_VERSION <= GFX8 || GFX_VERSION == GFX12) {
         if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
            sctx->flags |= SI_CONTEXT_WB_L2;
            si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
@@ -2287,12 +2368,13 @@ static void si_draw(struct pipe_context *ctx,
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }

-   if (unlikely(sctx->decompression_enabled)) {
+   if (unlikely(GFX_VERSION < GFX12 && sctx->decompression_enabled)) {
       sctx->num_decompress_calls++;
    } else {
       sctx->num_draw_calls += num_draws;
    }

+   /* On Gfx12, this is only used to detect whether a depth texture is in the cleared state. */
    if (sctx->framebuffer.state.zsbuf) {
       struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
       zstex->depth_cleared_level_mask &= ~BITFIELD_BIT(sctx->framebuffer.state.zsbuf->u.tex.level);
@@ -2350,7 +2432,7 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
    struct pipe_context *pipe = util_blitter_get_pipe(blitter);
    struct si_context *sctx = (struct si_context *)pipe;
    uint32_t attribute_ring_address_lo =
-      sctx->gfx_level >= GFX11 ? sctx->screen->attribute_ring->gpu_address : 0;
+      sctx->gfx_level >= GFX11 ? sctx->screen->attribute_pos_prim_ring->gpu_address : 0;

    /* Pack position coordinates as signed int16. */
    sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16);
@@ -2387,7 +2469,8 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
    pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
 }

-template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
+template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+          util_popcnt POPCNT>
 static void si_init_draw_vbo(struct si_context *sctx)
 {
    if (NGG && GFX_VERSION < GFX10)
@@ -2396,42 +2479,43 @@ static void si_init_draw_vbo(struct si_context *sctx)
    if (!NGG && GFX_VERSION >= GFX11)
       return;

-   if (GFX_VERSION >= GFX11 && sctx->screen->info.has_set_sh_pairs_packed) {
+   if (GFX_VERSION >= GFX11 && GFX_VERSION < GFX12 && sctx->screen->info.has_set_sh_pairs_packed) {
      sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
         si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON>;

-      if (util_get_cpu_caps()->has_popcnt) {
-         sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
-            si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON, POPCNT_YES>;
-      } else {
-         sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
-            si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON, POPCNT_NO>;
-      }
+      sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
+         si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON, POPCNT>;
   } else {
      sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
         si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF>;

-      if (util_get_cpu_caps()->has_popcnt) {
-         sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
-            si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF, POPCNT_YES>;
-      } else {
-         sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
-            si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF, POPCNT_NO>;
-      }
+      sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
+         si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF, POPCNT>;
   }
 }

 template <amd_gfx_level GFX_VERSION>
 static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
 {
-   si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON>(sctx);
+   if (util_get_cpu_caps()->has_popcnt) {
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON, POPCNT_YES>(sctx);
+   } else {
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON, POPCNT_NO>(sctx);
+   }
 }

 static void si_invalid_draw_vbo(struct pipe_context *pipe,
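The refactor above hoists the runtime util_get_cpu_caps()->has_popcnt check out of si_init_draw_vbo and into a template parameter, so the CPU probe happens once at initialization and each draw path is compiled with POPCNT as a constant. A reduced C++ sketch of the pattern, with a stand-in CPU probe:

// Reduced sketch of the POPCNT refactor above: a one-time runtime check
// selects between template instantiations, instead of branching inside
// each initializer. cpu_has_popcnt() is a local stand-in for the real probe.
#include <cstdio>

enum util_popcnt { POPCNT_NO, POPCNT_YES };

static bool cpu_has_popcnt() { return true; /* stand-in for util_get_cpu_caps() */ }

template <util_popcnt POPCNT>
static void init_draw_vbo()
{
   // Each instantiation is compiled with POPCNT as a compile-time constant.
   std::printf("draw vbo initialized, popcnt=%d\n", POPCNT == POPCNT_YES);
}

static void init_all_variants()
{
   if (cpu_has_popcnt())
      init_draw_vbo<POPCNT_YES>();
   else
      init_draw_vbo<POPCNT_NO>();
}

int main()
{
   init_all_variants();
   return 0;
}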
@@ -143,9 +143,22 @@ static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_cou
 }

 static void si_emit_max_4_sample_locs(struct si_context *sctx, uint64_t centroid_priority,
-                                      uint32_t sample_locs)
+                                      uint32_t sample_locs, uint32_t max_sample_dist)
 {
-   if (sctx->screen->info.has_set_context_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      radeon_begin(&sctx->gfx_cs);
+      gfx12_begin_context_regs();
+      gfx12_set_context_reg(R_028BF0_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
+      gfx12_set_context_reg(R_028BF4_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32);
+      gfx12_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
+      gfx12_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
+      gfx12_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
+      gfx12_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
+      gfx12_set_context_reg(R_028C5C_PA_SC_SAMPLE_PROPERTIES,
+                            S_028C5C_MAX_SAMPLE_DIST(max_sample_dist));
+      gfx12_end_context_regs();
+      radeon_end();
+   } else if (sctx->screen->info.has_set_context_pairs_packed) {
      radeon_begin(&sctx->gfx_cs);
      gfx11_begin_packed_context_regs();
      gfx11_set_context_reg(R_028BD4_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
@@ -170,12 +183,24 @@ static void si_emit_max_4_sample_locs(struct si_context *sctx, uint64_t centroid
 }

 static void si_emit_max_16_sample_locs(struct si_context *sctx, uint64_t centroid_priority,
-                                       const uint32_t *sample_locs, unsigned num_samples)
+                                       const uint32_t *sample_locs, unsigned num_samples,
+                                       uint32_t max_sample_dist)
 {
    radeon_begin(&sctx->gfx_cs);

+   if (sctx->gfx_level >= GFX12) {
+      gfx12_begin_context_regs();
+      gfx12_set_context_reg(R_028BF0_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
+      gfx12_set_context_reg(R_028BF4_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32);
+      gfx12_set_context_reg(R_028C5C_PA_SC_SAMPLE_PROPERTIES,
+                            S_028C5C_MAX_SAMPLE_DIST(max_sample_dist));
+      gfx12_end_context_regs();
+   } else {
      radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
      radeon_emit(centroid_priority);
      radeon_emit(centroid_priority >> 32);
+   }

    radeon_set_context_reg_seq(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                               num_samples == 8 ? 14 : 16);
    radeon_emit_array(sample_locs, 4);
@@ -204,24 +229,27 @@ static void si_emit_sample_locations(struct si_context *sctx, unsigned index)
     * to non-MSAA.
     */
    if (nr_samples != sctx->sample_locs_num_samples) {
+      unsigned max_sample_dist = si_msaa_max_distance[util_logbase2(nr_samples)];
+
       switch (nr_samples) {
       default:
       case 1:
-         si_emit_max_4_sample_locs(sctx, centroid_priority_1x, sample_locs_1x);
+         si_emit_max_4_sample_locs(sctx, centroid_priority_1x, sample_locs_1x, max_sample_dist);
         break;
       case 2:
-         si_emit_max_4_sample_locs(sctx, centroid_priority_2x, sample_locs_2x);
+         si_emit_max_4_sample_locs(sctx, centroid_priority_2x, sample_locs_2x, max_sample_dist);
         break;
       case 4:
-         si_emit_max_4_sample_locs(sctx, centroid_priority_4x, sample_locs_4x);
+         si_emit_max_4_sample_locs(sctx, centroid_priority_4x, sample_locs_4x, max_sample_dist);
         break;
       case 8:
-         si_emit_max_16_sample_locs(sctx, centroid_priority_8x, sample_locs_8x, 8);
+         si_emit_max_16_sample_locs(sctx, centroid_priority_8x, sample_locs_8x, 8, max_sample_dist);
         break;
       case 16:
-         si_emit_max_16_sample_locs(sctx, centroid_priority_16x, sample_locs_16x, 16);
+         si_emit_max_16_sample_locs(sctx, centroid_priority_16x, sample_locs_16x, 16, max_sample_dist);
         break;
      }

      sctx->sample_locs_num_samples = nr_samples;
   }
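max_sample_dist comes from the si_msaa_max_distance table (declared earlier as extern unsigned si_msaa_max_distance[5]) indexed by log2 of the sample count, so indices 0-4 cover 1-16 samples. A standalone sketch of the lookup; the distance values below are placeholders, not the real table contents:

/* Standalone sketch of the max_sample_dist lookup above. The table values
 * here are placeholders for illustration only; the driver's real table
 * holds the hardware's maximum sample distances per sample count. */
#include <stdio.h>

static unsigned util_logbase2(unsigned n)
{
   unsigned log2 = 0;
   while (n >>= 1)
      log2++;
   return log2;
}

/* Index 0..4 covers 1, 2, 4, 8, 16 samples. */
static const unsigned msaa_max_distance[5] = {0, 4, 6, 7, 8}; /* placeholder values */

int main(void)
{
   for (unsigned nr_samples = 1; nr_samples <= 16; nr_samples *= 2) {
      printf("%2u samples -> max dist %u\n",
             nr_samples, msaa_max_distance[util_logbase2(nr_samples)]);
   }
   return 0;
}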
||||||
|
|
@ -493,7 +493,9 @@ unsigned si_shader_encode_sgprs(struct si_shader *shader)
|
||||||
|
|
||||||
bool si_shader_mem_ordered(struct si_shader *shader)
|
bool si_shader_mem_ordered(struct si_shader *shader)
|
||||||
{
|
{
|
||||||
if (shader->selector->screen->info.gfx_level < GFX10)
|
struct si_screen *sscreen = shader->selector->screen;
|
||||||
|
|
||||||
|
if (sscreen->info.gfx_level < GFX10 || sscreen->info.gfx_level >= GFX12)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
/* Return true if both types of VMEM that return something are used. */
|
/* Return true if both types of VMEM that return something are used. */
|
||||||
|
|
@@ -563,6 +565,9 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad
    shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
                           S_028B6C_TOPOLOGY(topology) |
                           S_028B6C_DISTRIBUTION_MODE(distribution_mode);
+
+   if (sscreen->info.gfx_level >= GFX12)
+      shader->vgt_tf_param |= S_028AA4_TEMPORAL(gfx12_load_last_use_discard);
 }

 /* Polaris needs different VTX_REUSE_DEPTH settings depending on
@@ -635,12 +640,15 @@ static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_sha
    * GFX6-9   ES,VS (VertexID, InstanceID / StepRate0, VSPrimID, InstanceID)
    * GFX10-11 LS    (VertexID, RelAutoIndex, UserVGPR1, UserVGPR2 or InstanceID)
    * GFX10-11 ES,VS (VertexID, UserVGPR1, UserVGPR2 or VSPrimID, UserVGPR3 or InstanceID)
+   * GFX12    LS,ES (VertexID, InstanceID)
    */
    bool is_ls = shader->selector->stage == MESA_SHADER_TESS_CTRL || shader->key.ge.as_ls;
    unsigned max = 0;

    if (shader->info.uses_instanceid) {
-      if (sscreen->info.gfx_level >= GFX10)
+      if (sscreen->info.gfx_level >= GFX12)
+         max = MAX2(max, 1);
+      else if (sscreen->info.gfx_level >= GFX10)
         max = MAX2(max, 3);
      else if (is_ls)
         max = MAX2(max, 2); /* use (InstanceID / StepRate0) because StepRate0 == 1 */
@@ -653,6 +661,7 @@ static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_sha

    /* GFX11: We prefer to compute RelAutoIndex using (WaveID * WaveSize + ThreadID).
     * Older chips didn't have WaveID in LS.
+    * GFX12 doesn't have RelAutoIndex.
     */
    if (is_ls && sscreen->info.gfx_level <= GFX10_3)
       max = MAX2(max, 1); /* RelAutoIndex */
@@ -695,7 +704,14 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
                              si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR) :
                              GFX6_TCS_NUM_USER_SGPR;

-   if (sscreen->info.gfx_level >= GFX11) {
+   if (sscreen->info.gfx_level >= GFX12) {
+      si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_RSRC4_HS,
+                     S_00B420_WAVE_LIMIT(0x3ff) |
+                     S_00B420_GLG_FORCE_DISABLE(1) |
+                     S_00B420_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
+
+      si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_LO_LS, va >> 8);
+   } else if (sscreen->info.gfx_level >= GFX11) {
      si_pm4_set_reg_idx3(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
                          ac_apply_cu_en(S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
                                         S_00B404_CU_EN(0xffff),
@@ -715,7 +731,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
    si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
                   S_00B428_VGPRS(si_shader_encode_vgprs(shader)) |
                   S_00B428_SGPRS(si_shader_encode_sgprs(shader)) |
-                  S_00B428_DX10_CLAMP(1) |
+                  S_00B428_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
                   S_00B428_MEM_ORDERED(si_shader_mem_ordered(shader)) |
                   S_00B428_FLOAT_MODE(shader->config.float_mode) |
                   S_00B428_LS_VGPR_COMP_CNT(sscreen->info.gfx_level >= GFX9 ?
@@ -1287,6 +1303,46 @@ static void gfx11_dgpu_emit_shader_ngg(struct si_context *sctx, unsigned index)
    radeon_end();
 }

+template <enum si_has_tess HAS_TESS>
+static void gfx12_emit_shader_ngg(struct si_context *sctx, unsigned index)
+{
+   struct si_shader *shader = sctx->queued.named.gs;
+
+   if (shader->selector->stage == MESA_SHADER_GEOMETRY)
+      gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);
+
+   radeon_begin(&sctx->gfx_cs);
+   gfx12_begin_context_regs();
+   if (HAS_TESS) {
+      gfx12_opt_set_context_reg(R_028AA4_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                                shader->vgt_tf_param);
+   }
+   gfx12_opt_set_context_reg(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+                             SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
+                             shader->ngg.ge_max_output_per_subgroup);
+   gfx12_opt_set_context_reg(R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL,
+                             shader->ngg.ge_ngg_subgrp_cntl);
+   gfx12_opt_set_context_reg(R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+                             shader->ngg.vgt_gs_max_vert_out);
+   gfx12_opt_set_context_reg(R_028B3C_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
+                             shader->ngg.vgt_gs_instance_cnt);
+   gfx12_opt_set_context_reg(R_02864C_SPI_SHADER_POS_FORMAT, SI_TRACKED_SPI_SHADER_POS_FORMAT,
+                             shader->ngg.spi_shader_pos_format);
+   gfx12_opt_set_context_reg(R_028814_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
+                             shader->ngg.pa_cl_vte_cntl);
+   gfx12_end_context_regs();
+
+   radeon_opt_set_uconfig_reg(sctx, R_030988_VGT_PRIMITIVEID_EN,
+                              SI_TRACKED_VGT_PRIMITIVEID_EN_UCONFIG,
+                              shader->ngg.vgt_primitiveid_en);
+   radeon_end(); /* don't track context rolls on GFX12 */
+
+   assert(!sctx->screen->info.uses_kernel_cu_mask);
+   gfx12_opt_push_gfx_sh_reg(R_00B220_SPI_SHADER_PGM_RSRC4_GS,
+                             SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+                             shader->ngg.spi_shader_pgm_rsrc4_gs);
+}
+
 unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_shader_key *key)
 {
    if (gs->stage == MESA_SHADER_GEOMETRY)
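The gfx12_emit_shader_ngg atom above is built almost entirely from gfx12_opt_set_context_reg() calls, which only emit a register when its value differs from the last value written. A minimal sketch of that shadow-and-skip pattern, where the names, types, and the 64-register bound are illustrative rather than the driver's actual helpers:

#include <stdbool.h>
#include <stdint.h>

/* Shadow state: the last value written per tracked register. */
struct tracked_regs {
   uint32_t value[64];
   uint64_t known_mask; /* bit i set once value[i] is known */
};

/* Emit a register write only when needed; returns true if it emitted. */
static bool opt_set_reg(struct tracked_regs *t, unsigned id, uint32_t value,
                        void (*emit_write)(unsigned id, uint32_t value))
{
   if ((t->known_mask & (1ull << id)) && t->value[id] == value)
      return false; /* the register already holds this value */

   t->value[id] = value;
   t->known_mask |= 1ull << id;
   emit_write(id, value);
   return true;
}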
@@ -1359,7 +1415,12 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
    if (!pm4)
       return;

-   if (sscreen->info.has_set_context_pairs_packed) {
+   if (sscreen->info.gfx_level >= GFX12) {
+      if (es_stage == MESA_SHADER_TESS_EVAL)
+         pm4->atom.emit = gfx12_emit_shader_ngg<TESS_ON>;
+      else
+         pm4->atom.emit = gfx12_emit_shader_ngg<TESS_OFF>;
+   } else if (sscreen->info.has_set_context_pairs_packed) {
       if (es_stage == MESA_SHADER_TESS_EVAL)
          pm4->atom.emit = gfx11_dgpu_emit_shader_ngg<TESS_ON>;
       else

@@ -1394,6 +1455,15 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
    /* Primitives with adjancency can only occur without tessellation. */
    assert(gs_info->gs_input_verts_per_prim <= 3 || es_stage == MESA_SHADER_VERTEX);

+   if (sscreen->info.gfx_level >= GFX12) {
+      if (gs_info->gs_input_verts_per_prim >= 4)
+         gs_vgpr_comp_cnt = 2; /* VGPR2 contains offsets 3-5 */
+      else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) ||
+               (gs_stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
+         gs_vgpr_comp_cnt = 1; /* VGPR1 contains PrimitiveID */
+      else
+         gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0-2, edgeflags, GS invocation ID. */
+   } else {
       /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
        * VGPR[0:4] are always loaded.
        *

@@ -1411,12 +1481,18 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
          gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
       else
          gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+   }
+
+   if (sscreen->info.gfx_level >= GFX12) {
+      si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_LO_ES, va >> 8);
+   } else {
       si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+   }

    si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
                   S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
                   S_00B228_FLOAT_MODE(shader->config.float_mode) |
-                  S_00B228_DX10_CLAMP(1) |
+                  S_00B228_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
                   S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
                   S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
    si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
@@ -1460,6 +1536,25 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                              S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.ge.mono.u.vs_export_prim_id ||
                                                                gs_sel->info.writes_primid);

+   if (sscreen->info.gfx_level >= GFX12) {
+      unsigned num_params = shader->info.nr_param_exports;
+
+      /* Since there is no alloc/dealloc mechanism for the 12-bit ordered IDs, they can wrap
+       * around if there are more than 2^12 workgroups, causing 2 workgroups to get the same
+       * ordered ID, which would break the streamout algorithm.
+       * The recommended solution is to use the alloc/dealloc mechanism of the attribute ring,
+       * which is enough to limit the range of ordered IDs that can be in flight.
+       */
+      if (si_shader_uses_streamout(shader))
+         num_params = MAX2(num_params, 8);
+
+      shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B220_SPI_SHADER_LATE_ALLOC_GS(127) |
+                                            S_00B220_GLG_FORCE_DISABLE(1) |
+                                            S_00B220_WAVE_LIMIT(0x3ff) |
+                                            S_00B220_INST_PREF_SIZE(si_get_shader_prefetch_size(shader));
+      shader->ngg.spi_vs_out_config = S_00B0C4_VS_EXPORT_COUNT(MAX2(num_params, 1) - 1) |
+                                      S_00B0C4_NO_PC_EXPORT(num_params == 0);
+   } else {
       unsigned late_alloc_wave64, cu_mask;

       ac_compute_late_alloc(&sscreen->info, true, shader->key.ge.opt.ngg_culling,
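To make the wraparound comment above concrete: ordered IDs are 12-bit, so they live in [0, 4095] and comparisons between in-flight IDs are only unambiguous while fewer than half of that space can be live at once. A standalone sketch of the modular comparison, assuming the 12-bit width stated in the comment (helper names are illustrative):

#include <stdbool.h>
#include <stdint.h>

#define ORDERED_ID_BITS 12u
#define ORDERED_ID_MASK ((1u << ORDERED_ID_BITS) - 1) /* IDs wrap modulo 4096 */

/* "a was issued before b" under modular wraparound. This is only well defined
 * while fewer than 2^11 IDs are in flight; with 2^12 or more workgroups live,
 * two of them alias the same ID and ordering is lost, which is the failure
 * mode the attribute-ring alloc/dealloc mechanism prevents. */
static bool ordered_id_before(uint32_t a, uint32_t b)
{
   uint32_t diff = (b - a) & ORDERED_ID_MASK;
   return diff != 0 && diff < (ORDERED_ID_MASK + 1) / 2;
}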
@@ -1501,10 +1596,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                                      ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff),
                                                     C_00B204_CU_EN_GFX10, 16, &sscreen->info);
       }
+   }

    if (sscreen->info.gfx_level >= GFX11) {
       /* This should be <= 252 for performance on Gfx11. 256 works too but is slower. */
-      unsigned max_prim_grp_size = 252;
+      unsigned max_prim_grp_size = sscreen->info.gfx_level >= GFX12 ? 256 : 252;
       unsigned prim_amp_factor = gs_stage == MESA_SHADER_GEOMETRY ?
                                     gs_sel->info.base.gs.vertices_out : 1;

@@ -1552,6 +1648,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                                   S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
    }

+   if (sscreen->info.gfx_level >= GFX12) {
+      shader->ngg.vgt_shader_stages_en =
+         S_028A98_GS_EN(gs_stage == MESA_SHADER_GEOMETRY) |
+         S_028A98_PRIMGEN_PASSTHRU_NO_MSG(gfx10_is_ngg_passthrough(shader)) |
+         S_028A98_GS_W32_EN(shader->wave_size == 32) |
+         S_028A98_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader));
+   } else {
       shader->ngg.vgt_shader_stages_en =
          S_028B54_ES_EN(es_stage == MESA_SHADER_TESS_EVAL ?
                            V_028B54_ES_STAGE_DS : V_028B54_ES_STAGE_REAL) |

@@ -1563,6 +1666,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
          S_028B54_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader)) |
          S_028B54_GS_W32_EN(shader->wave_size == 32) |
          S_028B54_MAX_PRIMGRP_IN_WAVE(2);
+   }

    si_pm4_finalize(pm4);
 }

@@ -1836,6 +1940,32 @@ static void gfx11_dgpu_emit_shader_ps(struct si_context *sctx, unsigned index)
    radeon_end(); /* don't track context rolls on GFX11 */
 }

+static void gfx12_emit_shader_ps(struct si_context *sctx, unsigned index)
+{
+   struct si_shader *shader = sctx->queued.named.ps;
+
+   radeon_begin(&sctx->gfx_cs);
+   gfx12_begin_context_regs();
+   gfx12_opt_set_context_reg(R_028640_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL,
+                             shader->ps.spi_ps_in_control);
+   gfx12_opt_set_context_reg(R_028650_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT,
+                             shader->ps.spi_shader_z_format);
+   gfx12_opt_set_context_reg(R_028654_SPI_SHADER_COL_FORMAT, SI_TRACKED_SPI_SHADER_COL_FORMAT,
+                             shader->ps.spi_shader_col_format);
+   gfx12_opt_set_context_reg(R_028658_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL,
+                             shader->ps.spi_baryc_cntl);
+   gfx12_opt_set_context_reg(R_02865C_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA,
+                             shader->ps.spi_ps_input_ena);
+   gfx12_opt_set_context_reg(R_028660_SPI_PS_INPUT_ADDR, SI_TRACKED_SPI_PS_INPUT_ADDR,
+                             shader->ps.spi_ps_input_addr);
+   gfx12_opt_set_context_reg(R_028854_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK,
+                             shader->ps.cb_shader_mask);
+   gfx12_opt_set_context_reg(R_028BBC_PA_SC_HISZ_CONTROL, SI_TRACKED_PA_SC_HISZ_CONTROL,
+                             shader->ps.pa_sc_hisz_control);
+   gfx12_end_context_regs();
+   radeon_end(); /* don't track context rolls on GFX12 */
+}
+
 static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
 {
    struct si_shader_info *info = &shader->selector->info;

@@ -1884,13 +2014,19 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
                                  S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(info->writes_stencil) |
                                  S_02880C_MASK_EXPORT_ENABLE(shader->ps.writes_samplemask) |
                                  S_02880C_KILL_ENABLE(si_shader_uses_discard(shader));
+   if (sscreen->info.gfx_level >= GFX12)
+      shader->ps.pa_sc_hisz_control = S_028BBC_ROUND(2); /* required minimum value */

    switch (info->base.fs.depth_layout) {
    case FRAG_DEPTH_LAYOUT_GREATER:
       shader->ps.db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+      if (sscreen->info.gfx_level >= GFX12)
+         shader->ps.pa_sc_hisz_control |= S_028BBC_CONSERVATIVE_Z_EXPORT(V_028BBC_EXPORT_GREATER_THAN_Z);
       break;
    case FRAG_DEPTH_LAYOUT_LESS:
       shader->ps.db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+      if (sscreen->info.gfx_level >= GFX12)
+         shader->ps.pa_sc_hisz_control |= S_028BBC_CONSERVATIVE_Z_EXPORT(V_028BBC_EXPORT_LESS_THAN_Z);
       break;
    default:;
    }

@@ -2001,6 +2137,11 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
       }
    }

+   if (sscreen->info.gfx_level >= GFX12) {
+      shader->ps.spi_ps_in_control = S_028640_PARAM_GEN(shader->key.ps.mono.point_smoothing) |
+                                     S_028640_PS_W32_EN(shader->wave_size == 32);
+      shader->ps.spi_gs_out_config_ps = S_00B0C4_NUM_INTERP(shader->ps.num_interp);
+   } else {
       /* Enable PARAM_GEN for point smoothing.
        * Gfx11 workaround when there are no PS inputs but LDS is used.
        */

@@ -2011,12 +2152,15 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
       shader->ps.spi_ps_in_control = S_0286D8_NUM_INTERP(shader->ps.num_interp) |
                                      S_0286D8_PARAM_GEN(param_gen) |
                                      S_0286D8_PS_W32_EN(shader->wave_size == 32);
+   }

    struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader, NULL);
    if (!pm4)
       return;

-   if (sscreen->info.has_set_context_pairs_packed)
+   if (sscreen->info.gfx_level >= GFX12)
+      pm4->atom.emit = gfx12_emit_shader_ps;
+   else if (sscreen->info.has_set_context_pairs_packed)
       pm4->atom.emit = gfx11_dgpu_emit_shader_ps;
    else
       pm4->atom.emit = gfx6_emit_shader_ps;

@@ -2029,7 +2173,12 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
       si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
    }

-   if (sscreen->info.gfx_level >= GFX11) {
+   if (sscreen->info.gfx_level >= GFX12) {
+      si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC4_PS,
+                     S_00B01C_WAVE_LIMIT_GFX12(0x3FF) |
+                     S_00B01C_LDS_GROUP_SIZE_GFX12(1) |
+                     S_00B01C_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
+   } else if (sscreen->info.gfx_level >= GFX11) {
       unsigned cu_mask_ps = gfx103_get_cu_mask_ps(sscreen);

       si_pm4_set_reg_idx3(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,

@@ -2046,7 +2195,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
    si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
                   S_00B028_VGPRS(si_shader_encode_vgprs(shader)) |
                   S_00B028_SGPRS(si_shader_encode_sgprs(shader)) |
-                  S_00B028_DX10_CLAMP(1) |
+                  S_00B028_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
                   S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) |
                   S_00B028_FLOAT_MODE(shader->config.float_mode));
    si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,

@@ -2306,6 +2455,8 @@ static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_sele
                                  sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid;
    key->ge.opt.remove_streamout = vs->info.enabled_streamout_buffer_mask &&
                                   !sctx->streamout.enabled_mask;
+   if (sctx->gfx_level >= GFX12)
+      key->ge.mono.remove_streamout = key->ge.opt.remove_streamout;
 }

 static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs,

@@ -2316,6 +2467,7 @@ static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_se
    key->ge.opt.remove_streamout = 0;
    key->ge.opt.ngg_culling = 0;
    key->ge.mono.u.vs_export_prim_id = 0;
+   key->ge.mono.remove_streamout = 0;
 }

 void si_ps_key_update_framebuffer(struct si_context *sctx)

@@ -3434,8 +3586,8 @@ static void si_update_streamout_state(struct si_context *sctx)
    sctx->streamout.stride_in_dw = shader_with_so->info.base.xfb_stride;

    /* GDS must be allocated when any GDS instructions are used, otherwise it hangs. */
-   if (sctx->gfx_level >= GFX11 && shader_with_so->info.enabled_streamout_buffer_mask &&
-       !sctx->screen->gds_oa) {
+   if (sctx->gfx_level >= GFX11 && sctx->gfx_level < GFX12 &&
+       shader_with_so->info.enabled_streamout_buffer_mask && !sctx->screen->gds_oa) {
       /* Gfx11 only uses GDS OA, not GDS memory. */
       simple_mtx_lock(&sctx->screen->gds_mutex);
       if (!sctx->screen->gds_oa) {

@@ -4288,8 +4440,10 @@ static void si_emit_vgt_pipeline_state(struct si_context *sctx, unsigned index)
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;

    radeon_begin(cs);
-   radeon_opt_set_context_reg(sctx, R_028B54_VGT_SHADER_STAGES_EN, SI_TRACKED_VGT_SHADER_STAGES_EN,
-                              sctx->vgt_shader_stages_en);
+   radeon_opt_set_context_reg(sctx, sctx->gfx_level >= GFX12 ?
+                                       R_028A98_VGT_SHADER_STAGES_EN :
+                                       R_028B54_VGT_SHADER_STAGES_EN,
+                              SI_TRACKED_VGT_SHADER_STAGES_EN, sctx->vgt_shader_stages_en);
    if (sctx->gfx_level == GFX10_3) {
       /* Legacy Tess+GS should disable reuse to prevent hangs on GFX10.3. */
       bool has_legacy_tess_gs = G_028B54_HS_EN(sctx->vgt_shader_stages_en) &&
@@ -4414,6 +4568,11 @@ static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertic
       sctx->do_update_shaders = true;
    }

+   /* Gfx12 programs patch_vertices in VGT_PRIMITIVE_TYPE.NUM_INPUT_CP. Make sure
+    * the register is updated.
+    */
+   if (sctx->gfx_level >= GFX12 && sctx->last_prim == MESA_PRIM_PATCHES)
+      sctx->last_prim = -1;
 }
 }
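The hunk above relies on the draw path caching the last primitive type and skipping the VGT_PRIMITIVE_TYPE write when it matches; on Gfx12 that register now also carries the patch control-point count, so the cache has to be poisoned whenever patch_vertices changes. A sketch of the draw-time interaction under that assumption, where the field position, register offset, and enum value are illustrative:

#include <stdint.h>

enum { MESA_PRIM_PATCHES = 14 };    /* enum value illustrative */
#define NUM_INPUT_CP_SHIFT 16       /* bit position illustrative */
#define VGT_PRIMITIVE_TYPE 0x030908 /* register offset illustrative */

struct draw_ctx {
   int last_prim;
};

static void set_uconfig_reg(uint32_t reg, uint32_t value) { (void)reg; (void)value; }

static void emit_prim_type(struct draw_ctx *sctx, int prim, uint32_t patch_vertices)
{
   uint32_t value = (uint32_t)prim;
   if (prim == MESA_PRIM_PATCHES)
      value |= patch_vertices << NUM_INPUT_CP_SHIFT; /* Gfx12: NUM_INPUT_CP rides along */

   /* The cache is keyed on the prim type alone, so a changed patch_vertices
    * with an unchanged prim would be missed; hence last_prim = -1 above. */
   if (prim != sctx->last_prim) {
      set_uconfig_reg(VGT_PRIMITIVE_TYPE, value);
      sctx->last_prim = prim;
   }
}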
@@ -4556,13 +4715,15 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
    sctx->ls_hs_rsrc2 = ls_hs_rsrc2;
    sctx->ls_hs_config =
       S_028B58_NUM_PATCHES(sctx->num_patches_per_workgroup) |
-      S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
       S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);

+   if (sctx->gfx_level < GFX12)
+      sctx->ls_hs_config |= S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp);
+
    si_mark_atom_dirty(sctx, &sctx->atoms.s.tess_io_layout);
 }

-static void si_emit_tess_io_layout_state(struct si_context *sctx, unsigned index)
+static void gfx6_emit_tess_io_layout_state(struct si_context *sctx, unsigned index)
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;

@@ -4570,7 +4731,20 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx, unsigned index
       return;

    radeon_begin(cs);
-   if (sctx->screen->info.has_set_sh_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      gfx12_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
+                                SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);
+
+      /* Set userdata SGPRs for merged LS-HS. */
+      gfx12_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+                                GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
+                                SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
+                                sctx->tcs_offchip_layout);
+      gfx12_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+                                GFX9_SGPR_TCS_OFFCHIP_ADDR * 4,
+                                SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR,
+                                sctx->tes_offchip_ring_va_sgpr);
+   } else if (sctx->screen->info.has_set_sh_pairs_packed) {
       gfx11_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
                                 SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);

@@ -4647,6 +4821,46 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx, unsigned index
    radeon_end_update_context_roll(sctx);
 }

+static void gfx12_emit_tess_io_layout_state(struct si_context *sctx, unsigned index)
+{
+   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+
+   if (!sctx->shader.tes.cso || !sctx->shader.tcs.current)
+      return;
+
+   gfx12_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
+                             SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);
+   /* Set userdata SGPRs for merged LS-HS. */
+   gfx12_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+                             GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
+                             SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
+                             sctx->tcs_offchip_layout);
+   gfx12_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+                             GFX9_SGPR_TCS_OFFCHIP_ADDR * 4,
+                             SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR,
+                             sctx->tes_offchip_ring_va_sgpr);
+
+   /* Set userdata SGPRs for TES. */
+   unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
+   assert(tes_sh_base);
+
+   /* TES (as ES or VS) reuses the BaseVertex and DrawID user SGPRs that are used when
+    * tessellation is disabled. We can do that because those user SGPRs are only set in LS
+    * for tessellation and are unused in TES.
+    */
+   gfx12_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
+                             SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX,
+                             sctx->tcs_offchip_layout);
+   gfx12_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4,
+                             SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID,
+                             sctx->tes_offchip_ring_va_sgpr);
+
+   radeon_begin(cs);
+   radeon_opt_set_context_reg_idx(sctx, R_028B58_VGT_LS_HS_CONFIG,
+                                  SI_TRACKED_VGT_LS_HS_CONFIG, 2, sctx->ls_hs_config);
+   radeon_end(); /* don't track context rolls on GFX12 */
+}
+
 void si_init_screen_live_shader_cache(struct si_screen *sscreen)
 {
    util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector,
@@ -4657,14 +4871,20 @@ template<int NUM_INTERP>
 static void si_emit_spi_map(struct si_context *sctx, unsigned index)
 {
    struct si_shader *ps = sctx->shader.ps.current;
+   struct si_shader *vs = si_get_vs(sctx)->current;
    unsigned spi_ps_input_cntl[NUM_INTERP];

    STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32);

+   if (sctx->gfx_level >= GFX12) {
+      gfx12_opt_push_gfx_sh_reg(R_00B0C4_SPI_SHADER_GS_OUT_CONFIG_PS,
+                                SI_TRACKED_SPI_SHADER_GS_OUT_CONFIG_PS,
+                                vs->ngg.spi_vs_out_config | ps->ps.spi_gs_out_config_ps);
+   }
+
    if (!NUM_INTERP)
       return;

-   struct si_shader *vs = si_get_vs(sctx)->current;
    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;

    for (unsigned i = 0; i < NUM_INTERP; i++) {
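The single gfx12_opt_push_gfx_sh_reg() above works because the VS-owned half of the register (S_00B0C4_VS_EXPORT_COUNT, precomputed into shader->ngg.spi_vs_out_config in the NGG hunk earlier) and the PS-owned half (S_00B0C4_NUM_INTERP, precomputed into shader->ps.spi_gs_out_config_ps) occupy disjoint bitfields, so they combine with a plain OR; this is also why vs is now looked up before the !NUM_INTERP early return. A sketch of the disjoint-field merge, with illustrative bit positions (the real layout comes from the S_00B0C4_* macros):

#include <stdint.h>

/* Illustrative bit positions; the property that matters is that the two
 * halves never overlap. */
#define VS_EXPORT_COUNT(x) (((uint32_t)(x) & 0x1f) << 0) /* VS/NGG-owned half */
#define NUM_INTERP(x)      (((uint32_t)(x) & 0x3f) << 5) /* PS-owned half */

static uint32_t gs_out_config_ps(uint32_t vs_half, uint32_t ps_half)
{
   /* Each shader precomputes only its own fields, so the emit path combines
    * them with one OR; no read-modify-write of the register is needed. */
   return vs_half | ps_half;
}

/* e.g. vs_half = VS_EXPORT_COUNT(num_params - 1), ps_half = NUM_INTERP(num_interp) */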
@@ -4703,11 +4923,18 @@ static void si_emit_spi_map(struct si_context *sctx, unsigned index)
     * Dota 2: Only ~16% of SPI map updates set different values.
     * Talos: Only ~9% of SPI map updates set different values.
     */
+   if (sctx->gfx_level >= GFX12) {
+      radeon_begin(&sctx->gfx_cs);
+      radeon_opt_set_context_regn(sctx, R_028664_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
+                                  sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP);
+      radeon_end(); /* don't track context rolls on GFX12 */
+   } else {
       radeon_begin(&sctx->gfx_cs);
       radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
                                   sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP);
       radeon_end_update_context_roll(sctx);
    }
+}

 static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index)
 {

@@ -4742,7 +4969,9 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index)
       radeon_emit(sscreen->hs.hs_offchip_param); /* R_03093C_VGT_HS_OFFCHIP_PARAM */
       radeon_emit(factor_va >> 8); /* R_030940_VGT_TF_MEMORY_BASE */

-      if (sctx->gfx_level >= GFX10)
+      if (sctx->gfx_level >= GFX12)
+         radeon_set_uconfig_reg(R_03099C_VGT_TF_MEMORY_BASE_HI, S_03099C_BASE_HI(factor_va >> 40));
+      else if (sctx->gfx_level >= GFX10)
          radeon_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(factor_va >> 40));
       else if (sctx->gfx_level == GFX9)
          radeon_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(factor_va >> 40));
@@ -4783,15 +5012,34 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index)
       radeon_emit(S_585_PWS_ENA(1));
       radeon_emit(0); /* GCR_CNTL */

-      assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi);
+      uint64_t attr_address = sscreen->attribute_pos_prim_ring->gpu_address;
+      assert((attr_address >> 32) == sscreen->info.address32_hi);

       radeon_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4);
       radeon_emit(0x12355123); /* SPI_GS_THROTTLE_CNTL1 */
       radeon_emit(0x1544D); /* SPI_GS_THROTTLE_CNTL2 */
-      radeon_emit(sscreen->attribute_ring->gpu_address >> 16); /* SPI_ATTRIBUTE_RING_BASE */
+      radeon_emit(attr_address >> 16); /* SPI_ATTRIBUTE_RING_BASE */
       radeon_emit(S_03111C_MEM_SIZE((sscreen->info.attribute_ring_size_per_se >> 16) - 1) |
                   S_03111C_BIG_PAGE(sscreen->info.discardable_allows_big_page) |
                   S_03111C_L1_POLICY(1)); /* SPI_ATTRIBUTE_RING_SIZE */
+
+      if (sctx->gfx_level >= GFX12) {
+         uint64_t pos_address = attr_address + sscreen->info.pos_ring_offset;
+         uint64_t prim_address = attr_address + sscreen->info.prim_ring_offset;
+
+         /* When one of these 4 registers is updated, all 4 must be updated. */
+         radeon_set_uconfig_reg_seq(R_0309A0_GE_POS_RING_BASE, 4);
+         radeon_emit(pos_address >> 16); /* R_0309A0_GE_POS_RING_BASE */
+         radeon_emit(S_0309A4_MEM_SIZE(sscreen->info.pos_ring_size_per_se >> 5)); /* R_0309A4_GE_POS_RING_SIZE */
+         radeon_emit(prim_address >> 16); /* R_0309A8_GE_PRIM_RING_BASE */
+         radeon_emit(S_0309AC_MEM_SIZE(sscreen->info.prim_ring_size_per_se >> 5) |
+                     S_0309AC_SCOPE(gfx12_scope_device) |
+                     S_0309AC_PAF_TEMPORAL(gfx12_store_high_temporal_stay_dirty) |
+                     S_0309AC_PAB_TEMPORAL(gfx12_load_last_use_discard) |
+                     S_0309AC_SPEC_DATA_READ(gfx12_spec_read_auto) |
+                     S_0309AC_FORCE_SE_SCOPE(1) |
+                     S_0309AC_PAB_NOFILL(1)); /* R_0309AC_GE_PRIM_RING_SIZE */
+      }
       radeon_end();
    }
 }
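The shift amounts in the new ring setup encode two assumptions worth spelling out: writing base >> 16 into a BASE register implies the ring address must be 64 KiB aligned, and a MEM_SIZE field fed with size >> 5 counts 32-byte blocks. A sketch of the encoding under those assumptions (the helper is illustrative):

#include <assert.h>
#include <stdint.h>

/* Encode a ring base/size pair the way the GE_POS/GE_PRIM ring registers
 * above consume them. */
static void encode_ring(uint64_t base, uint32_t size_per_se,
                        uint32_t *base_field, uint32_t *size_field)
{
   assert((base & 0xffff) == 0);      /* 64 KiB alignment required by base >> 16 */
   assert((size_per_se & 0x1f) == 0); /* size must be a multiple of 32 bytes */

   *base_field = (uint32_t)(base >> 16);
   *size_field = size_per_se >> 5;
}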
@@ -4800,9 +5048,13 @@ void si_init_shader_functions(struct si_context *sctx)
 {
    sctx->atoms.s.vgt_pipeline_state.emit = si_emit_vgt_pipeline_state;
    sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
-   sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state;
    sctx->atoms.s.spi_ge_ring_state.emit = si_emit_spi_ge_ring_state;

+   if (sctx->gfx_level >= GFX12)
+      sctx->atoms.s.tess_io_layout.emit = gfx12_emit_tess_io_layout_state;
+   else
+      sctx->atoms.s.tess_io_layout.emit = gfx6_emit_tess_io_layout_state;
+
    sctx->b.create_vs_state = si_create_shader;
    sctx->b.create_tcs_state = si_create_shader;
    sctx->b.create_tes_state = si_create_shader;

@@ -67,6 +67,9 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
    if (!old_num_targets && !num_targets)
       return;

+   if (sctx->gfx_level >= GFX12)
+      si_set_internal_shader_buffer(sctx, SI_STREAMOUT_STATE_BUF, NULL);
+
    /* We are going to unbind the buffers. Mark which caches need to be flushed. */
    if (old_num_targets && sctx->streamout.begin_emitted) {
       /* Stop streamout. */

@@ -97,6 +100,11 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
        */
       sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
                      SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
+
+      /* Make the streamout state buffer available to the CP for resuming and DrawTF. */
+      if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope)
+         sctx->flags |= SI_CONTEXT_WB_L2;
+
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }

@@ -105,7 +113,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
     * spec@ext_transform_feedback@immediate-reuse-index-buffer
     * spec@ext_transform_feedback@immediate-reuse-uniform-buffer
     */
-   if (sctx->gfx_level >= GFX11 && old_num_targets)
+   if (sctx->gfx_level >= GFX11 && sctx->gfx_level < GFX12 && old_num_targets)
       si_flush_gfx_cs(sctx, 0, NULL);

    /* Streamout buffers must be bound in 2 places:
@@ -129,11 +137,55 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ

       /* Allocate space for the filled buffer size. */
       struct si_streamout_target *t = sctx->streamout.targets[i];
-      if (!t->buf_filled_size) {
-         unsigned buf_filled_size_size = sctx->gfx_level >= GFX11 ? 8 : 4;
-         u_suballocator_alloc(&sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
+      if (sctx->gfx_level >= GFX12) {
+         bool first_target = util_bitcount(enabled_mask) == 1;
+
+         /* The first enabled streamout target will contain the ordered ID/offset buffer for all
+          * targets.
+          */
+         if (first_target && !append_bitmask) {
+            /* The layout is:
+             *    struct {
+             *       struct {
+             *          uint32_t ordered_id; // equal for all buffers
+             *          uint32_t dwords_written;
+             *       } buffer[4];
+             *    };
+             *
+             * The buffer must be initialized to 0 and the address must be aligned to 64
+             * because it's faster when the atomic doesn't straddle a 64B block boundary.
+             */
+            unsigned alloc_size = 32;
+            unsigned alignment = 64;
+
+            si_resource_reference(&t->buf_filled_size, NULL);
+            u_suballocator_alloc(&sctx->allocator_zeroed_memory, alloc_size, alignment,
                              &t->buf_filled_size_offset,
                              (struct pipe_resource **)&t->buf_filled_size);
+
+            /* Offset to dwords_written of the first enabled streamout buffer. */
+            t->buf_filled_size_draw_count_offset = t->buf_filled_size_offset + i * 8 + 4;
+         }
+
+         if (first_target) {
+            struct pipe_shader_buffer sbuf;
+            sbuf.buffer = &t->buf_filled_size->b.b;
+            sbuf.buffer_offset = t->buf_filled_size_offset;
+            sbuf.buffer_size = 32; /* unused, the shader only uses the low 32 bits of the address */
+
+            si_set_internal_shader_buffer(sctx, SI_STREAMOUT_STATE_BUF, &sbuf);
+         }
+      } else {
+         /* GFX6-11 */
+         if (!t->buf_filled_size) {
+            unsigned alloc_size = sctx->gfx_level >= GFX11 ? 8 : 4;
+
+            u_suballocator_alloc(&sctx->allocator_zeroed_memory, alloc_size, 4,
+                                 &t->buf_filled_size_offset,
+                                 (struct pipe_resource **)&t->buf_filled_size);
+            t->buf_filled_size_draw_count_offset = t->buf_filled_size_offset;
+         }
       }

       /* Bind it to the shader. */
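The buf_filled_size_draw_count_offset arithmetic above (base + i * 8 + 4) follows directly from the layout in the comment; written out as a real struct, the offset is just offsetof of the per-buffer dwords_written field:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* The Gfx12 streamout state buffer from the comment above, written out.
 * One 32-byte, 64-byte-aligned allocation serves all four targets. */
struct gfx12_streamout_state {
   struct {
      uint32_t ordered_id;     /* equal for all buffers */
      uint32_t dwords_written;
   } buffer[4];
};

/* base + i * 8 + 4 is just the offset of buffer[i].dwords_written: */
static_assert(offsetof(struct gfx12_streamout_state, buffer[2].dwords_written) == 2 * 8 + 4,
              "draw-count offset");
static_assert(sizeof(struct gfx12_streamout_state) == 32, "matches alloc_size above");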
@@ -156,6 +208,11 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
    }

+   /* Either streamout is being resumed for all targets or none. Required by how we implement it
+    * for GFX12.
+    */
+   assert(!append_bitmask || enabled_mask == append_bitmask);
+
    if (!!sctx->streamout.enabled_mask != !!enabled_mask)
       sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */

@@ -219,6 +276,7 @@ static void si_emit_streamout_begin(struct si_context *sctx, unsigned index)
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
    struct si_streamout_target **t = sctx->streamout.targets;
+   bool first_target = true;

    if (sctx->gfx_level < GFX11)
       si_flush_vgt_streamout(sctx);

@@ -229,7 +287,22 @@ static void si_emit_streamout_begin(struct si_context *sctx, unsigned index)

       t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

-      if (sctx->gfx_level >= GFX11) {
+      if (sctx->gfx_level >= GFX12) {
+         /* Only the first streamout target holds information. */
+         if (first_target) {
+            if (sctx->streamout.append_bitmask & (1 << i)) {
+               si_cp_copy_data(sctx, cs, COPY_DATA_REG, NULL,
+                               R_0309B0_GE_GS_ORDERED_ID_BASE >> 2, COPY_DATA_SRC_MEM,
+                               t[i]->buf_filled_size, t[i]->buf_filled_size_offset);
+            } else {
+               radeon_begin(cs);
+               radeon_set_uconfig_reg(R_0309B0_GE_GS_ORDERED_ID_BASE, 0);
+               radeon_end();
+            }
+
+            first_target = false;
+         }
+      } else if (sctx->gfx_level >= GFX11) {
          if (sctx->streamout.append_bitmask & (1 << i)) {
             /* Restore the register value. */
             si_cp_copy_data(sctx, cs, COPY_DATA_REG, NULL,

@@ -289,6 +362,14 @@ void si_emit_streamout_end(struct si_context *sctx)
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
    struct si_streamout_target **t = sctx->streamout.targets;

+   if (sctx->gfx_level >= GFX12) {
+      /* Nothing to do. The streamout state buffer already contains the next ordered ID, which
+       * is the only thing we need to restore.
+       */
+      sctx->streamout.begin_emitted = false;
+      return;
+   }
+
    if (sctx->gfx_level >= GFX11) {
       /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
       sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;

@@ -8,7 +8,8 @@
 #include "util/u_upload_mgr.h"
 #include "util/u_viewport.h"

-#define SI_MAX_SCISSOR 16384
+#define GFX6_MAX_VIEWPORT_SIZE 16384
+#define GFX12_MAX_VIEWPORT_SIZE 32768 /* TODO: this should be 64K, but maxx/maxy doesn't have enough bits */

 static void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out)
 {

@@ -97,7 +98,11 @@ static void si_emit_cull_state(struct si_context *sctx, unsigned index)
    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf,
                              RADEON_USAGE_READ | RADEON_PRIO_CONST_BUFFER);

-   if (sctx->screen->info.has_set_sh_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      gfx12_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
+                            GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4,
+                            sctx->small_prim_cull_info_address);
+   } else if (sctx->screen->info.has_set_sh_pairs_packed) {
       gfx11_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
                             GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4,
                             sctx->small_prim_cull_info_address);

@@ -195,10 +200,12 @@ static void si_get_scissor_from_viewport(struct si_context *ctx,
 static void si_clamp_scissor(struct si_context *ctx, struct pipe_scissor_state *out,
                              struct si_signed_scissor *scissor)
 {
-   out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR);
-   out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR);
-   out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR);
-   out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR);
+   unsigned max_scissor = ctx->gfx_level >= GFX12 ? GFX12_MAX_VIEWPORT_SIZE : GFX6_MAX_VIEWPORT_SIZE;
+
+   out->minx = CLAMP(scissor->minx, 0, max_scissor);
+   out->miny = CLAMP(scissor->miny, 0, max_scissor);
+   out->maxx = CLAMP(scissor->maxx, 0, max_scissor);
+   out->maxy = CLAMP(scissor->maxy, 0, max_scissor);
 }

 static void si_clip_scissor(struct pipe_scissor_state *out, struct pipe_scissor_state *clip)

@@ -226,7 +233,7 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs

    if (ctx->vs_disables_clipping_viewport) {
       final.minx = final.miny = 0;
-      final.maxx = final.maxy = SI_MAX_SCISSOR;
+      final.maxx = final.maxy = ctx->gfx_level >= GFX12 ? GFX12_MAX_VIEWPORT_SIZE : GFX6_MAX_VIEWPORT_SIZE;
    } else {
       si_clamp_scissor(ctx, &final, vp_scissor);
    }
@@ -235,6 +242,16 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs
       si_clip_scissor(&final, scissor);

    radeon_begin(cs);
+   if (ctx->gfx_level >= GFX12) {
+      if (final.maxx == 0 || final.maxy == 0) {
+         /* An empty scissor must be done like this because the bottom-right bounds are inclusive. */
+         radeon_emit(S_028250_TL_X(1) | S_028250_TL_Y_GFX12(1));
+         radeon_emit(S_028254_BR_X(0) | S_028254_BR_Y(0));
+      } else {
+         radeon_emit(S_028250_TL_X(final.minx) | S_028250_TL_Y_GFX12(final.miny));
+         radeon_emit(S_028254_BR_X(final.maxx - 1) | S_028254_BR_Y(final.maxy - 1));
+      }
+   } else {
       /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and
        * any_scissor.BR_X/Y <= 0.
        */

@@ -246,6 +263,7 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs
                   S_028250_WINDOW_OFFSET_DISABLE(1));
       radeon_emit(S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy));
    }
+   }
    radeon_end();
 }
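The scissor conversion in the Gfx12 branch above boils down to two rules: gallium scissors are half-open while the Gfx12 bottom-right bounds are inclusive (hence maxx - 1 / maxy - 1), and a zero-area scissor has no direct encoding, so an inverted rectangle with TL > BR is written instead. A standalone sketch of that conversion (struct and function names are illustrative):

#include <stdint.h>

struct hw_rect {
   uint32_t tl_x, tl_y; /* inclusive */
   uint32_t br_x, br_y; /* inclusive on Gfx12 */
};

/* Convert a half-open gallium scissor ([minx, maxx) x [miny, maxy)) to the
 * inclusive Gfx12 encoding. Zero area is expressed as an inverted rectangle
 * (TL > BR), matching the TL=(1,1)/BR=(0,0) case above. */
static struct hw_rect gfx12_scissor(uint32_t minx, uint32_t miny,
                                    uint32_t maxx, uint32_t maxy)
{
   if (maxx == 0 || maxy == 0)
      return (struct hw_rect){1, 1, 0, 0};
   return (struct hw_rect){minx, miny, maxx - 1, maxy - 1};
}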
@@ -284,7 +302,7 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index)
    const unsigned hw_screen_offset_alignment =
       sctx->gfx_level >= GFX11 ? 32 :
      sctx->gfx_level >= GFX8 ? 16 : MAX2(sctx->screen->se_tile_repeat, 16);
-   const unsigned max_hw_screen_offset = 8176;
+   const unsigned max_hw_screen_offset = sctx->gfx_level >= GFX12 ? 32752 : 8176;

    /* Indexed by quantization modes */
    static int max_viewport_size[] = {65536, 16384, 4096};

@@ -367,7 +385,21 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index)
     * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
     * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
     */
-   if (sctx->screen->info.has_set_context_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      radeon_begin(&sctx->gfx_cs);
+      gfx12_begin_context_regs();
+      gfx12_opt_set_context_reg(R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL,
+                                pa_su_vtx_cntl);
+      gfx12_opt_set_context_reg4(R_02842C_PA_CL_GB_VERT_CLIP_ADJ,
+                                 SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
+                                 fui(guardband_y), fui(discard_y),
+                                 fui(guardband_x), fui(discard_x));
+      gfx12_opt_set_context_reg(R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
+                                SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
+                                pa_su_hardware_screen_offset);
+      gfx12_end_context_regs();
+      radeon_end(); /* don't track context rolls on GFX12 */
+   } else if (sctx->screen->info.has_set_context_pairs_packed) {
       radeon_begin(&sctx->gfx_cs);
       gfx11_begin_packed_context_regs();
       gfx11_opt_set_context_reg(R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL,

@@ -489,7 +521,7 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo
       si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 }

-static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_state *state)
+static void gfx6_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_state *state)
 {
    struct radeon_cmdbuf *cs = &ctx->gfx_cs;

@@ -503,7 +535,7 @@ static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_st
    radeon_end();
 }

-static void si_emit_viewports(struct si_context *ctx)
+static void gfx6_emit_viewports(struct si_context *ctx)
 {
    struct radeon_cmdbuf *cs = &ctx->gfx_cs;
    struct pipe_viewport_state *states = ctx->viewports.states;

@@ -514,7 +546,7 @@ static void si_emit_viewports(struct si_context *ctx)
       radeon_set_context_reg_seq(R_02843C_PA_CL_VPORT_XSCALE, 6);
       radeon_end();

-      si_emit_one_viewport(ctx, &states[0]);
+      gfx6_emit_one_viewport(ctx, &states[0]);
       return;
    }

@@ -526,7 +558,7 @@ static void si_emit_viewports(struct si_context *ctx)
    radeon_end();

    for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++)
-      si_emit_one_viewport(ctx, &states[i]);
+      gfx6_emit_one_viewport(ctx, &states[i]);
 }

 static inline void si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,

@@ -540,7 +572,7 @@ static inline void si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, b
    util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
 }

-static void si_emit_depth_ranges(struct si_context *ctx)
+static void gfx6_emit_depth_ranges(struct si_context *ctx)
 {
    struct radeon_cmdbuf *cs = &ctx->gfx_cs;
    struct pipe_viewport_state *states = ctx->viewports.states;
@@ -573,10 +605,57 @@ static void si_emit_depth_ranges(struct si_context *ctx)
    radeon_end();
 }

-static void si_emit_viewport_states(struct si_context *ctx, unsigned index)
+static void gfx6_emit_viewport_states(struct si_context *ctx, unsigned index)
 {
-   si_emit_viewports(ctx);
-   si_emit_depth_ranges(ctx);
+   gfx6_emit_viewports(ctx);
+   gfx6_emit_depth_ranges(ctx);
+}
+
+static void gfx12_emit_viewport_states(struct si_context *ctx, unsigned index)
+{
+   struct radeon_cmdbuf *cs = &ctx->gfx_cs;
+   struct pipe_viewport_state *states = ctx->viewports.states;
+   bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
+   bool window_space = ctx->vs_disables_clipping_viewport;
+   float zmin, zmax;
+
+   /* The simple case: Only 1 viewport is active. */
+   if (!ctx->vs_writes_viewport_index) {
+      si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax);
+
+      radeon_begin(cs);
+      radeon_set_context_reg_seq(R_02843C_PA_CL_VPORT_XSCALE, 8);
+      radeon_emit(fui(states[0].scale[0]));
+      radeon_emit(fui(states[0].translate[0]));
+      radeon_emit(fui(states[0].scale[1]));
+      radeon_emit(fui(states[0].translate[1]));
+      radeon_emit(fui(states[0].scale[2]));
+      radeon_emit(fui(states[0].translate[2]));
+      radeon_emit(fui(zmin));
+      radeon_emit(fui(zmax));
+      radeon_end();
+      return;
+   }
+
+   /* All registers in the array need to be updated if any of them is changed.
+    * This is (or was) a hardware requirement.
+    */
+   radeon_begin(cs);
+   radeon_set_context_reg_seq(R_02843C_PA_CL_VPORT_XSCALE, SI_MAX_VIEWPORTS * 8);
+
+   for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
+      si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax);
+
+      radeon_emit(fui(states[i].scale[0]));
+      radeon_emit(fui(states[i].translate[0]));
+      radeon_emit(fui(states[i].scale[1]));
+      radeon_emit(fui(states[i].translate[1]));
+      radeon_emit(fui(states[i].scale[2]));
+      radeon_emit(fui(states[i].translate[2]));
+      radeon_emit(fui(zmin));
+      radeon_emit(fui(zmax));
+   }
+   radeon_end();
 }

 /**
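The emit loop above implies the Gfx12 per-viewport register block: 8 consecutive dwords starting at PA_CL_VPORT_XSCALE, with the depth range folded in right after the transform, whereas on older parts zmin/zmax live in a separate register array written by gfx6_emit_depth_ranges(). Written out as a struct (field names are illustrative; the dword order matches the radeon_emit() sequence):

#include <stdint.h>

/* One Gfx12 viewport = 8 consecutive dwords from PA_CL_VPORT_XSCALE on,
 * matching the radeon_emit() order above. All values are float bit patterns,
 * hence fui() in the emit path. */
struct gfx12_viewport_block {
   uint32_t xscale, xoffset;  /* scale[0], translate[0] */
   uint32_t yscale, yoffset;  /* scale[1], translate[1] */
   uint32_t zscale, zoffset;  /* scale[2], translate[2] */
   uint32_t zmin, zmax;       /* depth range, no longer a separate array */
};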
@@ -661,21 +740,44 @@ static void si_emit_window_rectangles(struct si_context *sctx, unsigned index)
    else
       rule = outside[num_rectangles - 1];
 
+   if (sctx->gfx_level >= GFX12) {
+      radeon_begin(cs);
+      gfx12_begin_context_regs();
+      gfx12_opt_set_context_reg(R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE, rule);
+
+      if (num_rectangles) {
+         radeon_set_context_reg_seq(R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2);
+         for (unsigned i = 0; i < num_rectangles; i++) {
+            gfx12_set_context_reg(R_028210_PA_SC_CLIPRECT_0_TL + i * 8,
+                                  S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny));
+            gfx12_set_context_reg(R_028214_PA_SC_CLIPRECT_0_BR + i * 8,
+                                  S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy));
+         }
+
+         for (unsigned i = 0; i < num_rectangles; i++) {
+            gfx12_set_context_reg(R_028374_PA_SC_CLIPRECT_0_EXT + i * 4,
+                                  S_028374_TL_X_EXT(rects[i].minx >> 15) |
+                                  S_028374_TL_Y_EXT(rects[i].miny >> 15) |
+                                  S_028374_BR_X_EXT(rects[i].maxx >> 15) |
+                                  S_028374_BR_Y_EXT(rects[i].maxy >> 15));
+         }
+      }
+      gfx12_end_context_regs();
+      radeon_end();
+   } else {
       radeon_begin(cs);
       radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE,
                                  rule);
-      if (num_rectangles == 0) {
-         radeon_end();
-         return;
-      }
-
+      if (num_rectangles) {
          radeon_set_context_reg_seq(R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2);
          for (unsigned i = 0; i < num_rectangles; i++) {
             radeon_emit(S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny));
             radeon_emit(S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy));
          }
+      }
       radeon_end();
    }
+}
 
 static void si_set_window_rectangles(struct pipe_context *ctx, bool include,
                                      unsigned num_rectangles,
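A note on the gfx12 path above (an illustration, not from the commit): the TL/BR cliprect fields are assumed to hold only the low 15 bits of each coordinate, with the bits shifted out by >> 15 landing in the new PA_SC_CLIPRECT_*_EXT registers. A hedged sketch of that split:

#include <stdint.h>

/* Sketch only: how one coordinate is assumed to split between a base
 * cliprect field and its _EXT companion on gfx12, matching the >> 15
 * shifts in the code above. Field width is an assumption. */
static inline void split_cliprect_coord(uint32_t x, uint32_t *lo, uint32_t *ext)
{
   *lo  = x & 0x7fff; /* low 15 bits -> S_028210_TL_X / S_028214_BR_X */
   *ext = x >> 15;    /* remaining high bits -> S_028374_*_EXT field */
}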
@@ -696,7 +798,10 @@ void si_init_viewport_functions(struct si_context *ctx)
 {
    ctx->atoms.s.guardband.emit = si_emit_guardband;
    ctx->atoms.s.scissors.emit = si_emit_scissors;
-   ctx->atoms.s.viewports.emit = si_emit_viewport_states;
+   if (ctx->gfx_level >= GFX12)
+      ctx->atoms.s.viewports.emit = gfx12_emit_viewport_states;
+   else
+      ctx->atoms.s.viewports.emit = gfx6_emit_viewport_states;
    ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
    ctx->atoms.s.ngg_cull_state.emit = si_emit_cull_state;
@@ -214,6 +214,18 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac
       assert(util_is_power_of_two_or_zero(bpe));
    }
 
+   if (sscreen->info.gfx_level >= GFX12) {
+      if (!is_flushed_depth && is_depth) {
+         flags |= RADEON_SURF_ZBUFFER;
+         if (is_stencil)
+            flags |= RADEON_SURF_SBUFFER;
+
+         if (sscreen->debug_flags & DBG(NO_HYPERZ) ||
+             ptex->flags & PIPE_RESOURCE_FLAG_SPARSE)
+            flags |= RADEON_SURF_NO_HTILE;
+      }
+   } else {
+      /* Gfx6-11 */
       if (!is_flushed_depth && is_depth) {
          flags |= RADEON_SURF_ZBUFFER;
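To make the new gfx12 branch concrete, here is a worked example under assumed inputs (not part of the diff): a regular depth-stencil texture with no debug overrides picks up both buffer flags and stays HTILE-eligible.

/* Illustration only: flags produced by the gfx12 branch above for a
 * depth-stencil texture with is_flushed_depth == false and neither
 * DBG(NO_HYPERZ) nor PIPE_RESOURCE_FLAG_SPARSE set. */
uint64_t depth_flags = 0;
depth_flags |= RADEON_SURF_ZBUFFER;  /* is_depth */
depth_flags |= RADEON_SURF_SBUFFER;  /* is_stencil */
/* RADEON_SURF_NO_HTILE is not added, since neither condition holds. */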
@@ -326,18 +338,6 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac
       }
    }
 
-   if (is_scanout) {
-      /* This should catch bugs in gallium users setting incorrect flags. */
-      assert(ptex->nr_samples <= 1 && ptex->depth0 == 1 &&
-             ptex->last_level == 0 && !(flags & RADEON_SURF_Z_OR_SBUFFER));
-
-      flags |= RADEON_SURF_SCANOUT;
-   }
-
-   if (ptex->bind & PIPE_BIND_SHARED)
-      flags |= RADEON_SURF_SHAREABLE;
-   if (is_imported)
-      flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
    if (sscreen->debug_flags & DBG(NO_FMASK))
       flags |= RADEON_SURF_NO_FMASK;
@@ -359,12 +359,28 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac
       }
 
       if (ptex->flags & PIPE_RESOURCE_FLAG_SPARSE) {
-         flags |=
-            RADEON_SURF_PRT |
-            RADEON_SURF_NO_FMASK |
+         flags |= RADEON_SURF_NO_FMASK |
                   RADEON_SURF_NO_HTILE |
                   RADEON_SURF_DISABLE_DCC;
       }
+   }
+
+   if (is_scanout) {
+      /* This should catch bugs in gallium users setting incorrect flags. */
+      assert(ptex->nr_samples <= 1 && ptex->depth0 == 1 &&
+             ptex->last_level == 0 && !(flags & RADEON_SURF_Z_OR_SBUFFER));
+
+      flags |= RADEON_SURF_SCANOUT;
+   }
+
+   if (ptex->bind & PIPE_BIND_SHARED)
+      flags |= RADEON_SURF_SHAREABLE;
+
+   if (is_imported)
+      flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
+
+   if (ptex->flags & PIPE_RESOURCE_FLAG_SPARSE)
+      flags |= RADEON_SURF_PRT;
 
    surface->modifier = modifier;
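Net effect of the reordering above (an illustration, not diff content): the scanout, shared, imported, and PRT handling now sits after the per-generation branch, so it applies to gfx12 as well, and a sparse gfx6-11 texture accumulates flags from both places:

/* Illustration only: combined flags for a PIPE_RESOURCE_FLAG_SPARSE texture
 * on gfx6-11 after this change. RADEON_SURF_PRT now comes from the common,
 * generation-independent block at the end. */
uint64_t sparse_flags = RADEON_SURF_NO_FMASK |    /* gfx6-11 branch */
                        RADEON_SURF_NO_HTILE |
                        RADEON_SURF_DISABLE_DCC |
                        RADEON_SURF_PRT;          /* common block */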
@@ -1064,6 +1080,26 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen,
       u_log_context_destroy(&log);
    }
 
+   if (sscreen->info.gfx_level >= GFX12) {
+      if (tex->is_depth) {
+         /* Z24 is no longer supported. We should use Z32_FLOAT instead. */
+         if (base->format == PIPE_FORMAT_Z16_UNORM) {
+            tex->db_render_format = base->format;
+         } else {
+            tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+            tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
+                                  base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
+         }
+
+         tex->db_compatible = true;
+         tex->can_sample_z = true;
+         tex->can_sample_s = true;
+      }
+
+      return tex;
+   }
+
+   /* Everything below is for GFX6-11. */
+
    /* Use 1.0 as the default clear value to get optimal ZRANGE_PRECISION if we don't
     * get a fast clear.
     */
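The depth-format handling added above can be summarized as a small mapping. This is a sketch with a hypothetical helper name, not a function that exists in the tree:

/* Sketch only: gfx12 db_render_format selection as implemented above.
 * Z16 is kept; every other depth format renders as Z32_FLOAT, and
 * upgraded is set when that was an actual up-conversion. */
static enum pipe_format gfx12_db_render_format(enum pipe_format f, bool *upgraded)
{
   if (f == PIPE_FORMAT_Z16_UNORM) {
      *upgraded = false;
      return f;
   }
   *upgraded = f != PIPE_FORMAT_Z32_FLOAT && f != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
   return PIPE_FORMAT_Z32_FLOAT;
}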
@@ -1326,7 +1362,7 @@ si_texture_create_with_modifier(struct pipe_screen *screen,
    bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH ||
                            templ->flags & SI_RESOURCE_FLAG_FORCE_LINEAR;
    bool tc_compatible_htile =
-      sscreen->info.gfx_level >= GFX8 &&
+      sscreen->info.gfx_level >= GFX8 && sscreen->info.gfx_level < GFX12 &&
       /* There are issues with TC-compatible HTILE on Tonga (and
        * Iceland is the same design), and documented bug workarounds
        * don't help. For example, this fails:
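The one-line change above bounds TC-compatible HTILE to the generations that actually have HTILE. Read in isolation, with the surrounding clauses of the condition elided:

/* Illustration only: the gfx-level window for TC-compatible HTILE after
 * this change; gfx12 surfaces have no HTILE, so the path is skipped there. */
bool tc_compatible_htile_possible =
   sscreen->info.gfx_level >= GFX8 &&   /* pre-GFX8 lacks TC-compatible HTILE */
   sscreen->info.gfx_level < GFX12;     /* new upper bound added here */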