diff --git a/src/gallium/drivers/radeonsi/gfx11_query.c b/src/gallium/drivers/radeonsi/gfx11_query.c index 792afdc53a7..722f91d36a6 100644 --- a/src/gallium/drivers/radeonsi/gfx11_query.c +++ b/src/gallium/drivers/radeonsi/gfx11_query.c @@ -345,6 +345,10 @@ static void gfx11_sh_query_get_result_resource(struct si_context *sctx, struct s grid.grid[1] = 1; grid.grid[2] = 1; + /* TODO: Range-invalidate GL2 */ + if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) + sctx->flags |= SI_CONTEXT_INV_L2; + struct gfx11_sh_query_buffer *qbuf = query->first; for (;;) { unsigned begin = qbuf == query->first ? query->first_begin : 0; diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index ed76ef822b1..f38e6f1ac75 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -144,7 +144,7 @@ if with_llvm endif radeonsi_gfx_libs = [] -foreach ver : ['6', '7', '8', '9', '10', '103', '11', '115'] +foreach ver : ['6', '7', '8', '9', '10', '103', '11', '115', '12'] radeonsi_gfx_libs += static_library( 'radeonsi_gfx@0@'.format(ver), ['si_state_draw.cpp'], diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 946228b6d9c..cbe2e86cc63 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -193,6 +193,8 @@ static void si_blit_decompress_zs_planes_in_place(struct si_context *sctx, unsigned layer, max_layer, checked_last_layer; unsigned fully_decompressed_mask = 0; + assert(sctx->gfx_level < GFX12); + if (!level_mask) return; @@ -287,6 +289,8 @@ static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex, unsigned levels_z = 0; unsigned levels_s = 0; + assert(sctx->gfx_level < GFX12); + if (required_planes & PIPE_MASK_Z) { levels_z = level_mask & tex->dirty_level_mask; @@ -401,6 +405,8 @@ static bool si_decompress_sampler_depth_textures(struct si_context *sctx, unsigned mask = textures->needs_depth_decompress_mask; bool need_flush = false; + assert(sctx->gfx_level < GFX12); + while (mask) { struct pipe_sampler_view *view; struct si_sampler_view *sview; @@ -437,6 +443,9 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture unsigned layer, checked_last_layer, max_layer; unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1); + /* No decompression is ever needed on Gfx12. 
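Gfx12 compression is transparent to the hw consumers, so there is nothing for the driver to decompress. 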
*/ + assert(sctx->gfx_level < GFX12); + if (!need_dcc_decompress) level_mask &= tex->dirty_level_mask; if (!level_mask) @@ -624,6 +633,8 @@ static void si_check_render_feedback_texture(struct si_context *sctx, struct si_ { bool render_feedback = false; + assert(sctx->gfx_level < GFX12); + if (!vi_dcc_enabled(tex, first_level)) return; @@ -652,6 +663,8 @@ static void si_check_render_feedback_textures(struct si_context *sctx, struct si { uint32_t mask = textures->enabled_mask & in_use_mask; + assert(sctx->gfx_level < GFX12); + while (mask) { const struct pipe_sampler_view *view; struct si_texture *tex; @@ -674,6 +687,8 @@ static void si_check_render_feedback_images(struct si_context *sctx, struct si_i { uint32_t mask = images->enabled_mask & in_use_mask; + assert(sctx->gfx_level < GFX12); + while (mask) { const struct pipe_image_view *view; struct si_texture *tex; @@ -693,6 +708,8 @@ static void si_check_render_feedback_images(struct si_context *sctx, struct si_i static void si_check_render_feedback_resident_textures(struct si_context *sctx) { + assert(sctx->gfx_level < GFX12); + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { struct pipe_sampler_view *view; struct si_texture *tex; @@ -710,6 +727,8 @@ static void si_check_render_feedback_resident_textures(struct si_context *sctx) static void si_check_render_feedback_resident_images(struct si_context *sctx) { + assert(sctx->gfx_level < GFX12); + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { struct pipe_image_view *view; struct si_texture *tex; @@ -727,6 +746,8 @@ static void si_check_render_feedback_resident_images(struct si_context *sctx) static void si_check_render_feedback(struct si_context *sctx) { + assert(sctx->gfx_level < GFX12); + if (!sctx->need_check_render_feedback) return; @@ -896,6 +917,9 @@ void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *t struct si_context *sctx = (struct si_context *)ctx; struct si_texture *stex = (struct si_texture *)tex; + if (sctx->gfx_level >= GFX12) + return; + if (stex->db_compatible) { planes &= PIPE_MASK_Z | PIPE_MASK_S; @@ -1324,7 +1348,7 @@ static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *re struct si_context *sctx = (struct si_context *)ctx; struct si_texture *tex = (struct si_texture *)res; - if (res->target == PIPE_BUFFER) + if (sctx->gfx_level >= GFX12 || res->target == PIPE_BUFFER) return; if (!tex->is_depth && (tex->cmask_buffer || vi_dcc_enabled(tex, 0))) { @@ -1340,6 +1364,8 @@ static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *re void si_flush_implicit_resources(struct si_context *sctx) { + assert(sctx->gfx_level < GFX12); + hash_table_foreach(sctx->dirty_implicit_resources, entry) { si_flush_resource(&sctx->b, entry->data); pipe_resource_reference((struct pipe_resource **)&entry->data, NULL); @@ -1349,6 +1375,7 @@ void si_flush_implicit_resources(struct si_context *sctx) void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex) { + assert(sctx->gfx_level < GFX12); assert(!tex->is_depth); /* If graphics is disabled, we can't decompress DCC, but it shouldn't diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 56334250ef1..f0261ca816f 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -397,8 +397,106 @@ } \ } while (0) +/* GFX12 generic packet building helpers for PAIRS packets. 
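gfx12_begin_regs() reserves one dword for the packet header, each set_reg() then emits a (register offset, value) pair, and gfx12_end_regs() patches the reserved dword with the final PKT3 header, or backs the reservation off if no register was written. 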
Don't use these directly. */ +#define gfx12_begin_regs(header) unsigned header = __cs_num++ + +#define gfx12_set_reg(reg, value, base_offset) do { \ + radeon_emit(((reg) - (base_offset)) >> 2); \ + radeon_emit(value); \ +} while (0) + +#define gfx12_opt_set_reg(reg, reg_enum, value, base_offset) do { \ + unsigned __value = value; \ + if (!BITSET_TEST(sctx->tracked_regs.reg_saved_mask, (reg_enum)) || \ + sctx->tracked_regs.reg_value[reg_enum] != __value) { \ + gfx12_set_reg(reg, __value, base_offset); \ + BITSET_SET(sctx->tracked_regs.reg_saved_mask, (reg_enum)); \ + sctx->tracked_regs.reg_value[reg_enum] = __value; \ + } \ +} while (0) + +#define gfx12_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, base_offset) do { \ + unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4); \ + if (!BITSET_TEST_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 3, 0xf) || \ + sctx->tracked_regs.reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] != __v3 || \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] != __v4) { \ + gfx12_set_reg((reg), __v1, (base_offset)); \ + gfx12_set_reg((reg) + 4, __v2, (base_offset)); \ + gfx12_set_reg((reg) + 8, __v3, (base_offset)); \ + gfx12_set_reg((reg) + 12, __v4, (base_offset)); \ + BITSET_SET_RANGE_INSIDE_WORD(sctx->tracked_regs.reg_saved_mask, \ + (reg_enum), (reg_enum) + 3); \ + sctx->tracked_regs.reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.reg_value[(reg_enum) + 2] = __v3; \ + sctx->tracked_regs.reg_value[(reg_enum) + 3] = __v4; \ + } \ +} while (0) + +#define gfx12_end_regs(header, packet) do { \ + if ((header) + 1 == __cs_num) { \ + __cs_num--; /* no registers have been set, back off */ \ + } else { \ + unsigned __dw_count = __cs_num - (header) - 2; \ + __cs_buf[(header)] = PKT3((packet), __dw_count, 0) | PKT3_RESET_FILTER_CAM_S(1); \ + } \ +} while (0) + +/* GFX12 generic packet building helpers for buffered registers. Don't use these directly. */ +#define gfx12_push_reg(reg, value, base_offset, type) do { \ + unsigned __i = sctx->num_buffered_##type##_regs++; \ + assert(__i < ARRAY_SIZE(sctx->gfx12.buffered_##type##_regs)); \ + sctx->gfx12.buffered_##type##_regs[__i].reg_offset = ((reg) - (base_offset)) >> 2; \ + sctx->gfx12.buffered_##type##_regs[__i].reg_value = value; \ +} while (0) + +#define gfx12_opt_push_reg(reg, reg_enum, value, type) do { \ + unsigned __value = value; \ + unsigned __reg_enum = reg_enum; \ + if (!BITSET_TEST(sctx->tracked_regs.reg_saved_mask, (reg_enum)) || \ + sctx->tracked_regs.reg_value[__reg_enum] != __value) { \ + gfx12_push_##type##_reg(reg, __value); \ + BITSET_SET(sctx->tracked_regs.reg_saved_mask, (reg_enum)); \ + sctx->tracked_regs.reg_value[__reg_enum] = __value; \ + } \ +} while (0) + +/* GFX12 packet building helpers for PAIRS packets. 
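A typical emit sequence, with an illustrative register: gfx12_begin_context_regs(); gfx12_opt_set_context_reg(R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, value); gfx12_end_context_regs(); 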
*/ +#define gfx12_begin_context_regs() \ + gfx12_begin_regs(__cs_context_reg_header) + +#define gfx12_set_context_reg(reg, value) \ + gfx12_set_reg(reg, value, SI_CONTEXT_REG_OFFSET) + +#define gfx12_opt_set_context_reg(reg, reg_enum, value) \ + gfx12_opt_set_reg(reg, reg_enum, value, SI_CONTEXT_REG_OFFSET) + +#define gfx12_opt_set_context_reg4(reg, reg_enum, v1, v2, v3, v4) \ + gfx12_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT_REG_OFFSET) + +#define gfx12_end_context_regs() \ + gfx12_end_regs(__cs_context_reg_header, PKT3_SET_CONTEXT_REG_PAIRS) + +/* GFX12 packet building helpers for buffered registers. */ +#define gfx12_push_gfx_sh_reg(reg, value) \ + gfx12_push_reg(reg, value, SI_SH_REG_OFFSET, gfx_sh) + +#define gfx12_push_compute_sh_reg(reg, value) \ + gfx12_push_reg(reg, value, SI_SH_REG_OFFSET, compute_sh) + +#define gfx12_opt_push_gfx_sh_reg(reg, reg_enum, value) \ + gfx12_opt_push_reg(reg, reg_enum, value, gfx_sh) + +#define gfx12_opt_push_compute_sh_reg(reg, reg_enum, value) \ + gfx12_opt_push_reg(reg, reg_enum, value, compute_sh) + #define radeon_set_or_push_gfx_sh_reg(reg, value) do { \ - if (GFX_VERSION >= GFX11 && HAS_SH_PAIRS_PACKED) { \ + if (GFX_VERSION >= GFX12) { \ + gfx12_push_gfx_sh_reg(reg, value); \ + } else if (GFX_VERSION >= GFX11 && HAS_SH_PAIRS_PACKED) { \ gfx11_push_gfx_sh_reg(reg, value); \ } else { \ radeon_set_sh_reg_seq(reg, 1); \ diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index f8e3ef12b53..40c953e4f28 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -707,6 +707,8 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers, unsigned clear_types = 0; unsigned num_pixels = fb->width * fb->height; + assert(sctx->gfx_level < GFX12); + /* This function is broken in BE, so just disable this path for now */ #if UTIL_ARCH_BIG_ENDIAN return; @@ -1149,9 +1151,9 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers, si_execute_clears(sctx, info, num_clears, clear_types, sctx->render_cond_enabled); } -static void si_clear(struct pipe_context *ctx, unsigned buffers, - const struct pipe_scissor_state *scissor_state, - const union pipe_color_union *color, double depth, unsigned stencil) +static void gfx6_clear(struct pipe_context *ctx, unsigned buffers, + const struct pipe_scissor_state *scissor_state, + const union pipe_color_union *color, double depth, unsigned stencil) { struct si_context *sctx = (struct si_context *)ctx; struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; @@ -1271,6 +1273,44 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, } } +static void gfx12_clear(struct pipe_context *ctx, unsigned buffers, + const struct pipe_scissor_state *scissor_state, + const union pipe_color_union *color, double depth, unsigned stencil) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; + struct pipe_surface *zsbuf = fb->zsbuf; + struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL; + + /* Unset clear flags for non-existent buffers. 
*/ + for (unsigned i = 0; i < 8; i++) { + if (i >= fb->nr_cbufs || !fb->cbufs[i]) + buffers &= ~(PIPE_CLEAR_COLOR0 << i); + } + if (!zsbuf) + buffers &= ~PIPE_CLEAR_DEPTHSTENCIL; + else if (!util_format_has_stencil(util_format_description(zsbuf->format))) + buffers &= ~PIPE_CLEAR_STENCIL; + + if (unlikely(sctx->sqtt_enabled)) { + if (buffers & PIPE_CLEAR_COLOR) + sctx->sqtt_next_event = EventCmdClearColorImage; + else if (buffers & PIPE_CLEAR_DEPTHSTENCIL) + sctx->sqtt_next_event = EventCmdClearDepthStencilImage; + } + + si_blitter_begin(sctx, SI_CLEAR); + util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb), + buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1); + si_blitter_end(sctx); + + /* This is only used by the driver, not the hw. */ + if (buffers & PIPE_CLEAR_DEPTH) { + zstex->depth_cleared_level_mask |= BITFIELD_BIT(zsbuf->u.tex.level); + zstex->depth_clear_value[zsbuf->u.tex.level] = depth; + } +} + static bool si_try_normal_clear(struct si_context *sctx, struct pipe_surface *dst, unsigned dstx, unsigned dsty, unsigned width, unsigned height, bool render_condition_enabled, unsigned buffers, @@ -1459,7 +1499,11 @@ void si_init_clear_functions(struct si_context *sctx) sctx->b.clear_texture = u_default_clear_texture; if (sctx->has_graphics) { - sctx->b.clear = si_clear; + if (sctx->gfx_level >= GFX12) + sctx->b.clear = gfx12_clear; + else + sctx->b.clear = gfx6_clear; + sctx->b.clear_depth_stencil = si_clear_depth_stencil; } } diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index b855269ba1b..405a602369b 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -178,7 +178,7 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) / ((shader->wave_size == 32 || sscreen->info.wave64_vgpr_alloc_granularity == 8) ? 
8 : 4)) | - S_00B848_DX10_CLAMP(1) | + S_00B848_DX10_CLAMP(sscreen->info.gfx_level < GFX12) | S_00B848_MEM_ORDERED(si_shader_mem_ordered(shader)) | S_00B848_FLOAT_MODE(shader->config.float_mode); @@ -499,15 +499,36 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->bo, RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY); - if (sctx->screen->info.has_set_sh_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + unsigned rsrc3 = S_00B8A0_INST_PREF_SIZE_GFX12(si_get_shader_prefetch_size(shader)); + + gfx12_push_compute_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8); + gfx12_opt_push_compute_sh_reg(R_00B848_COMPUTE_PGM_RSRC1, + SI_TRACKED_COMPUTE_PGM_RSRC1, config->rsrc1); + gfx12_opt_push_compute_sh_reg(R_00B84C_COMPUTE_PGM_RSRC2, + SI_TRACKED_COMPUTE_PGM_RSRC2, rsrc2); + gfx12_opt_push_compute_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3, + SI_TRACKED_COMPUTE_PGM_RSRC3, rsrc3); + gfx12_opt_push_compute_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, + SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size); + if (shader->scratch_bo) { + gfx12_opt_push_compute_sh_reg(R_00B840_COMPUTE_DISPATCH_SCRATCH_BASE_LO, + SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO, + sctx->compute_scratch_buffer->gpu_address >> 8); + gfx12_opt_push_compute_sh_reg(R_00B844_COMPUTE_DISPATCH_SCRATCH_BASE_HI, + SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_HI, + sctx->compute_scratch_buffer->gpu_address >> 40); + } + } else if (sctx->screen->info.has_set_sh_pairs_packed) { + unsigned rsrc3 = S_00B8A0_INST_PREF_SIZE_GFX11(si_get_shader_prefetch_size(shader)); + gfx11_push_compute_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8); gfx11_opt_push_compute_sh_reg(R_00B848_COMPUTE_PGM_RSRC1, SI_TRACKED_COMPUTE_PGM_RSRC1, config->rsrc1); gfx11_opt_push_compute_sh_reg(R_00B84C_COMPUTE_PGM_RSRC2, SI_TRACKED_COMPUTE_PGM_RSRC2, rsrc2); gfx11_opt_push_compute_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3, - SI_TRACKED_COMPUTE_PGM_RSRC3, - S_00B8A0_INST_PREF_SIZE_GFX11(si_get_shader_prefetch_size(shader))); + SI_TRACKED_COMPUTE_PGM_RSRC3, rsrc3); gfx11_opt_push_compute_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size); if (shader->scratch_bo) { @@ -735,7 +756,24 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr } } - if (sctx->screen->info.has_set_sh_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + if (sel->info.uses_grid_size && !info->indirect) { + gfx12_push_compute_sh_reg(grid_size_reg, info->grid[0]); + gfx12_push_compute_sh_reg(grid_size_reg + 4, info->grid[1]); + gfx12_push_compute_sh_reg(grid_size_reg + 8, info->grid[2]); + } + + if (sel->info.uses_variable_block_size) { + uint32_t value = info->block[0] | (info->block[1] << 10) | (info->block[2] << 20); + gfx12_push_compute_sh_reg(block_size_reg, value); + } + + if (sel->info.base.cs.user_data_components_amd) { + unsigned num = sel->info.base.cs.user_data_components_amd; + for (unsigned i = 0; i < num; i++) + gfx12_push_compute_sh_reg(cs_user_data_reg + i * 4, sctx->cs_user_data[i]); + } + } else if (sctx->screen->info.has_set_sh_pairs_packed) { if (sel->info.uses_grid_size && !info->indirect) { gfx11_push_compute_sh_reg(grid_size_reg, info->grid[0]); gfx11_push_compute_sh_reg(grid_size_reg + 4, info->grid[1]); @@ -776,6 +814,76 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr } } +static bool si_get_2d_interleave_size(const struct pipe_grid_info *info, + unsigned *log_x, unsigned *log_y) +{ + /* The following code produces this behavior: + 
* + * WG size | WG block/SE | Thread block/SE + * ( 1, 32) = 32 | (16, 1) = 16 | ( 16, 32) = 512 + * ( 2, 16) = 32 | ( 8, 2) = 16 | ( 16, 32) = 512 + * ( 2, 32) = 64 | (16, 1) = 16 | ( 32, 32) = 1024 + * ( 4, 8) = 32 | ( 4, 4) = 16 | ( 16, 32) = 512 + * ( 4, 16) = 64 | ( 8, 2) = 16 | ( 32, 32) = 1024 + * ( 4, 32) = 128 | ( 8, 1) = 8 | ( 32, 32) = 1024 + * ( 8, 4) = 32 | ( 2, 8) = 16 | ( 16, 32) = 512 + * ( 8, 8) = 64 | ( 4, 4) = 16 | ( 32, 32) = 1024 + * ( 8, 16) = 128 | ( 4, 2) = 8 | ( 32, 32) = 1024 + * ( 8, 32) = 256 | ( 4, 1) = 4 | ( 32, 32) = 1024 + * (16, 2) = 32 | ( 1, 16) = 16 | ( 16, 32) = 512 + * (16, 4) = 64 | ( 2, 8) = 16 | ( 32, 32) = 1024 + * (16, 8) = 128 | ( 2, 4) = 8 | ( 32, 32) = 1024 + * (16, 16) = 256 | ( 2, 2) = 4 | ( 32, 32) = 1024 + * (16, 32) = 512 | ( 2, 1) = 2 | ( 32, 32) = 1024 + * (32, 1) = 32 | ( 1, 16) = 16 | ( 32, 16) = 512 + * (32, 2) = 64 | ( 1, 16) = 16 | ( 32, 32) = 1024 + * (32, 4) = 128 | ( 1, 8) = 8 | ( 32, 32) = 1024 + * (32, 8) = 256 | ( 1, 4) = 4 | ( 32, 32) = 1024 + * (32, 16) = 512 | ( 1, 2) = 2 | ( 32, 32) = 1024 + * + * For 3D workgroups, the total 2D thread count is divided by Z. + * Example with Z=8, showing only a 2D slice of the grid: + * + * WG size | WG block/SE | Thread block/SE + * ( 1, 32) = 32 | ( 4, 1) = 4 | ( 4, 32) = 128 + * ( 2, 16) = 32 | ( 4, 1) = 4 | ( 8, 16) = 128 + * ( 2, 32) = 64 | ( 2, 1) = 2 | ( 4, 32) = 128 + * ( 4, 8) = 32 | ( 2, 2) = 4 | ( 8, 16) = 128 + * ( 4, 16) = 64 | ( 2, 1) = 2 | ( 8, 16) = 128 + * ( 8, 4) = 32 | ( 1, 4) = 4 | ( 8, 16) = 128 + * ( 8, 8) = 64 | ( 1, 2) = 2 | ( 8, 16) = 128 + * (16, 2) = 32 | ( 1, 4) = 4 | ( 16, 8) = 128 + * (16, 4) = 64 | ( 1, 2) = 2 | ( 16, 8) = 128 + * (32, 1) = 32 | ( 1, 4) = 4 | ( 32, 4) = 128 + * (32, 2) = 64 | ( 1, 2) = 2 | ( 32, 4) = 128 + * + * It tries to find a WG block size that corresponds to (N, N) or (N, 2*N) threads, + * but it's limited by the maximum WGs/SE, which is 16, and the number of threads/SE, + * which we set to 1024. 
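+    * For example, the (8, 8) row above: 64 threads/WG -> MIN2(1024 / 64, 16) = 16 WGs/SE,
+    * so (log_x, log_y) starts at (4, 0) and the balancing loop below steps
+    * (4,0) -> (3,1) -> (2,2), stopping when 8 << 2 == 8 << 2, which yields the
+    * (4, 4) WG block and (32, 32) = 1024 thread block shown in the table.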
+ */ + unsigned max_threads_per_se = 1024; + unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2]; + unsigned workgroups_per_se = MIN2(max_threads_per_se / threads_per_threadgroup, 16); + unsigned log_workgroups_per_se = util_logbase2(workgroups_per_se); + + if (!log_workgroups_per_se) + return false; + + assert(log_workgroups_per_se <= 4); + + *log_x = MIN2(log_workgroups_per_se, 4); + *log_y = log_workgroups_per_se - *log_x; + + while (*log_x > 0 && *log_y < 4 && + info->block[0] * (1 << *log_x) > info->block[1] * (1 << *log_y)) { + (*log_x)--; + (*log_y)++; + } + + assert(*log_x + *log_y <= 4); + return true; +} + static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info) { struct si_screen *sscreen = sctx->screen; @@ -807,7 +915,11 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ sctx->cs_max_waves_per_sh, threadgroups_per_cu); - if (sctx->screen->info.has_set_sh_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + gfx12_opt_push_compute_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS, + SI_TRACKED_COMPUTE_RESOURCE_LIMITS, + compute_resource_limits); + } else if (sctx->screen->info.has_set_sh_pairs_packed) { gfx11_opt_push_compute_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS, SI_TRACKED_COMPUTE_RESOURCE_LIMITS, compute_resource_limits); @@ -830,8 +942,13 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; uint32_t num_threads[3]; - num_threads[0] = S_00B81C_NUM_THREAD_FULL_GFX6(info->block[0]); - num_threads[1] = S_00B820_NUM_THREAD_FULL_GFX6(info->block[1]); + if (sctx->gfx_level >= GFX12) { + num_threads[0] = S_00B81C_NUM_THREAD_FULL_GFX12(info->block[0]); + num_threads[1] = S_00B820_NUM_THREAD_FULL_GFX12(info->block[1]); + } else { + num_threads[0] = S_00B81C_NUM_THREAD_FULL_GFX6(info->block[0]); + num_threads[1] = S_00B820_NUM_THREAD_FULL_GFX6(info->block[1]); + } num_threads[2] = S_00B824_NUM_THREAD_FULL(info->block[2]); if (partial_block_en) { @@ -849,7 +966,82 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); } - if (sctx->screen->info.has_set_sh_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + /* Set PING_PONG_EN for every other dispatch. + * Only allowed on a gfx queue, and PARTIAL_TG_EN and USE_THREAD_DIMENSIONS must be 0. + */ + if (sctx->has_graphics && !partial_block_en) { + dispatch_initiator |= S_00B800_PING_PONG_EN(sctx->compute_ping_pong_launch); + sctx->compute_ping_pong_launch ^= 1; + } + + /* Thread tiling within a workgroup. */ + switch (sctx->cs_shader_state.program->shader.selector->info.base.cs.derivative_group) { + case DERIVATIVE_GROUP_LINEAR: + break; + case DERIVATIVE_GROUP_QUADS: + num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(1); /* 2x2 */ + num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(1); + break; + case DERIVATIVE_GROUP_NONE: + /* These are the only legal combinations. 
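The cascade below picks the largest tile that evenly divides the workgroup size; each INTERLEAVE_BITS field is the log2 of the tile size in that dimension (1 = 2 threads, 2 = 4, 3 = 8). 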
*/
+         if (info->block[0] % 8 == 0 && info->block[1] % 8 == 0) {
+            num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(3); /* 8x8 */
+            num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(3);
+         } else if (info->block[0] % 4 == 0 && info->block[1] % 8 == 0) {
+            num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(2); /* 4x8 */
+            num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(3);
+         } else if (info->block[0] % 4 == 0 && info->block[1] % 4 == 0) {
+            num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(2); /* 4x4 */
+            num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(2);
+         } else if (info->block[0] % 2 == 0 && info->block[1] % 2 == 0) {
+            num_threads[0] |= S_00B81C_INTERLEAVE_BITS_X(1); /* 2x2 */
+            num_threads[1] |= S_00B820_INTERLEAVE_BITS_Y(1);
+         }
+         break;
+      }
+
+      /* How many threads should go to 1 SE before moving on to the next if INTERLEAVE_2D_EN == 0.
+       * Only these values are valid: 0 (disabled), 64, 128, 256, 512
+       * 64 = RT, 256 = non-RT (run benchmarks to be sure)
+       */
+      unsigned dispatch_interleave = S_00B8BC_INTERLEAVE_1D(256);
+      unsigned log_x, log_y;
+
+      /* Launch a 2D subgrid on each SE instead of a 1D subgrid. If enabled, INTERLEAVE_1D is
+       * ignored and each SE gets 1 subgrid up to a certain number of threads.
+       *
+       * Constraints:
+       * - Only supported by the gfx queue.
+       * - Max 16 workgroups per SE can be launched, max 4 in each dimension.
+       * - PARTIAL_TG_EN, USE_THREAD_DIMENSIONS, and ORDERED_APPEND_ENBL must be 0.
+       */
+      if (sctx->has_graphics && !partial_block_en &&
+          (info->indirect || info->grid[1] >= 4) && MIN2(info->block[0], info->block[1]) >= 4 &&
+          si_get_2d_interleave_size(info, &log_x, &log_y)) {
+         dispatch_interleave = S_00B8BC_INTERLEAVE_1D(1) | /* 1D is disabled */
+                               S_00B8BC_INTERLEAVE_2D_X_SIZE(log_x) |
+                               S_00B8BC_INTERLEAVE_2D_Y_SIZE(log_y);
+         dispatch_initiator |= S_00B800_INTERLEAVE_2D_EN(1);
+      }
+
+      if (sctx->has_graphics) {
+         radeon_opt_set_sh_reg_idx(sctx, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE,
+                                   SI_TRACKED_COMPUTE_DISPATCH_INTERLEAVE, 2, dispatch_interleave);
+      } else {
+         gfx12_opt_push_compute_sh_reg(R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE,
+                                       SI_TRACKED_COMPUTE_DISPATCH_INTERLEAVE, dispatch_interleave);
+      }
+   }
+
+   if (sctx->gfx_level >= GFX12) {
+      gfx12_opt_push_compute_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X,
+                                    SI_TRACKED_COMPUTE_NUM_THREAD_X, num_threads[0]);
+      gfx12_opt_push_compute_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y,
+                                    SI_TRACKED_COMPUTE_NUM_THREAD_Y, num_threads[1]);
+      gfx12_opt_push_compute_sh_reg(R_00B824_COMPUTE_NUM_THREAD_Z,
+                                    SI_TRACKED_COMPUTE_NUM_THREAD_Z, num_threads[2]);
+   } else if (sctx->screen->info.has_set_sh_pairs_packed) {
       gfx11_opt_push_compute_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X,
                                     SI_TRACKED_COMPUTE_NUM_THREAD_X, num_threads[0]);
       gfx11_opt_push_compute_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y,
@@ -862,7 +1054,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
                        num_threads[0], num_threads[1], num_threads[2]);
    }
 
-   if (sctx->gfx_level >= GFX11) {
+   if (sctx->gfx_level >= GFX12 || sctx->screen->info.has_set_sh_pairs_packed) {
       radeon_end();
       si_emit_buffered_compute_sh_regs(sctx);
       radeon_begin_again(cs);
@@ -879,11 +1071,21 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
       radeon_emit(base_va);
       radeon_emit(base_va >> 32);
 
-      radeon_emit(PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+      unsigned pkt = PKT3_DISPATCH_INDIRECT;
+
+      if (sctx->gfx_level >= GFX12 && G_00B800_INTERLEAVE_2D_EN(dispatch_initiator))
+         pkt = PKT3_DISPATCH_INDIRECT_INTERLEAVED;
+
+      radeon_emit(PKT3(pkt, 1, 
render_cond_bit) | PKT3_SHADER_TYPE_S(1)); radeon_emit(info->indirect_offset); radeon_emit(dispatch_initiator); } else { - radeon_emit(PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + unsigned pkt = PKT3_DISPATCH_DIRECT; + + if (sctx->gfx_level >= GFX12 && G_00B800_INTERLEAVE_2D_EN(dispatch_initiator)) + pkt = PKT3_DISPATCH_DIRECT_INTERLEAVED; + + radeon_emit(PKT3(pkt, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); radeon_emit(info->grid[0]); radeon_emit(info->grid[1]); radeon_emit(info->grid[2]); @@ -968,17 +1170,23 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info si_make_CB_shader_coherent(sctx, 0, sctx->framebuffer.CB_has_shader_readable_metadata, sctx->framebuffer.all_DCC_pipe_aligned); + + if (sctx->gfx_level == GFX12 && + (sctx->force_shader_coherency.with_db || + si_check_needs_implicit_sync(sctx, RADEON_USAGE_DB_NEEDS_IMPLICIT_SYNC))) + si_make_DB_shader_coherent(sctx, 0, false, false); } if (sctx->gfx_level < GFX11) gfx6_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE); - else + else if (sctx->gfx_level < GFX12) gfx11_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE); } if (info->indirect) { - /* Indirect buffers use TC L2 on GFX9, but not older hw. */ - if (sctx->gfx_level <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) { + /* Indirect buffers use TC L2 on GFX9-GFX11, but not other hw. */ + if ((sctx->gfx_level <= GFX8 || sctx->gfx_level == GFX12) && + si_resource(info->indirect)->TC_L2_dirty) { sctx->flags |= SI_CONTEXT_WB_L2; si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); si_resource(info->indirect)->TC_L2_dirty = false; @@ -1054,18 +1262,20 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info si_log_compute_state(sctx, sctx->log); } - /* Mark displayable DCC as dirty for bound images. */ - unsigned display_dcc_store_mask = sctx->images[PIPE_SHADER_COMPUTE].display_dcc_store_mask & - BITFIELD_MASK(program->sel.info.base.num_images); - while (display_dcc_store_mask) { - struct si_texture *tex = (struct si_texture *) - sctx->images[PIPE_SHADER_COMPUTE].views[u_bit_scan(&display_dcc_store_mask)].resource; + if (sctx->gfx_level < GFX12) { + /* Mark displayable DCC as dirty for bound images. */ + unsigned display_dcc_store_mask = sctx->images[PIPE_SHADER_COMPUTE].display_dcc_store_mask & + BITFIELD_MASK(program->sel.info.base.num_images); + while (display_dcc_store_mask) { + struct si_texture *tex = (struct si_texture *) + sctx->images[PIPE_SHADER_COMPUTE].views[u_bit_scan(&display_dcc_store_mask)].resource; - si_mark_display_dcc_dirty(sctx, tex); + si_mark_display_dcc_dirty(sctx, tex); + } + + /* TODO: Bindless images don't set displayable_dcc_dirty after image stores. */ } - /* TODO: Bindless images don't set displayable_dcc_dirty after image stores. 
*/ - sctx->compute_is_busy = true; sctx->num_compute_calls++; @@ -1127,4 +1337,40 @@ void si_init_compute_functions(struct si_context *sctx) sctx->b.set_compute_resources = si_set_compute_resources; sctx->b.set_global_binding = si_set_global_binding; sctx->b.launch_grid = si_launch_grid; + +#if 0 /* test for si_get_2d_interleave_size */ + static bool visited = false; + if (visited) + return; + + visited = true; + struct pipe_grid_info info = {}; + info.grid[0] = info.grid[1] = info.grid[2] = 1024; + info.block[2] = 1; + + for (unsigned block_3d = 0; block_3d < 2; block_3d++) { + printf(" WG size | WG block/SE | Thread block/SE\n"); + + for (unsigned x = 1; x <= 32; x *= 2) { + for (unsigned y = 1; y <= 32; y *= 2) { + info.block[0] = x; + info.block[1] = y; + info.block[2] = block_3d ? 8 : 1; + + if ((x * y) % 32) + continue; + + unsigned log_x, log_y; + if (!si_get_2d_interleave_size(&info, &log_x, &log_y)) + continue; + + printf(" (%2u, %2u) = %3u | (%2u, %2u) = %2u | (%3u,%3u) = %u\n", + info.block[0], info.block[1], info.block[0] * info.block[1], + 1 << log_x, 1 << log_y, (1 << log_x) * (1 << log_y), + info.block[0] * (1 << log_x), info.block[1] * (1 << log_y), + info.block[0] * (1 << log_x) * info.block[1] * (1 << log_y)); + } + } + } +#endif } diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index b108f2aee8d..355cd33b8d0 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -422,13 +422,19 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, compute_min_size = 4 * 1024; } - /* TODO: use compute for unaligned big sizes */ + /* TODO: use compute for 8-bit and 16-bit clear values */ if (method == SI_AUTO_SELECT_CLEAR_METHOD && + /* CP DMA doesn't support the render condition. */ (flags & SI_OP_CS_RENDER_COND_ENABLE || + /* CP DMA doesn't support large clear value sizes. */ clear_value_size > 4 || - (clear_value_size == 4 && offset % 4 == 0 && size > compute_min_size))) { + /* Use compute if CP DMA is non-coherent. */ + (sctx->screen->info.cp_sdma_ge_use_system_memory_scope && + clear_value_size >= 4) || + /* Use compute if the size is large enough. */ + (clear_value_size == 4 && offset % 4 == 0 && size > compute_min_size))) method = SI_COMPUTE_CLEAR_METHOD; - } + if (method == SI_COMPUTE_CLEAR_METHOD) { si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value, clear_value_size, flags, coher); @@ -484,8 +490,9 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p /* Only use compute for VRAM copies on dGPUs. */ /* TODO: use compute for unaligned big sizes */ if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM && - si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size && - dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) { + si_resource(src)->domains & RADEON_DOMAIN_VRAM && + dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0 && + (size > compute_min_size || sctx->screen->info.cp_sdma_ge_use_system_memory_scope)) { si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, flags, coher); } else { @@ -810,6 +817,8 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) { + assert(sctx->gfx_level < GFX12); + /* Set the DCC buffer. 
*/ assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX); assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX); diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 466650ea7b3..6459b89c4fd 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -74,6 +74,9 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui /* GDS increments the address, not CP. */ command |= S_415_DAS(V_415_REGISTER) | S_415_DAIC(V_415_NO_INCREMENT); } else if (sctx->gfx_level >= GFX7 && cache_policy != L2_BYPASS) { + /* GFX12: DST_CACHE_POLICY is changed to DST_TEMPORAL, but the behavior is the same + * for values of 0 and 1. + */ header |= S_501_DST_SEL(V_501_DST_ADDR_TC_L2) | S_501_DST_CACHE_POLICY(cache_policy == L2_STREAM); } @@ -85,6 +88,9 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui /* Both of these are required for GDS. It does increment the address. */ command |= S_415_SAS(V_415_REGISTER) | S_415_SAIC(V_415_NO_INCREMENT); } else if (sctx->gfx_level >= GFX7 && cache_policy != L2_BYPASS) { + /* GFX12: SRC_CACHE_POLICY is changed to SRC_TEMPORAL, but the behavior is the same + * for values of 0 and 1. + */ header |= S_501_SRC_SEL(V_501_SRC_ADDR_TC_L2) | S_501_SRC_CACHE_POLICY(cache_policy == L2_STREAM); } @@ -191,6 +197,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, if (user_flags & SI_OP_SYNC_PS_BEFORE) sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + /* TODO: Range-invalidate GL2 or always use compute shaders */ + if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) + sctx->flags |= SI_CONTEXT_INV_L2; + /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ @@ -353,6 +363,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy); + /* TODO: Range-flush GL2 for src and range-invalidate GL2 for dst, or always use compute shaders */ + if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) + sctx->flags |= SI_CONTEXT_INV_L2; + if (sctx->flags) si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); diff --git a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c index 0319ae0bf1b..23e3d0274bd 100644 --- a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c +++ b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c @@ -75,7 +75,9 @@ void si_init_cp_reg_shadowing(struct si_context *sctx) radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->shadowing.csa, RADEON_USAGE_READWRITE | RADEON_PRIO_DESCRIPTORS); si_pm4_emit_commands(sctx, shadowing_preamble); - ac_emulate_clear_state(&sctx->screen->info, &sctx->gfx_cs, si_set_context_reg_array); + + if (sctx->gfx_level < GFX12) + ac_emulate_clear_state(&sctx->screen->info, &sctx->gfx_cs, si_set_context_reg_array); /* TODO: Gfx11 fails GLCTS if we don't re-emit the preamble at the beginning of every IB. */ /* TODO: Skipping this may have made register shadowing slower on Gfx11. */ @@ -87,7 +89,8 @@ void si_init_cp_reg_shadowing(struct si_context *sctx) sctx->cs_preamble_state = NULL; } - si_set_tracked_regs_to_clear_state(sctx); + if (sctx->gfx_level < GFX12) + si_set_tracked_regs_to_clear_state(sctx); /* Setup preemption. 
The shadowing preamble will be executed as a preamble IB, * which will load register values from memory on a context switch. diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 241478517f6..66e8f3dad42 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -299,7 +299,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture state[0] = va >> 8; state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); - if (sscreen->info.gfx_level >= GFX8) { + if (sscreen->info.gfx_level >= GFX8 && sscreen->info.gfx_level < GFX12) { if (!(access & SI_IMAGE_ACCESS_DCC_OFF) && vi_dcc_enabled(tex, first_level)) { meta_va = tex->buffer.gpu_address + tex->surface.meta_offset; @@ -330,7 +330,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture * of 256B. */ if (sscreen->info.gfx_level >= GFX10_3 && tex->surface.u.gfx9.uses_custom_pitch) { - ASSERTED unsigned min_alignment = 256; + ASSERTED unsigned min_alignment = sscreen->info.gfx_level >= GFX12 ? 128 : 256; assert((tex->surface.u.gfx9.surf_pitch * tex->surface.bpe) % min_alignment == 0); assert(tex->buffer.b.b.target == PIPE_TEXTURE_2D || tex->buffer.b.b.target == PIPE_TEXTURE_RECT); @@ -341,11 +341,17 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture if (tex->surface.blk_w == 2) pitch *= 2; - state[4] |= S_00A010_DEPTH_GFX10(pitch - 1) | /* DEPTH contains low bits of PITCH. */ - S_00A010_PITCH_MSB_GFX103((pitch - 1) >> 13); + if (sscreen->info.gfx_level >= GFX12) { + state[4] |= S_00A010_DEPTH_GFX12(pitch - 1) | /* DEPTH contains low bits of PITCH. */ + S_00A010_PITCH_MSB_GFX12((pitch - 1) >> 14); + } else { + state[4] |= S_00A010_DEPTH_GFX10(pitch - 1) | /* DEPTH contains low bits of PITCH. */ + S_00A010_PITCH_MSB_GFX103((pitch - 1) >> 13); + } } if (meta_va) { + /* Gfx10-11. */ struct gfx9_surf_meta_flags meta = { .rb_aligned = 1, .pipe_aligned = 1, @@ -573,34 +579,40 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader, si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc); - if (tex->buffer.b.b.target == PIPE_BUFFER) { - tex->buffer.bind_history |= SI_BIND_SAMPLER_BUFFER(shader); - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); + if (sctx->gfx_level >= GFX12) { + /* Gfx12 doesn't do any decompression. 
*/ + if (tex->buffer.b.b.target == PIPE_BUFFER) + tex->buffer.bind_history |= SI_BIND_SAMPLER_BUFFER(shader); } else { - if (tex->is_depth) { - samplers->has_depth_tex_mask |= 1u << slot; - samplers->needs_color_decompress_mask &= ~(1u << slot); - - if (depth_needs_decompression(tex, sview->is_stencil_sampler)) { - samplers->needs_depth_decompress_mask |= 1u << slot; - } else { - samplers->needs_depth_decompress_mask &= ~(1u << slot); - } - } else { - samplers->has_depth_tex_mask &= ~(1u << slot); + if (tex->buffer.b.b.target == PIPE_BUFFER) { + tex->buffer.bind_history |= SI_BIND_SAMPLER_BUFFER(shader); samplers->needs_depth_decompress_mask &= ~(1u << slot); - - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << slot; - } else { + samplers->needs_color_decompress_mask &= ~(1u << slot); + } else { + if (tex->is_depth) { + samplers->has_depth_tex_mask |= 1u << slot; samplers->needs_color_decompress_mask &= ~(1u << slot); - } - } - if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; + if (depth_needs_decompression(tex, sview->is_stencil_sampler)) { + samplers->needs_depth_decompress_mask |= 1u << slot; + } else { + samplers->needs_depth_decompress_mask &= ~(1u << slot); + } + } else { + samplers->has_depth_tex_mask &= ~(1u << slot); + samplers->needs_depth_decompress_mask &= ~(1u << slot); + + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << slot; + } else { + samplers->needs_color_decompress_mask &= ~(1u << slot); + } + } + + if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) && + p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + } } if (take_ownership) { @@ -647,6 +659,9 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader, static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsigned shader) { + if (sctx->gfx_level >= GFX12) + return; + struct si_samplers *samplers = &sctx->samplers[shader]; unsigned shader_bit = 1 << shader; @@ -677,8 +692,11 @@ static void si_pipe_set_sampler_views(struct pipe_context *ctx, enum pipe_shader si_update_shader_needs_decompress_mask(sctx, shader); } -static void si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers) +static void si_samplers_update_needs_color_decompress_mask(struct si_context *sctx, + struct si_samplers *samplers) { + assert(sctx->gfx_level < GFX12); + unsigned mask = samplers->enabled_mask; while (mask) { @@ -878,32 +896,38 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne if (&images->views[slot] != view) util_copy_image_view(&images->views[slot], view); - if (res->b.b.target == PIPE_BUFFER) { - images->needs_color_decompress_mask &= ~(1 << slot); - images->display_dcc_store_mask &= ~(1u << slot); - res->bind_history |= SI_BIND_IMAGE_BUFFER(shader); + if (ctx->gfx_level >= GFX12) { + /* Gfx12 doesn't do any decompression. 
*/ + if (res->b.b.target == PIPE_BUFFER) + res->bind_history |= SI_BIND_IMAGE_BUFFER(shader); } else { - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - - if (color_needs_decompression(tex)) { - images->needs_color_decompress_mask |= 1 << slot; - } else { + if (res->b.b.target == PIPE_BUFFER) { images->needs_color_decompress_mask &= ~(1 << slot); - } - - if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) { - images->display_dcc_store_mask |= 1u << slot; - - /* Set displayable_dcc_dirty for non-compute stages conservatively (before draw calls). */ - if (shader != PIPE_SHADER_COMPUTE) - tex->displayable_dcc_dirty = true; - } else { images->display_dcc_store_mask &= ~(1u << slot); - } + res->bind_history |= SI_BIND_IMAGE_BUFFER(shader); + } else { + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; - if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) - ctx->need_check_render_feedback = true; + if (color_needs_decompression(tex)) { + images->needs_color_decompress_mask |= 1 << slot; + } else { + images->needs_color_decompress_mask &= ~(1 << slot); + } + + if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) { + images->display_dcc_store_mask |= 1u << slot; + + /* Set displayable_dcc_dirty for non-compute stages conservatively (before draw calls). */ + if (shader != PIPE_SHADER_COMPUTE) + tex->displayable_dcc_dirty = true; + } else { + images->display_dcc_store_mask &= ~(1u << slot); + } + + if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) + ctx->need_check_render_feedback = true; + } } images->enabled_mask |= 1u << slot; @@ -951,8 +975,11 @@ static void si_set_shader_images(struct pipe_context *pipe, enum pipe_shader_typ si_update_shader_needs_decompress_mask(ctx, shader); } -static void si_images_update_needs_color_decompress_mask(struct si_images *images) +static void si_images_update_needs_color_decompress_mask(struct si_context *sctx, + struct si_images *images) { + assert(sctx->gfx_level < GFX12); + unsigned mask = images->enabled_mask; while (mask) { @@ -1132,7 +1159,10 @@ static void si_init_buffer_resources(struct si_context *sctx, desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - if (sctx->gfx_level >= GFX11) { + if (sctx->gfx_level >= GFX12) { + desc[3] |= S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); + } else if (sctx->gfx_level >= GFX11) { desc[3] |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); } else if (sctx->gfx_level >= GFX10) { @@ -1584,7 +1614,10 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); } - if (sctx->gfx_level >= GFX11) { + if (sctx->gfx_level >= GFX12) { + desc[3] |= S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED); + } else if (sctx->gfx_level >= GFX11) { desc[3] |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED); } else if (sctx->gfx_level >= GFX10) { @@ -1631,6 +1664,8 @@ static void si_set_polygon_stipple(struct pipe_context *ctx, const struct pipe_p static void si_resident_handles_update_needs_color_decompress(struct si_context *sctx) { + 
assert(sctx->gfx_level < GFX12); + util_dynarray_clear(&sctx->resident_tex_needs_color_decompress); util_dynarray_clear(&sctx->resident_img_needs_color_decompress); @@ -1675,8 +1710,8 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx) assert(sctx->gfx_level < GFX11); for (int i = 0; i < SI_NUM_SHADERS; ++i) { - si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]); - si_images_update_needs_color_decompress_mask(&sctx->images[i]); + si_samplers_update_needs_color_decompress_mask(sctx, &sctx->samplers[i]); + si_images_update_needs_color_decompress_mask(sctx, &sctx->images[i]); si_update_shader_needs_decompress_mask(sctx, i); } @@ -2002,6 +2037,11 @@ static void si_upload_bindless_descriptors(struct si_context *sctx) /* Invalidate scalar L0 because the cache doesn't know that L2 changed. */ sctx->flags |= SI_CONTEXT_INV_SCACHE; + + /* TODO: Range-invalidate GL2 */ + if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) + sctx->flags |= SI_CONTEXT_INV_L2; + sctx->bindless_descriptors_dirty = false; } @@ -2234,9 +2274,23 @@ void si_shader_change_notify(struct si_context *sctx) } \ } while (0) +#define gfx12_push_consecutive_shader_pointers(sctx, pointer_mask, sh_base, type) do { \ + unsigned sh_reg_base = (sh_base); \ + if (sh_reg_base) { \ + unsigned mask = shader_pointers_dirty & (pointer_mask); \ + \ + u_foreach_bit(i, mask) { \ + struct si_descriptors *descs = &sctx->descriptors[i]; \ + unsigned sh_reg = sh_reg_base + descs->shader_userdata_offset; \ + \ + gfx12_push_##type##_sh_reg(sh_reg, descs->gpu_address); \ + } \ + } \ +} while (0) + static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs) { - assert(!sctx->screen->info.has_set_sh_pairs_packed); + assert(sctx->gfx_level < GFX12 && !sctx->screen->info.has_set_sh_pairs_packed); radeon_begin(&sctx->gfx_cs); @@ -2280,6 +2334,16 @@ static void gfx11_push_global_shader_pointers(struct si_context *sctx, struct si descs->gpu_address); } +static void gfx12_push_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs) +{ + gfx12_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + descs->shader_userdata_offset, + descs->gpu_address); + gfx12_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + descs->shader_userdata_offset, + descs->gpu_address); + gfx12_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + descs->shader_userdata_offset, + descs->gpu_address); +} + void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index) { uint32_t *sh_base = sctx->shader_pointers.sh_base; @@ -2303,7 +2367,33 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index) si_upload_bindless_descriptors(sctx); /* Set shader pointers. 
*/ - if (sctx->screen->info.has_set_sh_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), + sh_base[PIPE_SHADER_VERTEX], gfx); + gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), + sh_base[PIPE_SHADER_TESS_EVAL], gfx); + gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT), + sh_base[PIPE_SHADER_FRAGMENT], gfx); + gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), + sh_base[PIPE_SHADER_TESS_CTRL], gfx); + gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), + sh_base[PIPE_SHADER_GEOMETRY], gfx); + + if (sctx->gs_attribute_ring_pointer_dirty) { + gfx12_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + + GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4, + sctx->screen->attribute_pos_prim_ring->gpu_address); + sctx->gs_attribute_ring_pointer_dirty = false; + } + + if (shader_pointers_dirty & (1 << SI_DESCS_INTERNAL)) + gfx12_push_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_INTERNAL]); + + if (sctx->graphics_bindless_pointer_dirty) { + gfx12_push_global_shader_pointers(sctx, &sctx->bindless_descriptors); + sctx->graphics_bindless_pointer_dirty = false; + } + } else if (sctx->screen->info.has_set_sh_pairs_packed) { gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), sh_base[PIPE_SHADER_VERTEX], gfx); gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), @@ -2318,7 +2408,7 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index) if (sctx->gs_attribute_ring_pointer_dirty) { gfx11_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4, - sctx->screen->attribute_ring->gpu_address); + sctx->screen->attribute_pos_prim_ring->gpu_address); sctx->gs_attribute_ring_pointer_dirty = false; } @@ -2346,7 +2436,7 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, unsigned index) assert(sctx->gfx_level >= GFX11); radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4, - sctx->screen->attribute_ring->gpu_address); + sctx->screen->attribute_pos_prim_ring->gpu_address); sctx->gs_attribute_ring_pointer_dirty = false; } radeon_end(); @@ -2383,7 +2473,17 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) radeon_begin(&sctx->gfx_cs); /* Set shader pointers. 
*/ - if (sctx->screen->info.has_set_sh_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + gfx12_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), + R_00B900_COMPUTE_USER_DATA_0, compute); + + if (sctx->compute_bindless_pointer_dirty) { + gfx12_push_compute_sh_reg(R_00B900_COMPUTE_USER_DATA_0 + + sctx->bindless_descriptors.shader_userdata_offset, + sctx->bindless_descriptors.gpu_address); + sctx->compute_bindless_pointer_dirty = false; + } + } else if (sctx->screen->info.has_set_sh_pairs_packed) { gfx11_push_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), R_00B900_COMPUTE_USER_DATA_0, compute); @@ -2641,19 +2741,21 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t h if (sview->base.texture->target != PIPE_BUFFER) { struct si_texture *tex = (struct si_texture *)sview->base.texture; - if (depth_needs_decompression(tex, sview->is_stencil_sampler)) { - util_dynarray_append(&sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, tex_handle); - } + if (sctx->gfx_level < GFX12) { + if (depth_needs_decompression(tex, sview->is_stencil_sampler)) { + util_dynarray_append(&sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, tex_handle); + } - if (color_needs_decompression(tex)) { - util_dynarray_append(&sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, tex_handle); - } + if (color_needs_decompression(tex)) { + util_dynarray_append(&sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, tex_handle); + } - if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; + if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) && + p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + } si_update_bindless_texture_descriptor(sctx, tex_handle); } else { @@ -2680,7 +2782,7 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t h util_dynarray_delete_unordered(&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle); - if (sview->base.texture->target != PIPE_BUFFER) { + if (sctx->gfx_level < GFX12 && sview->base.texture->target != PIPE_BUFFER) { util_dynarray_delete_unordered(&sctx->resident_tex_needs_depth_decompress, struct si_texture_handle *, tex_handle); @@ -2768,13 +2870,15 @@ static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t han struct si_texture *tex = (struct si_texture *)res; unsigned level = view->u.tex.level; - if (color_needs_decompression(tex)) { - util_dynarray_append(&sctx->resident_img_needs_color_decompress, - struct si_image_handle *, img_handle); - } + if (sctx->gfx_level < GFX12) { + if (color_needs_decompression(tex)) { + util_dynarray_append(&sctx->resident_img_needs_color_decompress, + struct si_image_handle *, img_handle); + } - if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; + if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + } si_update_bindless_image_descriptor(sctx, img_handle); } else { @@ -2802,7 +2906,7 @@ static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t han util_dynarray_delete_unordered(&sctx->resident_img_handles, struct si_image_handle *, img_handle); - if (res->b.b.target != PIPE_BUFFER) { + if (sctx->gfx_level < GFX12 && res->b.b.target != PIPE_BUFFER) { 
util_dynarray_delete_unordered(&sctx->resident_img_needs_color_decompress, struct si_image_handle *, img_handle); } @@ -2846,7 +2950,10 @@ void si_init_all_descriptors(struct si_context *sctx) unsigned first_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE; unsigned hs_sgpr0, gs_sgpr0; - if (sctx->gfx_level >= GFX11) { + if (sctx->gfx_level >= GFX12) { + hs_sgpr0 = R_00B410_SPI_SHADER_PGM_LO_HS; + gs_sgpr0 = R_00B210_SPI_SHADER_PGM_LO_GS; + } else if (sctx->gfx_level >= GFX11) { hs_sgpr0 = R_00B420_SPI_SHADER_PGM_LO_HS; gs_sgpr0 = R_00B220_SPI_SHADER_PGM_LO_GS; } else { diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index d2edf1165f0..a1ae9125e00 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -434,7 +434,7 @@ static void si_flush_all_queues(struct pipe_context *ctx, struct si_fine_fence fine = {}; unsigned rflags = PIPE_FLUSH_ASYNC; - if (!(flags & PIPE_FLUSH_DEFERRED)) { + if (sctx->gfx_level < GFX12 && !(flags & PIPE_FLUSH_DEFERRED)) { si_flush_implicit_resources(sctx); } diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 73341668d24..f795c31935d 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -51,7 +51,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) struct si_screen *sscreen = (struct si_screen *)pscreen; /* Gfx8 (Polaris11) hangs, so don't enable this on Gfx8 and older chips. */ - bool enable_sparse = sscreen->info.gfx_level >= GFX9 && + bool enable_sparse = sscreen->info.gfx_level >= GFX9 && sscreen->info.gfx_level < GFX12 && sscreen->info.has_sparse_vm_mappings; switch (param) { @@ -324,23 +324,21 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) /* Texturing. */ case PIPE_CAP_MAX_TEXTURE_2D_SIZE: - return 16384; + /* TODO: Gfx12 supports 64K textures, but Gallium can't represent them at the moment. */ + return sscreen->info.gfx_level >= GFX12 ? 32768 : 16384; case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: if (!sscreen->info.has_3d_cube_border_color_mipmap) return 0; - return 15; /* 16384 */ + return sscreen->info.gfx_level >= GFX12 ? 16 : 15; /* 32K : 16K */ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: if (!sscreen->info.has_3d_cube_border_color_mipmap) return 0; - if (sscreen->info.gfx_level >= GFX10) - return 14; - /* textures support 8192, but layered rendering supports 2048 */ - return 12; + /* This is limited by maximums that both the texture unit and layered rendering support. */ + return sscreen->info.gfx_level >= GFX12 ? 15 : /* 16K */ + sscreen->info.gfx_level >= GFX10 ? 14 : 12; /* 8K : 2K */ case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - if (sscreen->info.gfx_level >= GFX10) - return 8192; - /* textures support 8192, but layered rendering supports 2048 */ - return 2048; + /* This is limited by maximums that both the texture unit and layered rendering support. */ + return sscreen->info.gfx_level >= GFX10 ? 
8192 : 2048; /* Sparse texture */ case PIPE_CAP_MAX_SPARSE_TEXTURE_SIZE: diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 95e6966570f..efbd7f13c0c 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -140,6 +140,13 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h if (ctx->streamout.begin_emitted) { si_emit_streamout_end(ctx); ctx->streamout.suspended = true; + + /* Make sure streamout is idle because the next process might change + * GE_GS_ORDERED_ID_BASE (which must not be changed when streamout is busy) + * and make this process guilty of hanging. + */ + if (ctx->gfx_level >= GFX12) + wait_flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; } } @@ -268,6 +275,7 @@ static void si_add_gds_to_buffer_list(struct si_context *sctx) void si_set_tracked_regs_to_clear_state(struct si_context *ctx) { + assert(ctx->gfx_level < GFX12); STATIC_ASSERT(SI_NUM_ALL_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved_mask) * 8); ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0; @@ -311,6 +319,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff; ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff; @@ -359,7 +368,6 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0; @@ -477,8 +485,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush); si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_ge_ring_state); - if (ctx->screen->attribute_ring) { - radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring, + if (ctx->screen->attribute_pos_prim_ring) { + radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_pos_prim_ring, RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS); } if (ctx->border_color_buffer) { @@ -816,23 +824,27 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) */ if (flags & SI_CONTEXT_INV_L2) { /* Writeback and invalidate everything in L2. */ - gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1); + gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | + (ctx->gfx_level < GFX12 ? S_586_GLM_INV(1) | S_586_GLM_WB(1) : 0); ctx->num_L2_invalidates++; } else if (flags & SI_CONTEXT_WB_L2) { - gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1); + gcr_cntl |= S_586_GL2_WB(1) | + (ctx->gfx_level < GFX12 ? 
S_586_GLM_WB(1) | S_586_GLM_INV(1) : 0); } else if (flags & SI_CONTEXT_INV_L2_METADATA) { + assert(ctx->gfx_level < GFX12); gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); } if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + if (ctx->gfx_level < GFX12 && flags & SI_CONTEXT_FLUSH_AND_INV_CB) { /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); } /* Gfx11 can't flush DB_META and should use a TS event instead. */ - if (ctx->gfx_level != GFX11 && flags & SI_CONTEXT_FLUSH_AND_INV_DB) { + if (ctx->gfx_level < GFX12 && ctx->gfx_level != GFX11 && + flags & SI_CONTEXT_FLUSH_AND_INV_DB) { /* Flush HTILE. Will wait for idle later. */ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); diff --git a/src/gallium/drivers/radeonsi/si_gpu_load.c b/src/gallium/drivers/radeonsi/si_gpu_load.c index 6dd924ee821..44ca948ad20 100644 --- a/src/gallium/drivers/radeonsi/si_gpu_load.c +++ b/src/gallium/drivers/radeonsi/si_gpu_load.c @@ -68,7 +68,8 @@ static void si_update_mmio_counters(struct si_screen *sscreen, union si_mmio_cou sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value); UPDATE_COUNTER(ta, TA_BUSY); - UPDATE_COUNTER(gds, GDS_BUSY); + if (sscreen->info.gfx_level < GFX12) + UPDATE_COUNTER(gds, GDS_BUSY); UPDATE_COUNTER(vgt, VGT_BUSY); UPDATE_COUNTER(ia, IA_BUSY); UPDATE_COUNTER(sx, SX_BUSY); diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 936c0635a63..9abda275670 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -54,7 +54,9 @@ static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader, S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) | + (sel->screen->info.gfx_level >= GFX12 ? 
+ S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) : + S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT)) | S_008F0C_INDEX_STRIDE(2) /* 32 elements */), }; @@ -138,7 +140,10 @@ static nir_def *build_tess_ring_desc(nir_builder *b, struct si_screen *screen, S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - if (screen->info.gfx_level >= GFX11) { + if (screen->info.gfx_level >= GFX12) { + rsrc3 |= S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); + } else if (screen->info.gfx_level >= GFX11) { rsrc3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); } else if (screen->info.gfx_level >= GFX10) { @@ -474,6 +479,12 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s replacement = si_nir_load_internal_binding(b, args, slot, 4); break; } + case nir_intrinsic_load_xfb_state_address_gfx12_amd: { + nir_def *address = si_nir_load_internal_binding(b, args, SI_STREAMOUT_STATE_BUF, 1); + nir_def *address32_hi = nir_imm_int(b, s->shader->selector->screen->info.address32_hi); + replacement = nir_pack_64_2x32_split(b, address, address32_hi); + break; + } case nir_intrinsic_atomic_add_gs_emit_prim_count_amd: case nir_intrinsic_atomic_add_shader_invocation_count_amd: { enum pipe_statistics_query_index index = @@ -613,7 +624,8 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s break; } case nir_intrinsic_load_layer_id: - replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 16, 13); + replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, + 16, sel->screen->info.gfx_level >= GFX12 ? 14 : 13); break; case nir_intrinsic_load_color0: case nir_intrinsic_load_color1: { diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_resource.c b/src/gallium/drivers/radeonsi/si_nir_lower_resource.c index 08a668b7c0f..adc3f1a4b59 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_resource.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_resource.c @@ -35,7 +35,10 @@ static nir_def *load_ubo_desc_fast_path(nir_builder *b, nir_def *addr_lo, S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - if (sel->screen->info.gfx_level >= GFX11) + if (sel->screen->info.gfx_level >= GFX12) + rsrc3 |= S_008F0C_FORMAT_GFX12(V_008F0C_GFX11_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); + else if (sel->screen->info.gfx_level >= GFX11) rsrc3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX11_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); else if (sel->screen->info.gfx_level >= GFX10) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index bb4dd703407..d07a84de238 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -715,6 +715,9 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign case GFX11_5: si_init_draw_functions_GFX11_5(sctx); break; + case GFX12: + si_init_draw_functions_GFX12(sctx); + break; default: unreachable("unhandled gfx level"); } @@ -978,7 +981,7 @@ static void si_destroy_screen(struct pipe_screen *pscreen) sscreen->num_disk_shader_cache_misses); } - si_resource_reference(&sscreen->attribute_ring, NULL); + si_resource_reference(&sscreen->attribute_pos_prim_ring, NULL); pipe_resource_reference(&sscreen->tess_rings, NULL); 
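
For reference, the load_layer_id lowering above is a plain bitfield extract from the ancillary VGPR: the bit offset 16 is the same on every generation, but Gfx12 widens the layer field from 13 to 14 bits. A minimal standalone sketch of that unpack, with an illustrative helper name rather than a driver function:

   #include <stdbool.h>
   #include <stdint.h>

   /* Sketch of the load_layer_id unpack above: bit offset 16 is
    * unchanged, but Gfx12 widens the layer field to 14 bits. */
   static inline uint32_t unpack_layer_id(uint32_t ancillary, bool gfx12)
   {
      const unsigned offset = 16;
      const unsigned width = gfx12 ? 14 : 13;
      return (ancillary >> offset) & ((1u << width) - 1);
   }

This is the shift-and-mask that the ac_nir_unpack_arg call expresses in NIR.
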
pipe_resource_reference(&sscreen->tess_rings_tmz, NULL); @@ -1456,7 +1459,8 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; - if (sscreen->info.gfx_level <= GFX8) { + + if (sscreen->info.gfx_level <= GFX8 || sscreen->info.cp_sdma_ge_use_system_memory_scope) { sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2; sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2; } @@ -1490,14 +1494,15 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, } if (sscreen->info.gfx_level >= GFX11) { - unsigned attr_ring_size = sscreen->info.attribute_ring_size_per_se * sscreen->info.max_se; - sscreen->attribute_ring = si_aligned_buffer_create(&sscreen->b, - PIPE_RESOURCE_FLAG_UNMAPPABLE | - SI_RESOURCE_FLAG_32BIT | - SI_RESOURCE_FLAG_DRIVER_INTERNAL | - SI_RESOURCE_FLAG_DISCARDABLE, - PIPE_USAGE_DEFAULT, - attr_ring_size, 2 * 1024 * 1024); + sscreen->attribute_pos_prim_ring = + si_aligned_buffer_create(&sscreen->b, + PIPE_RESOURCE_FLAG_UNMAPPABLE | + SI_RESOURCE_FLAG_32BIT | + SI_RESOURCE_FLAG_DRIVER_INTERNAL | + SI_RESOURCE_FLAG_DISCARDABLE, + PIPE_USAGE_DEFAULT, + sscreen->info.total_attribute_pos_prim_ring_size, + 2 * 1024 * 1024); } /* Create the auxiliary context. This must be done last. */ @@ -1535,16 +1540,18 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER))) si_test_vmfault(sscreen, test_flags); - if (test_flags & DBG(TEST_GDS)) - si_test_gds((struct si_context *)sscreen->aux_context.general.ctx); + if (sscreen->info.gfx_level < GFX12) { + if (test_flags & DBG(TEST_GDS)) + si_test_gds((struct si_context *)sscreen->aux_context.general.ctx); - if (test_flags & DBG(TEST_GDS_MM)) { - si_test_gds_memory_management((struct si_context *)sscreen->aux_context.general.ctx, - 32 * 1024, 4, RADEON_DOMAIN_GDS); - } - if (test_flags & DBG(TEST_GDS_OA_MM)) { - si_test_gds_memory_management((struct si_context *)sscreen->aux_context.general.ctx, - 4, 1, RADEON_DOMAIN_OA); + if (test_flags & DBG(TEST_GDS_MM)) { + si_test_gds_memory_management((struct si_context *)sscreen->aux_context.general.ctx, + 32 * 1024, 4, RADEON_DOMAIN_GDS); + } + if (test_flags & DBG(TEST_GDS_OA_MM)) { + si_test_gds_memory_management((struct si_context *)sscreen->aux_context.general.ctx, + 4, 1, RADEON_DOMAIN_OA); + } } ac_print_nonshadowed_regs(sscreen->info.gfx_level, sscreen->info.family); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d6cfffac5e6..1b54fbab4b8 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -464,6 +464,7 @@ struct si_surface { /* Color registers. */ unsigned cb_color_info; unsigned cb_color_view; + unsigned cb_color_view2; unsigned cb_color_attrib; unsigned cb_color_attrib2; /* GFX9 and later */ unsigned cb_color_attrib3; /* GFX10 and later */ @@ -474,18 +475,33 @@ struct si_surface { unsigned spi_shader_col_format_blend_alpha : 8; /* blending with alpha. */ /* DB registers. 
*/ - uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */ - uint64_t db_stencil_base; - uint64_t db_htile_data_base; - unsigned db_depth_info; - unsigned db_z_info; - unsigned db_z_info2; /* GFX9 only */ unsigned db_depth_view; unsigned db_depth_size; - unsigned db_depth_slice; + unsigned db_z_info; unsigned db_stencil_info; - unsigned db_stencil_info2; /* GFX9 only */ - unsigned db_htile_surface; + uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */ + uint64_t db_stencil_base; + + union { + struct { + uint64_t db_htile_data_base; + unsigned db_depth_info; + unsigned db_z_info2; /* GFX9 only */ + unsigned db_depth_slice; + unsigned db_stencil_info2; /* GFX9 only */ + unsigned db_htile_surface; + } gfx6; + + struct { + uint64_t hiz_base; + unsigned hiz_info; + unsigned hiz_size_xy; + uint64_t his_base; + unsigned his_info; + unsigned his_size_xy; + unsigned db_depth_view1; + } gfx12; + } u; }; struct si_mmio_counter { @@ -712,7 +728,7 @@ struct si_screen { struct util_idalloc_mt buffer_ids; struct util_vertex_state_cache vertex_state_cache; - struct si_resource *attribute_ring; + struct si_resource *attribute_pos_prim_ring; simple_mtx_t tess_ring_lock; struct pipe_resource *tess_rings; @@ -805,6 +821,7 @@ struct si_framebuffer { bool has_dcc_msaa; bool disable_vrs_flat_shading; bool has_stencil; + bool has_hiz_his; }; enum si_quant_mode @@ -834,6 +851,7 @@ struct si_streamout_target { /* The buffer where BUFFER_FILLED_SIZE is stored. */ struct si_resource *buf_filled_size; unsigned buf_filled_size_offset; + unsigned buf_filled_size_draw_count_offset; bool buf_filled_size_valid; unsigned stride_in_dw; @@ -968,6 +986,12 @@ struct gfx11_reg_pair { uint32_t reg_value[2]; }; +/* A pair of values for SET_*_REG_PAIRS. */ +struct gfx12_reg { + uint32_t reg_offset; + uint32_t reg_value; +}; + typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx, struct pipe_vertex_state *vstate, uint32_t partial_velem_mask, @@ -1070,10 +1094,17 @@ struct si_context { /* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_*. */ unsigned num_buffered_gfx_sh_regs; unsigned num_buffered_compute_sh_regs; - struct { - struct gfx11_reg_pair buffered_gfx_sh_regs[32]; - struct gfx11_reg_pair buffered_compute_sh_regs[32]; - } gfx11; + union { + struct { + struct gfx11_reg_pair buffered_gfx_sh_regs[32]; + struct gfx11_reg_pair buffered_compute_sh_regs[32]; + } gfx11; + + struct { + struct gfx12_reg buffered_gfx_sh_regs[64]; + struct gfx12_reg buffered_compute_sh_regs[64]; + } gfx12; + }; /* Atom declarations. */ struct si_framebuffer framebuffer; @@ -1117,6 +1148,7 @@ struct si_context { struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS]; }; struct si_cs_shader_state cs_shader_state; + bool compute_ping_pong_launch; /* if current tcs set by user */ bool is_user_tcs; @@ -1186,6 +1218,7 @@ struct si_context { /* DB render state. 
*/ unsigned ps_db_shader_control; + unsigned ps_pa_sc_hisz_control; unsigned dbcb_copy_sample; bool dbcb_depth_copy_enabled : 1; bool dbcb_stencil_copy_enabled : 1; @@ -1922,7 +1955,7 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VCACHE; sctx->force_shader_coherency.with_cb = false; - if (sctx->gfx_level >= GFX10) { + if (sctx->gfx_level >= GFX10 && sctx->gfx_level < GFX12) { if (sctx->screen->info.tcc_rb_non_coherent) sctx->flags |= SI_CONTEXT_INV_L2; else if (shaders_read_metadata) @@ -1936,7 +1969,7 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned sctx->flags |= SI_CONTEXT_INV_L2; else if (shaders_read_metadata) sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else { + } else if (sctx->gfx_level <= GFX8) { /* GFX6-GFX8 */ sctx->flags |= SI_CONTEXT_INV_L2; } @@ -1950,7 +1983,7 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_INV_VCACHE; sctx->force_shader_coherency.with_db = false; - if (sctx->gfx_level >= GFX10) { + if (sctx->gfx_level >= GFX10 && sctx->gfx_level < GFX12) { if (sctx->screen->info.tcc_rb_non_coherent) sctx->flags |= SI_CONTEXT_INV_L2; else if (shaders_read_metadata) @@ -1964,7 +1997,7 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned sctx->flags |= SI_CONTEXT_INV_L2; else if (shaders_read_metadata) sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else { + } else if (sctx->gfx_level <= GFX8) { /* GFX6-GFX8 */ sctx->flags |= SI_CONTEXT_INV_L2; } @@ -1979,13 +2012,17 @@ static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) { + struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen; + + /* Gfx12 should never call this. */ + assert(sscreen->info.gfx_level < GFX12); + if (zs_mask == PIPE_MASK_S && (tex->htile_stencil_disabled || !tex->surface.has_stencil)) return false; if (!tex->is_depth || !tex->surface.meta_offset) return false; - struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen; if (sscreen->info.gfx_level >= GFX8) { return level < tex->surface.num_meta_levels; } else { @@ -2000,6 +2037,11 @@ static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsi static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) { + struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen; + + /* Gfx12 should never call this. 
*/ + assert(sscreen->info.gfx_level < GFX12); + assert(!tex->tc_compatible_htile || tex->surface.meta_offset); return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask); } diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index 488ff62bc50..60ccb55bbfb 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -14,6 +14,13 @@ static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val, unsigned opcode, unsigned idx); +static bool opcode_is_pairs(unsigned opcode) +{ + return opcode == PKT3_SET_CONTEXT_REG_PAIRS || + opcode == PKT3_SET_SH_REG_PAIRS || + opcode == PKT3_SET_UCONFIG_REG_PAIRS; +} + static bool opcode_is_pairs_packed(unsigned opcode) { return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED || @@ -39,9 +46,13 @@ static unsigned regular_opcode_to_pairs(struct si_pm4_state *state, unsigned opc switch (opcode) { case PKT3_SET_CONTEXT_REG: - return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED : opcode; + return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED : + info->has_set_context_pairs ? PKT3_SET_CONTEXT_REG_PAIRS : opcode; case PKT3_SET_SH_REG: - return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED : opcode; + return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED : + info->has_set_sh_pairs ? PKT3_SET_SH_REG_PAIRS : opcode; + case PKT3_SET_UCONFIG_REG: + return info->has_set_uconfig_pairs ? PKT3_SET_UCONFIG_REG_PAIRS : opcode; } return opcode; @@ -192,7 +203,8 @@ static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate) count = state->ndw - state->last_pm4 - 2; /* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */ bool reset_filter_cam = !state->is_compute_queue && - opcode_is_pairs_packed(state->last_opcode); + (opcode_is_pairs(state->last_opcode) || + opcode_is_pairs_packed(state->last_opcode)); state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) | PKT3_RESET_FILTER_CAM_S(reset_filter_cam); @@ -226,6 +238,13 @@ static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint si_pm4_cmd_begin(state, opcode); /* reserve space for the header */ state->ndw++; /* reserve space for the register count, it will be set at the end */ } + } else if (opcode_is_pairs(opcode)) { + assert(idx == 0); + + if (opcode != state->last_opcode) + si_pm4_cmd_begin(state, opcode); + + state->pm4[state->ndw++] = reg; } else if (opcode != state->last_opcode || reg != (state->last_reg + 1) || idx != state->last_idx) { si_pm4_cmd_begin(state, opcode); diff --git a/src/gallium/drivers/radeonsi/si_sdma_copy_image.c b/src/gallium/drivers/radeonsi/si_sdma_copy_image.c index 65fa4fca2c9..a419172ad98 100644 --- a/src/gallium/drivers/radeonsi/si_sdma_copy_image.c +++ b/src/gallium/drivers/radeonsi/si_sdma_copy_image.c @@ -57,6 +57,7 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur { bool is_v5 = sctx->gfx_level >= GFX10; bool is_v5_2 = sctx->gfx_level >= GFX10_3; + bool is_v7 = sctx->gfx_level >= GFX12; unsigned bpp = sdst->surface.bpe; uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset; uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset; @@ -111,18 +112,28 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; uint64_t linear_address = linear == ssrc ? 
src_address : dst_address; struct radeon_cmdbuf *cs = sctx->sdma_cs; - /* Only SDMA 5 supports DCC with SDMA */ - bool dcc = vi_dcc_enabled(tiled, 0) && is_v5; assert(tiled->buffer.b.b.depth0 == 1); + bool dcc = false; + + if (is_v7) { + /* Check if everything fits into the bitfields */ + if (!(tiled_width <= (1 << 16) && tiled_height <= (1 << 16) && + linear_pitch <= (1 << 16) && linear_slice_pitch <= (1ull << 32) && + copy_width <= (1 << 16) && copy_height <= (1 << 16))) + return false; + } else { + /* Only SDMA 5 supports DCC with SDMA */ + dcc = is_v5 && vi_dcc_enabled(tiled, 0); + + /* Check if everything fits into the bitfields */ + if (!(tiled_width <= (1 << 14) && tiled_height <= (1 << 14) && + linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) && + copy_width <= (1 << 14) && copy_height <= (1 << 14))) + return false; + } linear_address += linear->surface.u.gfx9.offset[0]; - /* Check if everything fits into the bitfields */ - if (!(tiled_width <= (1 << 14) && tiled_height <= (1 << 14) && - linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) && - copy_width <= (1 << 14) && copy_height <= (1 << 14))) - return false; - radeon_begin(cs); radeon_emit( SDMA_PACKET(SDMA_OPCODE_COPY, @@ -138,7 +149,7 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur radeon_emit((tiled_height - 1)); radeon_emit(util_logbase2(bpp) | tiled->surface.u.gfx9.swizzle_mode << 3 | - tiled->surface.u.gfx9.resource_type << 9 | + (is_v7 ? 0 : tiled->surface.u.gfx9.resource_type << 9) | (is_v5 ? tiled->buffer.b.b.last_level : tiled->surface.u.gfx9.epitch) << 16); radeon_emit((uint32_t)linear_address); radeon_emit((uint32_t)(linear_address >> 32)); @@ -409,6 +420,7 @@ bool si_sdma_copy_image(struct si_context *sctx, struct si_texture *dst, struct case GFX10_3: case GFX11: case GFX11_5: + case GFX12: if (!si_sdma_v4_v5_copy_texture(sctx, dst, src)) return false; break; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 383d25c948d..c479a1d10f2 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -254,7 +254,10 @@ static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args, static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader) { ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id); - if (shader->key.ge.as_ls) { + + if (shader->selector->screen->info.gfx_level >= GFX12) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + } else if (shader->key.ge.as_ls) { if (shader->selector->screen->info.gfx_level >= GFX11) { ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ @@ -520,11 +523,17 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) } /* VGPRs (first GS, then VS/TES) */ - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]); + if (sel->screen->info.gfx_level >= GFX12) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, 
&args->ac.gs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]); + } else { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]); + } if (stage == MESA_SHADER_VERTEX) { declare_vs_input_vgprs(args, shader); @@ -543,7 +552,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) /* ES return values are inputs to GS. */ for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++) ac_add_return(&args->ac, AC_ARG_SGPR); - for (i = 0; i < 5; i++) + for (i = 0; i < (sel->screen->info.gfx_level >= GFX12 ? 3 : 5); i++) ac_add_return(&args->ac, AC_ARG_VGPR); } break; @@ -706,7 +715,13 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) /* Hardware SGPRs. */ for (i = 0; i < 3; i++) { if (shader->selector->info.uses_block_id[i]) { - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]); + /* GFX12 loads workgroup IDs into ttmp registers, so they are not input SGPRs, but we + * still need to set this to indicate that they are enabled (for ac_nir_to_llvm). + */ + if (sel->screen->info.gfx_level >= GFX12) + args->ac.workgroup_ids[i].used = true; + else + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]); } } if (shader->selector->info.uses_tg_size) @@ -854,8 +869,9 @@ unsigned si_get_shader_prefetch_size(struct si_shader *shader) /* INST_PREF_SIZE uses 128B granularity. * - GFX11: max 128 * 63 = 8064 + * - GFX12: max 128 * 255 = 32640 */ - unsigned max_pref_size = 63; + unsigned max_pref_size = shader->selector->screen->info.gfx_level >= GFX12 ? 255 : 63; unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128); return MIN2(max_pref_size, exec_size_gran128); @@ -1547,6 +1563,7 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances); fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling); fprintf(f, " opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout); + fprintf(f, " mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout); } if (stage <= MESA_SHADER_GEOMETRY) @@ -1865,6 +1882,7 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) .kill_pointsize = key->ge.opt.kill_pointsize, .kill_layer = key->ge.opt.kill_layer, .force_vrs = sel->screen->options.vrs2x2, + .use_gfx12_xfb_intrinsic = true, }; if (nir->info.stage == MESA_SHADER_VERTEX || diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 8b90301eb65..855a90b14e8 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -706,6 +706,9 @@ struct si_shader_key_ge { unsigned vs_export_prim_id : 1; /* VS and TES only */ unsigned gs_tri_strip_adj_fix : 1; /* GS only */ } u; + + /* Gfx12: When no streamout buffers are bound, streamout must be disabled. */ + unsigned remove_streamout : 1; } mono; /* Optimization flags for asynchronous compilation only. 
*/ @@ -953,6 +956,8 @@ struct si_shader { unsigned cb_shader_mask; unsigned db_shader_control; unsigned num_interp; + unsigned spi_gs_out_config_ps; + unsigned pa_sc_hisz_control; bool writes_samplemask; } ps; }; @@ -1069,7 +1074,8 @@ static inline bool si_shader_uses_streamout(const struct si_shader *shader) { return shader->selector->stage <= MESA_SHADER_GEOMETRY && shader->selector->info.enabled_streamout_buffer_mask && - !shader->key.ge.opt.remove_streamout; + !shader->key.ge.opt.remove_streamout && + !shader->key.ge.mono.remove_streamout; } static inline bool si_shader_uses_discard(struct si_shader *shader) diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index 9249e9b1c42..a0d427ffa8c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -668,10 +668,12 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir, info->uses_base_instance = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE); info->uses_invocationid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INVOCATION_ID); info->uses_grid_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_WORKGROUPS); - info->uses_tg_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_SUBGROUPS) || - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) || - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SUBGROUP_ID) || - si_should_clear_lds(sscreen, nir); + info->uses_tg_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_SUBGROUPS); + if (sscreen->info.gfx_level < GFX12) { + info->uses_tg_size |= BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) || + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SUBGROUP_ID) || + si_should_clear_lds(sscreen, nir); + } info->uses_variable_block_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_WORKGROUP_SIZE); info->uses_drawid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID); info->uses_primid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID) || diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index f5d6b8cf6b4..1342d0e6425 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -173,8 +173,8 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy ctx->screen->info.address32_hi); } - if (ctx->stage <= MESA_SHADER_GEOMETRY && ctx->shader->key.ge.as_ngg && - si_shader_uses_streamout(ctx->shader)) + if (ctx->screen->info.gfx_level < GFX12 && ctx->stage <= MESA_SHADER_GEOMETRY && + ctx->shader->key.ge.as_ngg && si_shader_uses_streamout(ctx->shader)) ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "amdgpu-gds-size", 256); ac_llvm_set_workgroup_size(ctx->main_fn.value, max_workgroup_size); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index a06ae356a7a..2da289d9e78 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -58,11 +58,17 @@ void si_llvm_es_build_end(struct si_shader_context *ctx) unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR; - ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[0], vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[1], vgpr++); - ret = 
si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_prim_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_invocation_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[2], vgpr++); + if (ctx->screen->info.gfx_level >= GFX12) { + ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[0], vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[1], vgpr++); + } else { + ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[0], vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[1], vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_invocation_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args->ac.gs_vtx_offset[2], vgpr++); + } ctx->return_value = ret; } diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 81cf051dedc..3d70756a601 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -337,12 +337,21 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is * incorrect with a non-linear thread order. + * + * On Gfx12, we always use a non-linear thread order if the workgroup X and Y sizes are + * divisible by 2. */ options.lower_local_invocation_index = - nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS; + nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS || + (sscreen->info.gfx_level >= GFX12 && + nir->info.cs.derivative_group == DERIVATIVE_GROUP_NONE && + (nir->info.workgroup_size_variable || + (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0))); NIR_PASS_V(nir, nir_lower_compute_system_values, &options); - if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) { + /* Gfx12 supports this in hw. */ + if (sscreen->info.gfx_level < GFX12 && + nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) { nir_opt_cse(nir); /* CSE load_local_invocation_id */ memset(&options, 0, sizeof(options)); options.shuffle_local_ids_for_quad_derivatives = true; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index f35eca7928d..f8a36060be0 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -84,7 +84,7 @@ static void si_emit_cb_render_state(struct si_context *sctx, unsigned index) uint32_t cb_dcc_control = 0; - if (sctx->gfx_level >= GFX8) { + if (sctx->gfx_level >= GFX8 && sctx->gfx_level < GFX12) { /* DCC MSAA workaround. * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- * COMBINER_DISABLE, but that would be more complicated. @@ -250,7 +250,23 @@ static void si_emit_cb_render_state(struct si_context *sctx, unsigned index) sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; } - if (sctx->screen->info.has_set_context_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + /* GFX12 doesn't have CB_FDCC_CONTROL. 
*/ + assert(cb_dcc_control == 0); + + radeon_begin(cs); + gfx12_begin_context_regs(); + gfx12_opt_set_context_reg(R_028850_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, + cb_target_mask); + gfx12_opt_set_context_reg(R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, + sx_ps_downconvert); + gfx12_opt_set_context_reg(R_028758_SX_BLEND_OPT_EPSILON, SI_TRACKED_SX_BLEND_OPT_EPSILON, + sx_blend_opt_epsilon); + gfx12_opt_set_context_reg(R_02875C_SX_BLEND_OPT_CONTROL, SI_TRACKED_SX_BLEND_OPT_CONTROL, + sx_blend_opt_control); + gfx12_end_context_regs(); + radeon_end(); /* don't track context rolls on GFX12 */ + } else if (sctx->screen->info.has_set_context_pairs_packed) { radeon_begin(cs); gfx11_begin_packed_context_regs(); gfx11_opt_set_context_reg(R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, @@ -496,7 +512,10 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, S_028B70_OFFSET_ROUND(0); } - si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, db_alpha_to_mask); + if (sctx->gfx_level >= GFX12) + si_pm4_set_reg(pm4, R_02807C_DB_ALPHA_TO_MASK, db_alpha_to_mask); + else + si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, db_alpha_to_mask); blend->cb_target_mask = 0; blend->cb_target_enabled_4bit = 0; @@ -659,7 +678,11 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, color_control |= S_028808_DISABLE_DUAL_QUAD(1); } - si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); + if (sctx->gfx_level >= GFX12) + si_pm4_set_reg(pm4, R_028858_CB_COLOR_CONTROL, color_control); + else + si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); + si_pm4_finalize(pm4); return blend; } @@ -871,7 +894,10 @@ static void si_emit_clip_state(struct si_context *sctx, unsigned index) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(R_0285BC_PA_CL_UCP_0_X, 6 * 4); + if (sctx->gfx_level >= GFX12) + radeon_set_context_reg_seq(R_0282D0_PA_CL_UCP_0_X, 6 * 4); + else + radeon_set_context_reg_seq(R_0285BC_PA_CL_UCP_0_X, 6 * 4); radeon_emit_array((uint32_t *)sctx->clip_state.ucp, 6 * 4); radeon_end(); } @@ -906,7 +932,16 @@ static void si_emit_clip_regs(struct si_context *sctx, unsigned index) S_028810_CLIP_DISABLE(window_space); unsigned pa_cl_vs_out_cntl = pa_cl_cntl | vs->pa_cl_vs_out_cntl; - if (sctx->screen->info.has_set_context_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + radeon_begin(&sctx->gfx_cs); + gfx12_begin_context_regs(); + gfx12_opt_set_context_reg(R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, + pa_cl_clip_cntl); + gfx12_opt_set_context_reg(R_028818_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, + pa_cl_vs_out_cntl); + gfx12_end_context_regs(); + radeon_end(); /* don't track context rolls on GFX12 */ + } else if (sctx->screen->info.has_set_context_pairs_packed) { radeon_begin(&sctx->gfx_cs); gfx11_begin_packed_context_regs(); gfx11_opt_set_context_reg(R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, @@ -1086,7 +1121,8 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) | /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */ - S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.gfx_level >= GFX10 ? + S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.gfx_level >= GFX10 && + sscreen->info.gfx_level < GFX12 ? 
polygon_mode_enabled || rs->perpendicular_end_caps : 0); if (sscreen->info.gfx_level >= GFX10) { @@ -1147,7 +1183,57 @@ static void si_pm4_emit_rasterizer(struct si_context *sctx, unsigned index) { struct si_state_rasterizer *state = sctx->queued.named.rasterizer; - if (sctx->screen->info.has_set_context_pairs_packed) { + if (sctx->screen->info.gfx_level >= GFX12) { + radeon_begin(&sctx->gfx_cs); + gfx12_begin_context_regs(); + if (state->line_stipple_enable) { + gfx12_opt_set_context_reg(R_028A0C_PA_SC_LINE_STIPPLE, SI_TRACKED_PA_SC_LINE_STIPPLE, + state->pa_sc_line_stipple); + } + + gfx12_opt_set_context_reg(R_028644_SPI_INTERP_CONTROL_0, SI_TRACKED_SPI_INTERP_CONTROL_0, + state->spi_interp_control_0); + gfx12_opt_set_context_reg(R_028A00_PA_SU_POINT_SIZE, SI_TRACKED_PA_SU_POINT_SIZE, + state->pa_su_point_size); + gfx12_opt_set_context_reg(R_028A04_PA_SU_POINT_MINMAX, SI_TRACKED_PA_SU_POINT_MINMAX, + state->pa_su_point_minmax); + gfx12_opt_set_context_reg(R_028A08_PA_SU_LINE_CNTL, SI_TRACKED_PA_SU_LINE_CNTL, + state->pa_su_line_cntl); + gfx12_opt_set_context_reg(R_028A48_PA_SC_MODE_CNTL_0, SI_TRACKED_PA_SC_MODE_CNTL_0, + state->pa_sc_mode_cntl_0); + gfx12_opt_set_context_reg(R_02881C_PA_SU_SC_MODE_CNTL, SI_TRACKED_PA_SU_SC_MODE_CNTL, + state->pa_su_sc_mode_cntl); + gfx12_opt_set_context_reg(R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, + state->pa_cl_ngg_cntl); + gfx12_opt_set_context_reg(R_028230_PA_SC_EDGERULE, SI_TRACKED_PA_SC_EDGERULE, + state->pa_sc_edgerule); + + if (state->uses_poly_offset && sctx->framebuffer.state.zsbuf) { + unsigned db_format_index = + ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index; + + gfx12_opt_set_context_reg(R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + SI_TRACKED_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + state->pa_su_poly_offset_db_fmt_cntl[db_format_index]); + gfx12_opt_set_context_reg(R_028B7C_PA_SU_POLY_OFFSET_CLAMP, + SI_TRACKED_PA_SU_POLY_OFFSET_CLAMP, + state->pa_su_poly_offset_clamp); + gfx12_opt_set_context_reg(R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, + SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_SCALE, + state->pa_su_poly_offset_frontback_scale); + gfx12_opt_set_context_reg(R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, + SI_TRACKED_PA_SU_POLY_OFFSET_FRONT_OFFSET, + state->pa_su_poly_offset_frontback_offset[db_format_index]); + gfx12_opt_set_context_reg(R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, + SI_TRACKED_PA_SU_POLY_OFFSET_BACK_SCALE, + state->pa_su_poly_offset_frontback_scale); + gfx12_opt_set_context_reg(R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, + SI_TRACKED_PA_SU_POLY_OFFSET_BACK_OFFSET, + state->pa_su_poly_offset_frontback_offset[db_format_index]); + } + gfx12_end_context_regs(); + radeon_end(); /* don't track context rolls on GFX12 */ + } else if (sctx->screen->info.has_set_context_pairs_packed) { radeon_begin(&sctx->gfx_cs); gfx11_begin_packed_context_regs(); gfx11_opt_set_context_reg(R_0286D4_SPI_INTERP_CONTROL_0, SI_TRACKED_SPI_INTERP_CONTROL_0, @@ -1350,21 +1436,29 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) */ static void si_emit_stencil_ref(struct si_context *sctx, unsigned index) { - struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; - struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; - radeon_begin(cs); - radeon_set_context_reg_seq(R_028430_DB_STENCILREFMASK, 2); - radeon_emit(S_028430_STENCILTESTVAL(ref->ref_value[0]) | - S_028430_STENCILMASK(dsa->valuemask[0]) | - S_028430_STENCILWRITEMASK(dsa->writemask[0]) | - S_028430_STENCILOPVAL(1)); - 
radeon_emit(S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | - S_028434_STENCILMASK_BF(dsa->valuemask[1]) | - S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | - S_028434_STENCILOPVAL_BF(1)); - radeon_end(); + if (sctx->gfx_level >= GFX12) { + radeon_begin(&sctx->gfx_cs); + radeon_set_context_reg(R_028088_DB_STENCIL_REF, + S_028088_TESTVAL(ref->ref_value[0]) | + S_028088_TESTVAL_BF(ref->ref_value[1])); + radeon_end(); + } else { + struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; + + radeon_begin(&sctx->gfx_cs); + radeon_set_context_reg_seq(R_028430_DB_STENCILREFMASK, 2); + radeon_emit(S_028430_STENCILTESTVAL(ref->ref_value[0]) | + S_028430_STENCILMASK(dsa->valuemask[0]) | + S_028430_STENCILWRITEMASK(dsa->writemask[0]) | + S_028430_STENCILOPVAL(1)); + radeon_emit(S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | + S_028434_STENCILMASK_BF(dsa->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | + S_028434_STENCILOPVAL_BF(1)); + radeon_end(); + } } static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref state) @@ -1432,6 +1526,7 @@ static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *st static void *si_create_dsa_state(struct pipe_context *ctx, const struct pipe_depth_stencil_alpha_state *state) { + struct si_context *sctx = (struct si_context *)ctx; struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); if (!dsa) { return NULL; @@ -1488,6 +1583,13 @@ static void *si_create_dsa_state(struct pipe_context *ctx, dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled; dsa->depth_bounds_enabled = state->depth_bounds_test; + if (sctx->gfx_level >= GFX12) { + dsa->db_stencil_read_mask = S_028090_TESTMASK(state->stencil[0].valuemask) | + S_028090_TESTMASK_BF(state->stencil[1].valuemask); + dsa->db_stencil_write_mask = S_028094_WRITEMASK(state->stencil[0].writemask) | + S_028094_WRITEMASK_BF(state->stencil[1].writemask); + } + bool zfunc_is_ordered = state->depth_func == PIPE_FUNC_NEVER || state->depth_func == PIPE_FUNC_LESS || state->depth_func == PIPE_FUNC_LEQUAL || state->depth_func == PIPE_FUNC_GREATER || @@ -1518,7 +1620,28 @@ static void si_pm4_emit_dsa(struct si_context *sctx, unsigned index) struct si_state_dsa *state = sctx->queued.named.dsa; assert(state && state != sctx->emitted.named.dsa); - if (sctx->screen->info.has_set_context_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + radeon_begin(&sctx->gfx_cs); + gfx12_begin_context_regs(); + gfx12_opt_set_context_reg(R_028070_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL, + state->db_depth_control); + if (state->stencil_enabled) { + gfx12_opt_set_context_reg(R_028074_DB_STENCIL_CONTROL, SI_TRACKED_DB_STENCIL_CONTROL, + state->db_stencil_control); + gfx12_opt_set_context_reg(R_028090_DB_STENCIL_READ_MASK, SI_TRACKED_DB_STENCIL_READ_MASK, + state->db_stencil_read_mask); + gfx12_opt_set_context_reg(R_028094_DB_STENCIL_WRITE_MASK, SI_TRACKED_DB_STENCIL_WRITE_MASK, + state->db_stencil_write_mask); + } + if (state->depth_bounds_enabled) { + gfx12_opt_set_context_reg(R_028050_DB_DEPTH_BOUNDS_MIN, SI_TRACKED_DB_DEPTH_BOUNDS_MIN, + fui(state->db_depth_bounds_min)); + gfx12_opt_set_context_reg(R_028054_DB_DEPTH_BOUNDS_MAX, SI_TRACKED_DB_DEPTH_BOUNDS_MAX, + fui(state->db_depth_bounds_max)); + } + gfx12_end_context_regs(); + radeon_end(); /* don't track context rolls on GFX12 */ + } else if (sctx->screen->info.has_set_context_pairs_packed) { radeon_begin(&sctx->gfx_cs); gfx11_begin_packed_context_regs(); 
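
The gfx11_opt_set_context_reg and gfx12_opt_set_context_reg helpers used throughout these emit paths share one idea: every tracked register keeps a shadow value in sctx->tracked_regs, and a write is skipped when the value has not changed since the last emit. A minimal sketch of that pattern under stand-in types and names (not the driver's real structs or helpers):

   #include <stdint.h>

   /* Stand-in for the driver's tracked-register shadow state. */
   struct tracked_regs {
      uint64_t saved_mask; /* bit N set = shadow value N is valid */
      uint32_t value[64];  /* shadow copies, indexed by tracked ID */
   };

   static void opt_set_reg(struct tracked_regs *t, unsigned tracked_id,
                           uint32_t value, void (*emit_reg)(uint32_t))
   {
      const uint64_t bit = 1ull << tracked_id;

      /* Emit only when the shadow is invalid or stale, then update it. */
      if (!(t->saved_mask & bit) || t->value[tracked_id] != value) {
         emit_reg(value);
         t->value[tracked_id] = value;
         t->saved_mask |= bit;
      }
   }

The writes that survive this filter are what end up in the SET_*_REG_PAIRS packets: unpacked offset/value pairs on Gfx12 (cf. struct gfx12_reg), versus the _PACKED variants used on Gfx11.
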
gfx11_opt_set_context_reg(R_028800_DB_DEPTH_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL, @@ -1586,7 +1709,9 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_pm4_bind_state(sctx, dsa, dsa); - if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part, + /* Gfx12 doesn't need to combine a DSA state with a stencil ref state. */ + if (sctx->gfx_level < GFX12 && + memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part, sizeof(struct si_dsa_stencil_ref_part)) != 0) { sctx->stencil_ref.dsa_part = dsa->stencil_ref; si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); @@ -1675,39 +1800,44 @@ static void si_emit_db_render_state(struct si_context *sctx, unsigned index) unsigned db_shader_control = 0, db_render_control = 0, db_count_control = 0, vrs_override_cntl = 0; /* DB_RENDER_CONTROL */ - if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) { - assert(sctx->gfx_level < GFX11); - db_render_control |= S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | - S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | - S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); - } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { - db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | - S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); + if (sctx->gfx_level >= GFX12) { + assert(!sctx->dbcb_depth_copy_enabled && !sctx->dbcb_stencil_copy_enabled); + assert(!sctx->db_flush_depth_inplace && !sctx->db_flush_stencil_inplace); + assert(!sctx->db_depth_clear && !sctx->db_stencil_clear); } else { - db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | - S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); - } - - if (sctx->gfx_level >= GFX11) { - unsigned max_allowed_tiles_in_wave; - - if (sctx->screen->info.has_dedicated_vram) { - if (sctx->framebuffer.nr_samples == 8) - max_allowed_tiles_in_wave = 6; - else if (sctx->framebuffer.nr_samples == 4) - max_allowed_tiles_in_wave = 13; - else - max_allowed_tiles_in_wave = 0; + if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) { + db_render_control |= S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | + S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | + S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); + } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { + db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | + S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); } else { - if (sctx->framebuffer.nr_samples == 8) - max_allowed_tiles_in_wave = 7; - else if (sctx->framebuffer.nr_samples == 4) - max_allowed_tiles_in_wave = 15; - else - max_allowed_tiles_in_wave = 0; + db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | + S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); } - db_render_control |= S_028000_MAX_ALLOWED_TILES_IN_WAVE(max_allowed_tiles_in_wave); + if (sctx->gfx_level >= GFX11) { + unsigned max_allowed_tiles_in_wave; + + if (sctx->screen->info.has_dedicated_vram) { + if (sctx->framebuffer.nr_samples == 8) + max_allowed_tiles_in_wave = 6; + else if (sctx->framebuffer.nr_samples == 4) + max_allowed_tiles_in_wave = 13; + else + max_allowed_tiles_in_wave = 0; + } else { + if (sctx->framebuffer.nr_samples == 8) + max_allowed_tiles_in_wave = 7; + else if (sctx->framebuffer.nr_samples == 4) + max_allowed_tiles_in_wave = 15; + else + max_allowed_tiles_in_wave = 0; + } + + 
db_render_control |= S_028000_MAX_ALLOWED_TILES_IN_WAVE(max_allowed_tiles_in_wave); + } } /* DB_COUNT_CONTROL (occlusion queries) */ @@ -1720,7 +1850,8 @@ static void si_emit_db_render_state(struct si_context *sctx, unsigned index) db_count_control |= S_028004_ZPASS_INCREMENT_DISABLE(1); } else { /* Occlusion queries enabled. */ - db_count_control |= S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); + if (sctx->gfx_level < GFX12) + db_count_control |= S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); if (sctx->gfx_level >= GFX7) { db_count_control |= S_028004_ZPASS_ENABLE(1) | @@ -1790,7 +1921,24 @@ static void si_emit_db_render_state(struct si_context *sctx, unsigned index) S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) | S_028010_CENTROID_COMPUTATION_MODE(sctx->gfx_level >= GFX10_3 ? 1 : 0); - if (sctx->screen->info.has_set_context_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + /* Gfx12 sets DB_RENDER_CONTROL in the gfx preamble. */ + assert(db_render_control == 0); + + radeon_begin(&sctx->gfx_cs); + gfx12_begin_context_regs(); + gfx12_opt_set_context_reg(R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2, + S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) | + S_028010_CENTROID_COMPUTATION_MODE(1)); + gfx12_opt_set_context_reg(R_028060_DB_COUNT_CONTROL, SI_TRACKED_DB_COUNT_CONTROL, + db_count_control); + gfx12_opt_set_context_reg(R_02806C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, + db_shader_control); + gfx12_opt_set_context_reg(R_0283D0_PA_SC_VRS_OVERRIDE_CNTL, + SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); + gfx12_end_context_regs(); + radeon_end(); /* don't track context rolls on GFX12 */ + } else if (sctx->screen->info.has_set_context_pairs_packed) { radeon_begin(&sctx->gfx_cs); gfx11_begin_packed_context_regs(); gfx11_opt_set_context_reg(R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, @@ -1881,7 +2029,7 @@ static uint32_t si_translate_dbformat(enum pipe_format format) case PIPE_FORMAT_X8Z24_UNORM: case PIPE_FORMAT_Z24X8_UNORM: case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return V_028040_Z_24; /* deprecated on AMD GCN */ + return V_028040_Z_24; /* not present on GFX12 */ case PIPE_FORMAT_Z32_FLOAT: case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: return V_028040_Z_32_FLOAT; @@ -2661,12 +2809,12 @@ static void si_initialize_color_surface(struct si_context *sctx, struct si_surfa unsigned width0 = surf->width0; /* GFX10.3+ can set a custom pitch for 1D and 2D non-array, but it must be a multiple of - * 256B. + * 256B for GFX10.3-11 and 128B for GFX12. * * We set the pitch in MIP0_WIDTH. */ if (sctx->gfx_level >= GFX10_3 && tex->surface.u.gfx9.uses_custom_pitch) { - ASSERTED unsigned min_alignment = 256; + ASSERTED unsigned min_alignment = sctx->gfx_level >= GFX12 ? 128 : 256; assert((tex->surface.u.gfx9.surf_pitch * tex->surface.bpe) % min_alignment == 0); assert(tex->buffer.b.b.target == PIPE_TEXTURE_2D || tex->buffer.b.b.target == PIPE_TEXTURE_RECT); @@ -2679,7 +2827,26 @@ static void si_initialize_color_surface(struct si_context *sctx, struct si_surfa width0 *= 2; } - if (sctx->gfx_level >= GFX10) { + if (sctx->gfx_level >= GFX12) { + /* Gfx12. 
*/ + assert(!UTIL_ARCH_BIG_ENDIAN); + surf->cb_color_info |= S_028EC0_FORMAT(format); + surf->cb_color_view = S_028C64_SLICE_START(surf->base.u.tex.first_layer) | + S_028C64_SLICE_MAX(surf->base.u.tex.last_layer); + surf->cb_color_view2 = S_028C68_MIP_LEVEL(surf->base.u.tex.level); + surf->cb_color_attrib = S_028C6C_NUM_FRAGMENTS(log_fragments) | + S_028C6C_FORCE_DST_ALPHA_1(force_dst_alpha_1); + surf->cb_color_attrib2 = S_028C78_MIP0_HEIGHT(surf->height0 - 1) | + S_028C78_MIP0_WIDTH(width0 - 1); + surf->cb_color_attrib3 = S_028C7C_MIP0_DEPTH(util_max_layer(&tex->buffer.b.b, 0)) | + S_028C7C_MAX_MIP(tex->buffer.b.b.last_level) | + S_028C7C_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); + surf->cb_dcc_control = S_028C70_MAX_UNCOMPRESSED_BLOCK_SIZE(1) | /* 256B */ + S_028C70_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.color.dcc.max_compressed_block_size) | + S_028C70_ENABLE_MAX_COMP_FRAG_OVERRIDE(1) | + S_028C70_MAX_COMP_FRAGS(tex->buffer.b.b.nr_samples >= 8 ? 3 : + tex->buffer.b.b.nr_samples >= 4 ? 2 : 0); + } else if (sctx->gfx_level >= GFX10) { /* Gfx10-11. */ surf->cb_color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer) | @@ -2772,7 +2939,9 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su format = si_translate_dbformat(tex->db_render_format); stencil_format = tex->surface.has_stencil ? V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; + assert(format != V_028040_Z_24 || sctx->gfx_level < GFX12); assert(format != V_028040_Z_INVALID); + if (format == V_028040_Z_INVALID) PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); @@ -2792,9 +2961,45 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su break; } - if (sctx->gfx_level >= GFX9) { - surf->db_htile_data_base = 0; - surf->db_htile_surface = 0; + if (sctx->gfx_level >= GFX12) { + surf->db_depth_view = S_028004_SLICE_START(surf->base.u.tex.first_layer) | + S_028004_SLICE_MAX(surf->base.u.tex.last_layer); + surf->u.gfx12.db_depth_view1 = S_028008_MIPID_GFX12(level); + surf->db_depth_size = S_028014_X_MAX(tex->buffer.b.b.width0 - 1) | + S_028014_Y_MAX(tex->buffer.b.b.height0 - 1); + surf->db_z_info = S_028018_FORMAT(format) | + S_028018_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | + S_028018_SW_MODE(tex->surface.u.gfx9.swizzle_mode) | + S_028018_MAXMIP(tex->buffer.b.b.last_level); + surf->db_stencil_info = S_02801C_FORMAT(stencil_format) | + S_02801C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode) | + S_02801C_TILE_STENCIL_DISABLE(1); + surf->db_depth_base = tex->buffer.gpu_address >> 8; + surf->db_stencil_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.zs.stencil_offset) >> 8; + surf->u.gfx12.hiz_info = 0; + surf->u.gfx12.his_info = 0; + + /* HiZ. */ + if (tex->surface.u.gfx9.zs.hiz.offset) { + surf->u.gfx12.hiz_info = S_028B94_SURFACE_ENABLE(1) | + S_028B94_FORMAT(0) | /* unorm16 */ + S_028B94_SW_MODE(tex->surface.u.gfx9.zs.hiz.swizzle_mode); + surf->u.gfx12.hiz_size_xy = S_028BA4_X_MAX(tex->surface.u.gfx9.zs.hiz.width_in_tiles - 1) | + S_028BA4_Y_MAX(tex->surface.u.gfx9.zs.hiz.height_in_tiles - 1); + surf->u.gfx12.hiz_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.zs.hiz.offset) >> 8; + } + + /* HiS. 
*/ + if (tex->surface.u.gfx9.zs.his.offset) { + surf->u.gfx12.his_info = S_028B98_SURFACE_ENABLE(1) | + S_028B98_SW_MODE(tex->surface.u.gfx9.zs.his.swizzle_mode); + surf->u.gfx12.his_size_xy = S_028BB0_X_MAX(tex->surface.u.gfx9.zs.his.width_in_tiles - 1) | + S_028BB0_Y_MAX(tex->surface.u.gfx9.zs.his.height_in_tiles - 1); + surf->u.gfx12.his_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.zs.his.offset) >> 8; + } + } else if (sctx->gfx_level >= GFX9) { + surf->u.gfx6.db_htile_data_base = 0; + surf->u.gfx6.db_htile_surface = 0; surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | S_028008_SLICE_MAX(surf->base.u.tex.last_layer); if (sctx->gfx_level >= GFX10) { @@ -2815,8 +3020,8 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su S_028044_ITERATE_256(sctx->gfx_level >= GFX11); if (sctx->gfx_level == GFX9) { - surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.epitch); - surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.zs.stencil_epitch); + surf->u.gfx6.db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.epitch); + surf->u.gfx6.db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.zs.stencil_epitch); } surf->db_depth_view |= S_028008_MIPID_GFX9(level); surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | @@ -2834,11 +3039,11 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su surf->db_stencil_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); } - surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; - surf->db_htile_surface = S_028ABC_FULL_CACHE(1) | - S_028ABC_PIPE_ALIGNED(1); + surf->u.gfx6.db_htile_data_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; + surf->u.gfx6.db_htile_surface = S_028ABC_FULL_CACHE(1) | + S_028ABC_PIPE_ALIGNED(1); if (sctx->gfx_level == GFX9) { - surf->db_htile_surface |= S_028ABC_RB_ALIGNED(1); + surf->u.gfx6.db_htile_surface |= S_028ABC_RB_ALIGNED(1); } } } else { @@ -2851,14 +3056,14 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su (tex->buffer.gpu_address >> 8) + tex->surface.u.legacy.level[level].offset_256B; surf->db_stencil_base = (tex->buffer.gpu_address >> 8) + tex->surface.u.legacy.zs.stencil_level[level].offset_256B; - surf->db_htile_data_base = 0; - surf->db_htile_surface = 0; + surf->u.gfx6.db_htile_data_base = 0; + surf->u.gfx6.db_htile_surface = 0; surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | S_028008_SLICE_MAX(surf->base.u.tex.last_layer); surf->db_z_info = S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); surf->db_stencil_info = S_028044_FORMAT(stencil_format); - surf->db_depth_info = 0; + surf->u.gfx6.db_depth_info = 0; if (sctx->gfx_level >= GFX7) { struct radeon_info *info = &sctx->screen->info; @@ -2869,12 +3074,12 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; - surf->db_depth_info |= S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | - S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | - S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | - S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | - S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | - S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); + surf->u.gfx6.db_depth_info |= 
S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | + S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | + S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | + S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | + S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | + S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); surf->db_z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); surf->db_stencil_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); } else { @@ -2886,7 +3091,7 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) | S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1); - surf->db_depth_slice = + surf->u.gfx6.db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1); if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { @@ -2910,8 +3115,8 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su surf->db_stencil_info |= S_028044_ALLOW_EXPCLEAR(1); } - surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; - surf->db_htile_surface = S_028ABC_FULL_CACHE(1); + surf->u.gfx6.db_htile_data_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; + surf->u.gfx6.db_htile_surface = S_028ABC_FULL_CACHE(1); } } @@ -2920,6 +3125,8 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su void si_set_sampler_depth_decompress_mask(struct si_context *sctx, struct si_texture *tex) { + assert(sctx->gfx_level < GFX12); + /* Check all sampler bindings in all shaders where depth textures are bound, and update * which samplers should be decompressed. */ @@ -2935,7 +3142,7 @@ void si_set_sampler_depth_decompress_mask(struct si_context *sctx, struct si_tex void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) { - if (sctx->decompression_enabled) + if (sctx->gfx_level >= GFX12 || sctx->decompression_enabled) return; if (sctx->framebuffer.state.zsbuf) { @@ -2980,6 +3187,8 @@ static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *sta void si_mark_display_dcc_dirty(struct si_context *sctx, struct si_texture *tex) { + assert(sctx->gfx_level < GFX12); + if (!tex->surface.display_dcc_offset || tex->displayable_dcc_dirty) return; @@ -3020,6 +3229,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, uint8_t old_db_format_index = old_has_zsbuf ? ((struct si_surface *)sctx->framebuffer.state.zsbuf)->db_format_index : -1; + bool old_has_hiz_his = sctx->framebuffer.has_hiz_his; int i; /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs @@ -3082,13 +3292,15 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH; si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); - /* u_blitter doesn't invoke depth decompression when it does multiple - * blits in a row, but the only case when it matters for DB is when - * doing generate_mipmap. So here we flush DB manually between - * individual generate_mipmap blits. - * Note that lower mipmap levels aren't compressed. 
- */ - if (sctx->generate_mipmap_for_depth) { + if (sctx->gfx_level >= GFX12) { + si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, true, false); + } else if (sctx->generate_mipmap_for_depth) { + /* u_blitter doesn't invoke depth decompression when it does multiple + * blits in a row, but the only case when it matters for DB is when + * doing generate_mipmap. So here we flush DB manually between + * individual generate_mipmap blits. + * Note that lower mipmap levels aren't compressed. + */ si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata); } else if (sctx->gfx_level == GFX9) { /* It appears that DB metadata "leaks" in a sequence of: @@ -3134,6 +3346,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.min_bytes_per_pixel = 0; sctx->framebuffer.disable_vrs_flat_shading = false; sctx->framebuffer.has_stencil = false; + sctx->framebuffer.has_hiz_his = false; for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -3179,7 +3392,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { sctx->framebuffer.CB_has_shader_readable_metadata = true; - if (sctx->gfx_level >= GFX9 && !tex->surface.u.gfx9.color.dcc.pipe_aligned) + if (sctx->gfx_level >= GFX9 && sctx->gfx_level < GFX12 && + !tex->surface.u.gfx9.color.dcc.pipe_aligned) sctx->framebuffer.all_DCC_pipe_aligned = false; if (tex->buffer.b.b.nr_storage_samples >= 2) @@ -3212,7 +3426,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_init_depth_surface(sctx, surf); } - if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS)) + if (sctx->gfx_level < GFX12 && + vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS)) sctx->framebuffer.DB_has_shader_readable_metadata = true; /* Update the minimum but don't keep 0. */ @@ -3223,10 +3438,15 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, /* Update polygon offset based on the Z format. */ if (sctx->queued.named.rasterizer->uses_poly_offset && surf->db_format_index != old_db_format_index) - (sctx)->dirty_atoms |= SI_STATE_BIT(rasterizer); + sctx->dirty_atoms |= SI_STATE_BIT(rasterizer); if (util_format_has_stencil(util_format_description(zstex->buffer.b.b.format))) sctx->framebuffer.has_stencil = true; + + if (sctx->gfx_level >= GFX12) { + sctx->framebuffer.has_hiz_his = zstex->surface.u.gfx9.zs.hiz.offset || + zstex->surface.u.gfx9.zs.his.offset; + } } si_update_ps_colorbuf0_slot(sctx); @@ -3240,7 +3460,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (sctx->screen->dpbb_allowed) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) + if (sctx->framebuffer.any_dst_linear != old_any_dst_linear || + sctx->framebuffer.has_hiz_his != old_has_hiz_his) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); if (sctx->screen->info.has_out_of_order_rast && @@ -3301,7 +3522,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_update_vrs_flat_shading(sctx); sctx->do_update_shaders = true; - if (!sctx->decompression_enabled) { + if (sctx->gfx_level < GFX12 && !sctx->decompression_enabled) { /* Prevent textures decompression when the framebuffer state * changes come from the decompression passes themselves. 
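 * (Those passes bind their own framebuffers while blitting, so re-running this
 * analysis for them would flag the very textures being decompressed and recurse.)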
*/ @@ -3309,7 +3530,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, } } -static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) +static void gfx6_emit_framebuffer_state(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct pipe_framebuffer_state *state = &sctx->framebuffer.state; @@ -3616,7 +3837,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) struct si_texture *tex = (struct si_texture *)zb->base.texture; unsigned db_z_info = zb->db_z_info; unsigned db_stencil_info = zb->db_stencil_info; - unsigned db_htile_surface = zb->db_htile_surface; + unsigned db_htile_surface = zb->u.gfx6.db_htile_surface; radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | (zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA @@ -3653,7 +3874,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) unsigned level = zb->base.u.tex.level; if (sctx->gfx_level >= GFX10) { - radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->u.gfx6.db_htile_data_base); radeon_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); if (sctx->gfx_level >= GFX11) { @@ -3675,11 +3896,11 @@ static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ radeon_emit(zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_emit(zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(zb->u.gfx6.db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ } else if (sctx->gfx_level == GFX9) { radeon_set_context_reg_seq(R_028014_DB_HTILE_DATA_BASE, 3); - radeon_emit(zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ - radeon_emit(S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(zb->u.gfx6.db_htile_data_base); /* DB_HTILE_DATA_BASE */ + radeon_emit(S_028018_BASE_HI(zb->u.gfx6.db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 10); @@ -3696,8 +3917,8 @@ static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) radeon_emit(S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ radeon_set_context_reg_seq(R_028068_DB_Z_INFO2, 2); - radeon_emit(zb->db_z_info2); /* DB_Z_INFO2 */ - radeon_emit(zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ + radeon_emit(zb->u.gfx6.db_z_info2); /* DB_Z_INFO2 */ + radeon_emit(zb->u.gfx6.db_stencil_info2); /* DB_STENCIL_INFO2 */ } else { /* GFX6-GFX8 */ /* Set fields dependent on tc_compatile_htile. 
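 * (With TC-compatible HTILE the texture unit reads compressed Z directly, so the
 * DB fields below presumably have to stay consistent with what the sampler path
 * expects.)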
*/ @@ -3715,10 +3936,10 @@ static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) } } - radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->u.gfx6.db_htile_data_base); radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(zb->db_depth_info | /* DB_DEPTH_INFO */ + radeon_emit(zb->u.gfx6.db_depth_info | /* DB_DEPTH_INFO */ S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile)); radeon_emit(db_z_info | /* DB_Z_INFO */ S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); @@ -3728,7 +3949,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_emit(zb->db_depth_slice); /* DB_DEPTH_SLICE */ + radeon_emit(zb->u.gfx6.db_depth_slice); /* DB_DEPTH_SLICE */ } radeon_set_context_reg_seq(R_028028_DB_STENCIL_CLEAR, 2); @@ -3875,7 +4096,7 @@ static void gfx11_dgpu_emit_framebuffer_state(struct si_context *sctx, unsigned struct si_texture *tex = (struct si_texture *)zb->base.texture; unsigned db_z_info = zb->db_z_info; unsigned db_stencil_info = zb->db_stencil_info; - unsigned db_htile_surface = zb->db_htile_surface; + unsigned db_htile_surface = zb->u.gfx6.db_htile_surface; radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | (zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA @@ -3905,7 +4126,7 @@ static void gfx11_dgpu_emit_framebuffer_state(struct si_context *sctx, unsigned unsigned level = zb->base.u.tex.level; - gfx11_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + gfx11_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->u.gfx6.db_htile_data_base); gfx11_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); gfx11_set_context_reg(R_028040_DB_Z_INFO, db_z_info | S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); @@ -3918,7 +4139,7 @@ static void gfx11_dgpu_emit_framebuffer_state(struct si_context *sctx, unsigned gfx11_set_context_reg(R_02806C_DB_STENCIL_READ_BASE_HI, zb->db_stencil_base >> 32); gfx11_set_context_reg(R_028070_DB_Z_WRITE_BASE_HI, zb->db_depth_base >> 32); gfx11_set_context_reg(R_028074_DB_STENCIL_WRITE_BASE_HI, zb->db_stencil_base >> 32); - gfx11_set_context_reg(R_028078_DB_HTILE_DATA_BASE_HI, zb->db_htile_data_base >> 32); + gfx11_set_context_reg(R_028078_DB_HTILE_DATA_BASE_HI, zb->u.gfx6.db_htile_data_base >> 32); gfx11_set_context_reg(R_028028_DB_STENCIL_CLEAR, tex->stencil_clear_value[level]); gfx11_set_context_reg(R_02802C_DB_DEPTH_CLEAR, fui(tex->depth_clear_value[level])); gfx11_set_context_reg(R_028008_DB_DEPTH_VIEW, zb->db_depth_view); @@ -3953,6 +4174,137 @@ static void gfx11_dgpu_emit_framebuffer_state(struct si_context *sctx, unsigned sctx->framebuffer.dirty_zsbuf = false; } +static void gfx12_emit_framebuffer_state(struct si_context *sctx, unsigned index) +{ + struct radeon_cmdbuf *cs = &sctx->gfx_cs; + struct pipe_framebuffer_state *state = &sctx->framebuffer.state; + unsigned i, nr_cbufs = state->nr_cbufs; + struct si_texture *tex = NULL; + struct si_surface *cb = NULL; + bool is_msaa_resolve = state->nr_cbufs == 2 && + state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 && + state->cbufs[1] && state->cbufs[1]->texture->nr_samples <= 1; + + /* CB can't do MSAA resolve. 
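+ * (FMASK was already removed on Gfx11, so there is no fixed-function CB resolve
+ * path here either; MSAA resolves are expected to be lowered to shader blits
+ * before a two-colorbuffer resolve layout could reach this function.)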
*/ + assert(!is_msaa_resolve); + + radeon_begin(cs); + gfx12_begin_context_regs(); + + /* Colorbuffers. */ + for (i = 0; i < nr_cbufs; i++) { + if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) + continue; + + /* RB+ depth-only rendering. See the comment where we set rbplus_depth_only_opt for more + * information. + */ + if (i == 0 && + sctx->screen->info.rbplus_allowed && + !sctx->queued.named.blend->cb_target_mask) { + gfx12_set_context_reg(R_028EC0_CB_COLOR0_INFO + i * 4, + S_028EC0_FORMAT(V_028C70_COLOR_32) | + S_028EC0_NUMBER_TYPE(V_028C70_NUMBER_FLOAT)); + continue; + } + + cb = (struct si_surface *)state->cbufs[i]; + if (!cb) { + gfx12_set_context_reg(R_028EC0_CB_COLOR0_INFO + i * 4, + S_028EC0_FORMAT(V_028C70_COLOR_INVALID)); + continue; + } + + tex = (struct si_texture *)cb->base.texture; + radeon_add_to_buffer_list( + sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | RADEON_USAGE_CB_NEEDS_IMPLICIT_SYNC | + (tex->buffer.b.b.nr_samples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER)); + + /* Compute mutable surface parameters. */ + uint64_t cb_color_base = ((tex->buffer.gpu_address + tex->surface.u.gfx9.surf_offset) >> 8) | + tex->surface.tile_swizzle; + + gfx12_set_context_reg(R_028C60_CB_COLOR0_BASE + i * 0x24, cb_color_base); + gfx12_set_context_reg(R_028C64_CB_COLOR0_VIEW + i * 0x24, cb->cb_color_view); + gfx12_set_context_reg(R_028C68_CB_COLOR0_VIEW2 + i * 0x24, cb->cb_color_view2); + gfx12_set_context_reg(R_028C6C_CB_COLOR0_ATTRIB + i * 0x24, cb->cb_color_attrib); + gfx12_set_context_reg(R_028C70_CB_COLOR0_FDCC_CONTROL + i * 0x24, cb->cb_dcc_control); + gfx12_set_context_reg(R_028C78_CB_COLOR0_ATTRIB2 + i * 0x24, cb->cb_color_attrib2); + gfx12_set_context_reg(R_028C7C_CB_COLOR0_ATTRIB3 + i * 0x24, + cb->cb_color_attrib3 | + S_028C7C_COLOR_SW_MODE(tex->surface.u.gfx9.swizzle_mode)); + gfx12_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + gfx12_set_context_reg(R_028EC0_CB_COLOR0_INFO + i * 4, cb->cb_color_info); + } + /* Set unbound colorbuffers. */ + for (; i < 8; i++) + if (sctx->framebuffer.dirty_cbufs & (1 << i)) + gfx12_set_context_reg(R_028EC0_CB_COLOR0_INFO + i * 4, 0); + + /* ZS buffer. */ + if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { + struct si_surface *zb = (struct si_surface *)state->zsbuf; + struct si_texture *tex = (struct si_texture *)zb->base.texture; + + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, &tex->buffer, + RADEON_USAGE_READWRITE | RADEON_USAGE_DB_NEEDS_IMPLICIT_SYNC | + (zb->base.texture->nr_samples > 1 ? 
RADEON_PRIO_DEPTH_BUFFER_MSAA + : RADEON_PRIO_DEPTH_BUFFER)); + gfx12_set_context_reg(R_028004_DB_DEPTH_VIEW, zb->db_depth_view); + gfx12_set_context_reg(R_028008_DB_DEPTH_VIEW1, zb->u.gfx12.db_depth_view1); + gfx12_set_context_reg(R_028014_DB_DEPTH_SIZE_XY, zb->db_depth_size); + gfx12_set_context_reg(R_028018_DB_Z_INFO, zb->db_z_info); + gfx12_set_context_reg(R_02801C_DB_STENCIL_INFO, zb->db_stencil_info); + gfx12_set_context_reg(R_028020_DB_Z_READ_BASE, zb->db_depth_base); + gfx12_set_context_reg(R_028024_DB_Z_READ_BASE_HI, zb->db_depth_base >> 32); + gfx12_set_context_reg(R_028028_DB_Z_WRITE_BASE, zb->db_depth_base); + gfx12_set_context_reg(R_02802C_DB_Z_WRITE_BASE_HI, zb->db_depth_base >> 32); + gfx12_set_context_reg(R_028030_DB_STENCIL_READ_BASE, zb->db_stencil_base); + gfx12_set_context_reg(R_028034_DB_STENCIL_READ_BASE_HI, zb->db_stencil_base >> 32); + gfx12_set_context_reg(R_028038_DB_STENCIL_WRITE_BASE, zb->db_stencil_base); + gfx12_set_context_reg(R_02803C_DB_STENCIL_WRITE_BASE_HI, zb->db_stencil_base >> 32); + gfx12_set_context_reg(R_028B94_PA_SC_HIZ_INFO, zb->u.gfx12.hiz_info); + gfx12_set_context_reg(R_028B98_PA_SC_HIS_INFO, zb->u.gfx12.his_info); + + if (zb->u.gfx12.hiz_info) { + gfx12_set_context_reg(R_028B9C_PA_SC_HIZ_BASE, zb->u.gfx12.hiz_base); + gfx12_set_context_reg(R_028BA0_PA_SC_HIZ_BASE_EXT, zb->u.gfx12.hiz_base >> 32); + gfx12_set_context_reg(R_028BA4_PA_SC_HIZ_SIZE_XY, zb->u.gfx12.hiz_size_xy); + } + if (zb->u.gfx12.his_info) { + gfx12_set_context_reg(R_028BA8_PA_SC_HIS_BASE, zb->u.gfx12.his_base); + gfx12_set_context_reg(R_028BAC_PA_SC_HIS_BASE_EXT, zb->u.gfx12.his_base >> 32); + gfx12_set_context_reg(R_028BB0_PA_SC_HIS_SIZE_XY, zb->u.gfx12.his_size_xy); + } + } else if (sctx->framebuffer.dirty_zsbuf) { + gfx12_set_context_reg(R_028018_DB_Z_INFO, + S_028040_FORMAT(V_028040_Z_INVALID) | + S_028040_NUM_SAMPLES(sctx->framebuffer.log_samples)); + gfx12_set_context_reg(R_02801C_DB_STENCIL_INFO, + S_028044_FORMAT(V_028044_STENCIL_INVALID)| + S_028044_TILE_STENCIL_DISABLE(1)); + gfx12_set_context_reg(R_028B94_PA_SC_HIZ_INFO, S_028B94_SURFACE_ENABLE(0)); + gfx12_set_context_reg(R_028B98_PA_SC_HIS_INFO, S_028B98_SURFACE_ENABLE(0)); + } + + /* Framebuffer dimensions. */ + /* PA_SC_WINDOW_SCISSOR_TL is set in gfx12_init_gfx_preamble_state */ + gfx12_set_context_reg(R_028208_PA_SC_WINDOW_SCISSOR_BR, + S_028208_BR_X(state->width - 1) | /* inclusive */ + S_028208_BR_Y(state->height - 1)); /* inclusive */ + gfx12_end_context_regs(); + + if (sctx->screen->dpbb_allowed && + sctx->screen->pbb_context_states_per_bin > 1) { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + radeon_end(); + + sctx->framebuffer.dirty_cbufs = 0; + sctx->framebuffer.dirty_zsbuf = false; +} + static bool si_out_of_order_rasterization(struct si_context *sctx) { struct si_state_blend *blend = sctx->queued.named.blend; @@ -4022,12 +4374,15 @@ static void si_emit_msaa_config(struct si_context *sctx, unsigned index) S_028A4C_WALK_SIZE(dst_is_linear) | S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) | S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) | - S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) | + S_028A4C_OUT_OF_ORDER_WATER_MARK(sctx->gfx_level >= GFX12 ? 0 : 0x7) | + /* This should also be 0 when the VRS image is enabled. 
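+ * Only the HiZ/HiS condition is checked below; a bound VRS image would
+ * presumably have to clear the field in the same way.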
*/ + S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(!sctx->framebuffer.has_hiz_his) | /* always 1: */ - S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | + S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1); - unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) | + unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | + S_028804_INCOHERENT_EQAA_READS(sctx->gfx_level < GFX12) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); unsigned coverage_samples, z_samples; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; @@ -4095,9 +4450,12 @@ static void si_emit_msaa_config(struct si_context *sctx, unsigned index) (sctx->family == CHIP_VEGA20 || sctx->gfx_level >= GFX10)); sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | - S_028BE0_MAX_SAMPLE_DIST(si_msaa_max_distance[log_samples]) | - S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | - S_028BE0_COVERED_CENTROID_IS_CENTER(sctx->gfx_level >= GFX10_3); + S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); + + if (sctx->gfx_level < GFX12) { + sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(si_msaa_max_distance[log_samples]) | + S_028BE0_COVERED_CENTROID_IS_CENTER(sctx->gfx_level >= GFX10_3); + } } if (sctx->framebuffer.nr_samples > 1 || @@ -4113,17 +4471,35 @@ static void si_emit_msaa_config(struct si_context *sctx, unsigned index) unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); if (sctx->framebuffer.nr_samples > 1) { - db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | - S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | - S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | - S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); + if (sctx->gfx_level >= GFX12) { + sc_aa_config |= S_028BE0_PS_ITER_SAMPLES(log_ps_iter_samples); + db_eqaa |= S_028078_MASK_EXPORT_NUM_SAMPLES(log_samples) | + S_028078_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); + } else { + db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | + S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | + S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | + S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); + } sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1); } else if (sctx->smoothing_enabled) { db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples); } } - if (sctx->screen->info.has_set_context_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + radeon_begin(cs); + gfx12_begin_context_regs(); + gfx12_opt_set_context_reg(R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, + sc_line_cntl); + gfx12_opt_set_context_reg(R_028BE0_PA_SC_AA_CONFIG, SI_TRACKED_PA_SC_AA_CONFIG, + sc_aa_config); + gfx12_opt_set_context_reg(R_028078_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); + gfx12_opt_set_context_reg(R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + gfx12_end_context_regs(); + radeon_end(); /* don't track context rolls on GFX12 */ + } else if (sctx->screen->info.has_set_context_pairs_packed) { radeon_begin(cs); gfx11_begin_packed_context_regs(); gfx11_opt_set_context_reg(R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, @@ -4252,7 +4628,9 @@ void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf * else: * offset+payload > NUM_RECORDS */ - state[7] |= S_008F0C_FORMAT_GFX10(fmt->img_format) | + state[7] |= (screen->info.gfx_level >= GFX12 ? 
+ S_008F0C_FORMAT_GFX12(fmt->img_format) : + S_008F0C_FORMAT_GFX10(fmt->img_format)) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | S_008F0C_RESOURCE_LEVEL(screen->info.gfx_level < GFX11); } else { @@ -4372,7 +4750,8 @@ static void cdna_emu_make_image_descriptor(struct si_screen *screen, struct si_t if (screen->info.gfx_level >= GFX10) { const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&screen->info)[pipe_format]; - state[3] |= S_008F0C_FORMAT_GFX10(fmt->img_format) | + state[3] |= (screen->info.gfx_level >= GFX12 ? S_008F0C_FORMAT_GFX12(fmt->img_format) : + S_008F0C_FORMAT_GFX10(fmt->img_format)) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | S_008F0C_RESOURCE_LEVEL(screen->info.gfx_level < GFX11); } else { @@ -4475,43 +4854,78 @@ static void gfx10_make_texture_descriptor( } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) depth = res->array_size / 6; - state[0] = 0; - state[1] = S_00A004_FORMAT_GFX10(img_format) | S_00A004_WIDTH_LO(width - 1); - state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | - S_00A008_RESOURCE_LEVEL(screen->info.gfx_level < GFX11); + if (screen->info.gfx_level >= GFX12) { + unsigned max_mip = res->nr_samples > 1 ? util_logbase2(res->nr_samples) : + tex->buffer.b.b.last_level; + unsigned field_last_level = res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level; + unsigned field_depth = (type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer; - state[3] = - S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) | - S_00A00C_LAST_LEVEL_GFX10(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) | - S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type); - /* Depth is the the last accessible layer on gfx9+. The hw doesn't need - * to know the total number of layers. - */ - state[4] = - S_00A010_DEPTH_GFX10((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) | - S_00A010_BASE_ARRAY(first_layer); - state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | - S_00A014_PERF_MOD(4); - - unsigned max_mip = res->nr_samples > 1 ? util_logbase2(res->nr_samples) : - tex->buffer.b.b.last_level; - - if (screen->info.gfx_level >= GFX11) { - state[1] |= S_00A004_MAX_MIP_GFX11(max_mip); + state[0] = 0; + state[1] = S_00A004_MAX_MIP_GFX12(max_mip) | + S_00A004_FORMAT_GFX12(img_format) | + S_00A004_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) | + S_00A004_WIDTH_LO(width - 1); + state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | + S_00A008_HEIGHT(height - 1); + state[3] = S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_00A00C_NO_EDGE_CLAMP(res->last_level > 0 && + util_format_is_compressed(res->format) && + !util_format_is_compressed(pipe_format)) | + S_00A00C_LAST_LEVEL_GFX12(field_last_level) | + S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | + S_00A00C_TYPE(type); + /* Depth is the the last accessible layer on gfx9+. The hw doesn't need + * to know the total number of layers. 
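+ * E.g. a 2D array view of layers [4..7] gets BASE_ARRAY = 4 and DEPTH = 7
+ * (the last layer), not the layer count 8.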
+ */ + state[4] = S_00A010_DEPTH_GFX12(field_depth) | + S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_UAV3D(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler) | + S_00A014_PERF_MOD(4); + state[6] = S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(1 /*256B*/) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.color.dcc.max_compressed_block_size); + state[7] = 0; } else { - state[5] |= S_00A014_MAX_MIP(max_mip); - } - state[6] = 0; - state[7] = 0; + state[0] = 0; + state[1] = S_00A004_FORMAT_GFX10(img_format) | S_00A004_WIDTH_LO(width - 1); + state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(screen->info.gfx_level < GFX11); - if (vi_dcc_enabled(tex, first_level)) { - state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | - S_00A018_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.color.dcc.max_compressed_block_size) | - S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); + state[3] = + S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) | + S_00A00C_LAST_LEVEL_GFX10(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) | + S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type); + /* Depth is the the last accessible layer on gfx9+. The hw doesn't need + * to know the total number of layers. + */ + state[4] = + S_00A010_DEPTH_GFX10((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) | + S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | + S_00A014_PERF_MOD(4); + + unsigned max_mip = res->nr_samples > 1 ? util_logbase2(res->nr_samples) : + tex->buffer.b.b.last_level; + + if (screen->info.gfx_level >= GFX11) { + state[1] |= S_00A004_MAX_MIP_GFX11(max_mip); + } else { + state[5] |= S_00A014_MAX_MIP(max_mip); + } + state[6] = 0; + state[7] = 0; + + if (vi_dcc_enabled(tex, first_level)) { + state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.color.dcc.max_compressed_block_size) | + S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); + } } /* Initialize the sampler view for FMASK. */ @@ -5147,6 +5561,7 @@ static void *si_create_sampler_state(struct pipe_context *ctx, struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy; unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); + unsigned perf_mip = max_aniso_ratio ? max_aniso_ratio + 6 : 0; bool trunc_coord = (state->min_img_filter == PIPE_TEX_FILTER_NEAREST && state->mag_img_filter == PIPE_TEX_FILTER_NEAREST && state->compare_mode == PIPE_TEX_COMPARE_NONE) || @@ -5179,15 +5594,24 @@ static void *si_create_sampler_state(struct pipe_context *ctx, S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) | S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | S_008F30_TRUNC_COORD(trunc_coord)); - rstate->val[1] = (S_008F34_MIN_LOD_GFX6(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | - S_008F34_MAX_LOD_GFX6(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | - S_008F34_PERF_MIP(max_aniso_ratio ? 
max_aniso_ratio + 6 : 0)); + rstate->val[1] = 0; rstate->val[2] = (S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) | S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter))); rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, state->border_color_is_integer); + if (sscreen->info.gfx_level >= GFX12) { + rstate->val[1] |= S_008F34_MIN_LOD_GFX12(S_FIXED(CLAMP(state->min_lod, 0, 17), 8)) | + S_008F34_MAX_LOD_GFX12(S_FIXED(CLAMP(state->max_lod, 0, 17), 8)); + rstate->val[2] |= S_008F38_PERF_MIP_LO(perf_mip); + rstate->val[3] |= S_008F3C_PERF_MIP_HI(perf_mip >> 2); + } else { + rstate->val[1] |= S_008F34_MIN_LOD_GFX6(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | + S_008F34_MAX_LOD_GFX6(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | + S_008F34_PERF_MIP(perf_mip); + } + if (sscreen->info.gfx_level >= GFX10) { rstate->val[2] |= S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -32, 31), 8)) | S_008F38_ANISO_OVERRIDE_GFX10(1); @@ -5472,7 +5896,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, ASSERTED unsigned last_vertex_format = sscreen->info.gfx_level >= GFX11 ? 64 : 128; assert(fmt->img_format != 0 && fmt->img_format < last_vertex_format); v->elem[i].rsrc_word3 |= - S_008F0C_FORMAT_GFX10(fmt->img_format) | + (sscreen->info.gfx_level >= GFX12 ? + S_008F0C_FORMAT_GFX12(fmt->img_format) : + S_008F0C_FORMAT_GFX10(fmt->img_format)) | S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11) | /* OOB_SELECT chooses the out-of-bounds check: * - 1: index >= NUM_RECORDS (Structured) @@ -5769,6 +6195,11 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) if (sctx->screen->info.gfx_level <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER) sctx->flags |= SI_CONTEXT_WB_L2; + /* Indices and draw indirect don't use GL2. */ + if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope && + flags & (PIPE_BARRIER_INDEX_BUFFER | PIPE_BARRIER_INDIRECT_BUFFER)) + sctx->flags |= SI_CONTEXT_WB_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } @@ -5809,10 +6240,12 @@ void si_init_state_functions(struct si_context *sctx) sctx->atoms.s.pm4_states[SI_STATE_IDX(vs)].emit = si_pm4_emit_shader; sctx->atoms.s.pm4_states[SI_STATE_IDX(ps)].emit = si_pm4_emit_shader; - if (sctx->screen->info.has_set_context_pairs_packed) + if (sctx->gfx_level >= GFX12) + sctx->atoms.s.framebuffer.emit = gfx12_emit_framebuffer_state; + else if (sctx->screen->info.has_set_context_pairs_packed) sctx->atoms.s.framebuffer.emit = gfx11_dgpu_emit_framebuffer_state; else - sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; + sctx->atoms.s.framebuffer.emit = gfx6_emit_framebuffer_state; sctx->atoms.s.db_render_state.emit = si_emit_db_render_state; sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; @@ -5848,9 +6281,10 @@ void si_init_state_functions(struct si_context *sctx) } sctx->custom_blend_dcc_decompress = - si_create_blend_custom(sctx, sctx->gfx_level >= GFX11 ? - V_028808_CB_DCC_DECOMPRESS_GFX11 : - V_028808_CB_DCC_DECOMPRESS_GFX8); + si_create_blend_custom(sctx, + sctx->gfx_level >= GFX12 ? V_028858_CB_DCC_DECOMPRESS : + sctx->gfx_level >= GFX11 ? V_028808_CB_DCC_DECOMPRESS_GFX11 : + V_028808_CB_DCC_DECOMPRESS_GFX8); sctx->b.set_clip_state = si_set_clip_state; sctx->b.set_stencil_ref = si_set_stencil_ref; @@ -6505,10 +6939,276 @@ done: sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. 
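 * (TMZ (secure) submissions are built as a separate IB chain, so they keep their
 * own copy of the preamble.)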
*/ } +static void gfx12_init_gfx_preamble_state(struct si_context *sctx) +{ + struct si_screen *sscreen = sctx->screen; + uint64_t border_color_va = sctx->border_color_buffer->gpu_address; + uint32_t compute_cu_en = S_00B88C_SA0_CU_EN(sscreen->info.spi_cu_en) | + S_00B88C_SA1_CU_EN(sscreen->info.spi_cu_en); + unsigned num_se = sscreen->info.num_se; + unsigned color_write_policy, color_read_policy; + enum gfx12_store_temporal_hint color_write_temporal_hint, zs_write_temporal_hint; + enum gfx12_load_temporal_hint color_read_temporal_hint, zs_read_temporal_hint; + + if (sscreen->options.cache_rb_gl2) { + color_write_policy = V_028410_CACHE_LRU_WR; + color_read_policy = V_028410_CACHE_LRU_RD; + color_write_temporal_hint = gfx12_store_regular_temporal; + color_read_temporal_hint = gfx12_load_regular_temporal; + zs_write_temporal_hint = gfx12_store_regular_temporal; + zs_read_temporal_hint = gfx12_load_regular_temporal; + } else { + color_write_policy = V_028410_CACHE_STREAM; + color_read_policy = V_02807C_CACHE_NOA_GFX11; + color_write_temporal_hint = gfx12_store_near_non_temporal_far_regular_temporal; + color_read_temporal_hint = gfx12_load_near_non_temporal_far_regular_temporal; + zs_write_temporal_hint = gfx12_store_near_non_temporal_far_regular_temporal; + zs_read_temporal_hint = gfx12_load_near_non_temporal_far_regular_temporal; + } + + struct si_pm4_state *pm4 = si_pm4_create_sized(sscreen, 300, sctx->has_graphics); + if (!pm4) + return; + + if (sctx->has_graphics && !sctx->shadowing.registers) { + si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1)); + si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1)); + } + + if (sctx->has_graphics && sscreen->dpbb_allowed) { + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + + /* Non-graphics uconfig registers. */ + si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8); + si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(border_color_va >> 40)); + + /* Compute registers. */ + si_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); + si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B838_COMPUTE_DISPATCH_PKT_ADDR_LO, 0); + si_pm4_set_reg(pm4, R_00B83C_COMPUTE_DISPATCH_PKT_ADDR_HI, 0); + si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en); + si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, num_se > 1 ? compute_cu_en : 0); + si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, num_se > 2 ? compute_cu_en : 0); + si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, num_se > 3 ? compute_cu_en : 0); + si_pm4_set_reg(pm4, R_00B88C_COMPUTE_STATIC_THREAD_MGMT_SE8, num_se > 8 ? compute_cu_en : 0); + si_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0); + si_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0); + si_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0); + si_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0); + si_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, num_se > 4 ? compute_cu_en : 0); + si_pm4_set_reg(pm4, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, num_se > 5 ? compute_cu_en : 0); + si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, num_se > 6 ? compute_cu_en : 0); + si_pm4_set_reg(pm4, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, num_se > 7 ? 
compute_cu_en : 0); + si_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); + + if (!sctx->has_graphics) + goto done; + + /* Graphics registers. */ + /* Shader registers - PS */ + si_pm4_set_reg_idx3(pm4, R_00B018_SPI_SHADER_PGM_RSRC3_PS, + ac_apply_cu_en(S_00B018_CU_EN(0xffff), + C_00B018_CU_EN, 0, &sscreen->info)); + si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, + S_00B0C0_SOFT_GROUPING_EN(1) | + S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); + si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0); + si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0); + si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0); + si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0); + + /* Shader registers - GS */ + si_pm4_set_reg(pm4, R_00B218_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg_idx3(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + ac_apply_cu_en(0xfffffefe, 0, 0, &sscreen->info)); + si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0); + si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0); + si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0); + si_pm4_set_reg(pm4, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0); + + /* Shader registers - HS */ + si_pm4_set_reg(pm4, R_00B418_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg_idx3(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + ac_apply_cu_en(0xffffffff, 0, 0, &sscreen->info)); + si_pm4_set_reg(pm4, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0); + si_pm4_set_reg(pm4, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0); + si_pm4_set_reg(pm4, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0); + si_pm4_set_reg(pm4, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0); + + /* Context registers */ + si_pm4_set_reg(pm4, R_028000_DB_RENDER_CONTROL, 0); + si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); + si_pm4_set_reg(pm4, R_028040_DB_GL1_INTERFACE_CONTROL, 0); + si_pm4_set_reg(pm4, R_028048_DB_MEM_TEMPORAL, + S_028048_Z_TEMPORAL_READ(zs_read_temporal_hint) | + S_028048_Z_TEMPORAL_WRITE(zs_write_temporal_hint) | + S_028048_STENCIL_TEMPORAL_READ(zs_read_temporal_hint) | + S_028048_STENCIL_TEMPORAL_WRITE(zs_write_temporal_hint) | + S_028048_OCCLUSION_TEMPORAL_WRITE(gfx12_store_regular_temporal)); + si_pm4_set_reg(pm4, R_028064_DB_VIEWPORT_CONTROL, 0); + si_pm4_set_reg(pm4, R_028068_DB_SPI_VRS_CENTER_LOCATION, 0); + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); + si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40)); + si_pm4_set_reg(pm4, R_02808C_DB_STENCIL_OPVAL, S_02808C_OPVAL(1) | S_02808C_OPVAL_BF(1)); + si_pm4_set_reg(pm4, R_0280F8_SC_MEM_TEMPORAL, + S_0280F8_VRS_TEMPORAL_READ(gfx12_load_regular_temporal) | + S_0280F8_VRS_TEMPORAL_WRITE(gfx12_store_regular_temporal) | + S_0280F8_HIZ_TEMPORAL_READ(gfx12_load_regular_temporal) | + S_0280F8_HIZ_TEMPORAL_WRITE(gfx12_store_regular_temporal) | + S_0280F8_HIS_TEMPORAL_READ(gfx12_load_regular_temporal) | + S_0280F8_HIS_TEMPORAL_WRITE(gfx12_store_regular_temporal)); + si_pm4_set_reg(pm4, R_0280FC_SC_MEM_SPEC_READ, + S_0280FC_VRS_SPECULATIVE_READ(gfx12_spec_read_force_on) | + S_0280FC_HIZ_SPECULATIVE_READ(gfx12_spec_read_force_on) | + S_0280FC_HIS_SPECULATIVE_READ(gfx12_spec_read_force_on)); + + /* We don't need to initialize PA_SC_VPORT_* because we don't enable + * IMPLICIT_VPORT_SCISSOR_ENABLE, but it might be useful for Vulkan. 
+ * + * If you set IMPLICIT_VPORT_SCISSOR_ENABLE, PA_SC_VPORT_* will take effect and allows + * setting a scissor that covers the whole viewport. If you set VPORT_SCISSOR_ENABLE, + * PA_SC_VPORT_SCISSOR_* will take effect and allows setting a user scissor. If you set + * both enable bits, the hw will use the intersection of both. It allows separating implicit + * viewport scissors from user scissors. + */ + si_pm4_set_reg(pm4, R_028180_PA_SC_SCREEN_SCISSOR_TL, 0); + si_pm4_set_reg(pm4, R_028184_PA_SC_SCREEN_SCISSOR_BR, + S_028184_BR_X(65535) | S_028184_BR_Y(65535)); /* inclusive bounds */ + si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, 0); + si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, 0); + si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, + S_028244_BR_X(65535) | S_028244_BR_Y(65535)); /* inclusive bounds */ + si_pm4_set_reg(pm4, R_028358_PA_SC_SCREEN_EXTENT_CONTROL, 0); + si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, + sscreen->info.pa_sc_tile_steering_override); + si_pm4_set_reg(pm4, R_0283E0_PA_SC_VRS_INFO, 0); + + si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, + S_028410_COLOR_WR_POLICY_GFX11(color_write_policy) | + S_028410_COLOR_RD_POLICY(color_read_policy)); + si_pm4_set_reg(pm4, R_028648_SPI_SHADER_IDX_FORMAT, + S_028648_IDX0_EXPORT_FORMAT(V_028648_SPI_SHADER_1COMP)); + si_pm4_set_reg(pm4, R_0286E4_SPI_BARYC_SSAA_CNTL, S_0286E4_COVERED_CENTROID_IS_CENTER(1)); + si_pm4_set_reg(pm4, R_028750_SX_PS_DOWNCONVERT_CONTROL, 0xff); + si_pm4_set_reg(pm4, R_0287D4_PA_CL_POINT_X_RAD, 0); + si_pm4_set_reg(pm4, R_0287D8_PA_CL_POINT_Y_RAD, 0); + si_pm4_set_reg(pm4, R_0287DC_PA_CL_POINT_SIZE, 0); + si_pm4_set_reg(pm4, R_0287E0_PA_CL_POINT_CULL_RAD, 0); + si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); + si_pm4_set_reg(pm4, R_028824_PA_SU_LINE_STIPPLE_CNTL, 0); + si_pm4_set_reg(pm4, R_028828_PA_SU_LINE_STIPPLE_SCALE, 0); + /* If any sample location uses the -8 coordinate, the EXCLUSION fields should be set to 0. */ + si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, + S_02882C_XMAX_RIGHT_EXCLUSION(1) | + S_02882C_YMAX_BOTTOM_EXCLUSION(1)); + si_pm4_set_reg(pm4, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, + S_028830_SMALL_PRIM_FILTER_ENABLE(1) | + S_028830_SC_1XMSAA_COMPATIBLE_DISABLE(1) /* use sample locations even for MSAA 1x */); + si_pm4_set_reg(pm4, R_02883C_PA_SU_OVER_RASTERIZATION_CNTL, 0); + si_pm4_set_reg(pm4, R_028840_PA_STEREO_CNTL, S_028840_STEREO_MODE(1)); + + /* The rate combiners have no effect if they are disabled like this: + * VERTEX_RATE: BYPASS_VTX_RATE_COMBINER = 1 + * PRIMITIVE_RATE: BYPASS_PRIM_RATE_COMBINER = 1 + * HTILE_RATE: VRS_HTILE_ENCODING = 0 + * SAMPLE_ITER: PS_ITER_SAMPLE = 0 + * + * Use OVERRIDE, which will ignore results from previous combiners. + * (e.g. 
enabled sample shading overrides the vertex rate) + */ + si_pm4_set_reg(pm4, R_028848_PA_CL_VRS_CNTL, + S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE) | + S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE)); + + si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); + si_pm4_set_reg(pm4, R_028A50_GE_SE_ENHANCE, 0); + si_pm4_set_reg(pm4, R_028A70_GE_IA_ENHANCE, 0); + si_pm4_set_reg(pm4, R_028A80_GE_WD_ENHANCE, 0); + si_pm4_set_reg(pm4, R_028A9C_VGT_REUSE_OFF, 0); + si_pm4_set_reg(pm4, R_028AA0_VGT_DRAW_PAYLOAD_CNTL, 0); + si_pm4_set_reg(pm4, R_028ABC_DB_HTILE_SURFACE, 0); + + si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); + si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, + S_028B50_ACCUM_ISOLINE(128) | + S_028B50_ACCUM_TRI(128) | + S_028B50_ACCUM_QUAD(128) | + S_028B50_DONUT_SPLIT_GFX9(24) | + S_028B50_TRAP_SPLIT(6)); + si_pm4_set_reg(pm4, R_028BC0_PA_SC_HISZ_RENDER_OVERRIDE, 0); + + si_pm4_set_reg(pm4, R_028C40_PA_SC_BINNER_OUTPUT_TIMEOUT_COUNTER, 0x800); + si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, + S_028C48_MAX_ALLOC_COUNT(254) | + S_028C48_MAX_PRIM_PER_BATCH(511)); + si_pm4_set_reg(pm4, R_028C4C_PA_SC_BINNER_CNTL_2, S_028C4C_ENABLE_PING_PONG_BIN_ORDER(1)); + si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(64)); + si_pm4_set_reg(pm4, R_028C54_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, + S_028C54_NULL_SQUAD_AA_MASK_ENABLE(1)); + si_pm4_set_reg(pm4, R_028C58_PA_SC_SHADER_CONTROL, 0); + + for (unsigned i = 0; i < 8; i++) { + si_pm4_set_reg(pm4, R_028F00_CB_MEM0_INFO + i * 4, + S_028F00_TEMPORAL_READ(color_read_temporal_hint) | + S_028F00_TEMPORAL_WRITE(color_write_temporal_hint)); + } + + /* Uconfig registers. */ + si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); + /* This is changed by draws for indexed draws, but we need to set DISABLE_FOR_AUTO_INDEX + * here, which disables primitive restart for all non-indexed draws, so that those draws + * won't have to set this state. 
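+ * (Primitive restart compares each index against a reset value, which is
+ * meaningless for auto-generated indices, hence the blanket disable.)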
+ */ + si_pm4_set_reg(pm4, R_03092C_GE_MULTI_PRIM_IB_RESET_EN, S_03092C_DISABLE_FOR_AUTO_INDEX(1)); + si_pm4_set_reg(pm4, R_030950_GE_GS_THROTTLE, + S_030950_T0(0x1) | + S_030950_T1(0x4) | + S_030950_T2(0x3) | + S_030950_STALL_CYCLES(0x40) | + S_030950_FACTOR1(0x2) | + S_030950_FACTOR2(0x3) | + S_030950_ENABLE_THROTTLE(0) | + S_030950_NUM_INIT_GRPS(0xff)); + si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); + si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); + si_pm4_set_reg(pm4, R_030980_GE_USER_VGPR_EN, 0); + si_pm4_set_reg(pm4, R_0309B4_VGT_PRIMITIVEID_RESET, 0); + si_pm4_set_reg(pm4, R_03098C_GE_VRS_RATE, 0); + si_pm4_set_reg(pm4, R_030A00_PA_SU_LINE_STIPPLE_VALUE, 0); + si_pm4_set_reg(pm4, R_030A04_PA_SC_LINE_STIPPLE_STATE, 0); + + si_pm4_set_reg(pm4, R_031128_SPI_GRP_LAUNCH_GUARANTEE_ENABLE, 0x8A4D); + si_pm4_set_reg(pm4, R_03112C_SPI_GRP_LAUNCH_GUARANTEE_CTRL, 0x1123); + + uint64_t rb_mask = BITFIELD64_MASK(sctx->screen->info.max_render_backends); + + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 2, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1)); + si_pm4_cmd_add(pm4, PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) | + PIXEL_PIPE_STATE_CNTL_STRIDE(2) | + PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask)); + si_pm4_cmd_add(pm4, PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask)); + +done: + sctx->cs_preamble_state = pm4; + sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. */ +} + void si_init_gfx_preamble_state(struct si_context *sctx) { if (!sctx->screen->info.has_graphics) cdna_init_compute_preamble_state(sctx); + else if (sctx->gfx_level >= GFX12) + gfx12_init_gfx_preamble_state(sctx); else if (sctx->gfx_level >= GFX10) gfx10_init_gfx_preamble_state(sctx); else diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 1984336d8ec..e60defabd50 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -128,6 +128,8 @@ struct si_state_dsa { unsigned db_depth_bounds_min; unsigned db_depth_bounds_max; unsigned spi_shader_user_data_ps_alpha_ref; + unsigned db_stencil_read_mask; + unsigned db_stencil_write_mask; /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */ struct si_dsa_order_invariance order_invariance[2]; @@ -262,8 +264,8 @@ struct si_shader_data { enum si_tracked_reg { /* CONTEXT registers. 
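 * Each entry below is an index into the shadowed register values that
 * radeon_opt_set_context_reg compares against to skip redundant writes.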
*/ - /* 2 consecutive registers */ - SI_TRACKED_DB_RENDER_CONTROL, + /* 2 consecutive registers (GFX6-11), or separate registers (GFX12) */ + SI_TRACKED_DB_RENDER_CONTROL, /* GFX6-11 (not tracked on GFX12) */ SI_TRACKED_DB_COUNT_CONTROL, SI_TRACKED_DB_DEPTH_CONTROL, @@ -292,8 +294,9 @@ enum si_tracked_reg SI_TRACKED_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_AA_CONFIG, - /* 5 consecutive registers */ + /* 5 consecutive registers (GFX6-11) */ SI_TRACKED_PA_SU_VTX_CNTL, + /* 4 consecutive registers (GFX12) */ SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ, SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ, @@ -302,17 +305,17 @@ enum si_tracked_reg /* Non-consecutive register */ SI_TRACKED_SPI_SHADER_POS_FORMAT, - /* 2 consecutive registers */ + /* 5 consecutive registers (GFX12), or 2 consecutive registers (GFX6-11) */ SI_TRACKED_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_COL_FORMAT, - + /* Continuing consecutive registers (GFX12), or separate register (GFX6-11) */ SI_TRACKED_SPI_BARYC_CNTL, - - /* 2 consecutive registers */ + /* Continuing consecutive registers (GFX12), or 2 consecutive registers (GFX6-11) */ SI_TRACKED_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ADDR, SI_TRACKED_DB_EQAA, + SI_TRACKED_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_SHADER_CONTROL, SI_TRACKED_CB_SHADER_MASK, SI_TRACKED_CB_TARGET_MASK, @@ -365,19 +368,24 @@ enum si_tracked_reg SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2, /* GFX6-10 (GFX11+ can reuse this slot) */ SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3, /* GFX6-10 (GFX11+ can reuse this slot) */ - SI_TRACKED_DB_RENDER_OVERRIDE2, /* GFX6-xx (TBD) */ - SI_TRACKED_SPI_VS_OUT_CONFIG, /* GFX6-xx (TBD) */ - SI_TRACKED_VGT_PRIMITIVEID_EN, /* GFX6-xx (TBD) */ - SI_TRACKED_CB_DCC_CONTROL, /* GFX8-xx (TBD) */ + SI_TRACKED_SPI_VS_OUT_CONFIG, /* GFX6-11 */ + SI_TRACKED_VGT_PRIMITIVEID_EN, /* GFX6-11 */ + SI_TRACKED_CB_DCC_CONTROL, /* GFX8-11 */ + SI_TRACKED_DB_STENCIL_READ_MASK, /* GFX12+ */ + SI_TRACKED_DB_STENCIL_WRITE_MASK, /* GFX12+ */ + SI_TRACKED_PA_SC_HISZ_CONTROL, /* GFX12+ */ + SI_TRACKED_PA_SC_LINE_STIPPLE_RESET, /* GFX12+ */ SI_NUM_TRACKED_CONTEXT_REGS, SI_FIRST_TRACKED_OTHER_REG = SI_NUM_TRACKED_CONTEXT_REGS, /* SH and UCONFIG registers. */ - SI_TRACKED_GE_PC_ALLOC = SI_FIRST_TRACKED_OTHER_REG, /* GFX10+ */ - SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, /* GFX7+ */ + SI_TRACKED_GE_PC_ALLOC = SI_FIRST_TRACKED_OTHER_REG, /* GFX10-11 */ + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, /* GFX7-11 */ SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, /* GFX10+ */ SI_TRACKED_VGT_GS_OUT_PRIM_TYPE_UCONFIG, /* GFX11+ */ + SI_TRACKED_SPI_SHADER_GS_OUT_CONFIG_PS, /* GFX12+ */ + SI_TRACKED_VGT_PRIMITIVEID_EN_UCONFIG, /* GFX12+ */ SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG, /* GFX9 only */ SI_TRACKED_GE_CNTL = SI_TRACKED_IA_MULTI_VGT_PARAM_UCONFIG, /* GFX10+ */ @@ -404,6 +412,7 @@ enum si_tracked_reg SI_TRACKED_SPI_SHADER_USER_DATA_PS__ALPHA_REF, SI_TRACKED_COMPUTE_RESOURCE_LIMITS, + SI_TRACKED_COMPUTE_DISPATCH_INTERLEAVE, /* GFX12+ (not tracked on previous chips) */ SI_TRACKED_COMPUTE_NUM_THREAD_X, SI_TRACKED_COMPUTE_NUM_THREAD_Y, SI_TRACKED_COMPUTE_NUM_THREAD_Z, @@ -463,7 +472,8 @@ enum SI_NUM_INTERNAL_BINDINGS, /* Aliases to reuse slots that are unused on other generations. 
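 * (The ESGS/GSVS rings only exist for the legacy GS path, so their binding
 * slots are free once NGG, or Gfx12 streamout, takes over.)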
*/ - SI_GS_QUERY_BUF = SI_RING_ESGS, /* gfx10+ */ + SI_GS_QUERY_BUF = SI_RING_ESGS, /* gfx10+ */ + SI_STREAMOUT_STATE_BUF = SI_RING_GSVS, /* gfx12+ */ }; /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines @@ -691,6 +701,7 @@ void si_init_draw_functions_GFX10(struct si_context *sctx); void si_init_draw_functions_GFX10_3(struct si_context *sctx); void si_init_draw_functions_GFX11(struct si_context *sctx); void si_init_draw_functions_GFX11_5(struct si_context *sctx); +void si_init_draw_functions_GFX12(struct si_context *sctx); /* si_state_msaa.c */ extern unsigned si_msaa_max_distance[5]; diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c index 2f00f925a69..38cee585cb3 100644 --- a/src/gallium/drivers/radeonsi/si_state_binning.c +++ b/src/gallium/drivers/radeonsi/si_state_binning.c @@ -302,7 +302,7 @@ static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enab ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes)); const unsigned minBinSizeX = 128; - const unsigned minBinSizeY = 64; + const unsigned minBinSizeY = sctx->gfx_level >= GFX12 ? 128 : 64; const unsigned num_fragments = sctx->framebuffer.nr_color_samples; const unsigned num_samples = sctx->framebuffer.nr_samples; @@ -390,7 +390,19 @@ static void si_emit_dpbb_disable(struct si_context *sctx) radeon_begin(&sctx->gfx_cs); - if (sctx->gfx_level >= GFX10) { + if (sctx->gfx_level >= GFX12) { + struct uvec2 bin_size = {128, 128}; + + radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0, + SI_TRACKED_PA_SC_BINNER_CNTL_0, + S_028C44_BINNING_MODE(V_028C44_BINNING_DISABLED) | + S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(bin_size.x) - 5) | + S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(bin_size.y) - 5) | + S_028C44_DISABLE_START_OF_PRIM(1) | + S_028C44_FPOVS_PER_BATCH(63) | + S_028C44_OPTIMAL_BIN_SELECTION(1) | + S_028C44_FLUSH_ON_BINNING_TRANSITION(1)); + } else if (sctx->gfx_level >= GFX10) { struct uvec2 bin_size = {}; struct uvec2 bin_size_extend = {}; unsigned binning_disabled = @@ -435,6 +447,7 @@ void si_emit_dpbb_state(struct si_context *sctx, unsigned index) struct si_state_dsa *dsa = sctx->queued.named.dsa; unsigned db_shader_control = sctx->ps_db_shader_control; unsigned optimal_bin_selection = !sctx->queued.named.rasterizer->bottom_edge_rule; + unsigned pa_sc_hisz_control = sctx->ps_pa_sc_hisz_control; assert(sctx->gfx_level >= GFX9); @@ -449,8 +462,10 @@ void si_emit_dpbb_state(struct si_context *sctx, unsigned index) G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || blend->alpha_to_coverage; bool db_can_reject_z_trivially = !G_02880C_Z_EXPORT_ENABLE(db_shader_control) || - G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) || - G_02880C_DEPTH_BEFORE_SHADER(db_shader_control); + G_02880C_DEPTH_BEFORE_SHADER(db_shader_control) || + (sctx->gfx_level >= GFX12 ? + G_028BBC_CONSERVATIVE_Z_EXPORT(pa_sc_hisz_control) : + G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control)); /* Disable DPBB when it's believed to be inefficient. 
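 * (Heuristic: with many RBs there is little raster work left to save, and a
 * pixel shader that can kill plus trivial Z rejection means binning mostly
 * adds latency.)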
*/ if (sscreen->info.max_render_backends > 4 && ps_can_kill && db_can_reject_z_trivially && diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 5ac40469a6c..33e350c8723 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -33,6 +33,8 @@ #define GFX(name) name##GFX11 #elif (GFX_VER == 115) #define GFX(name) name##GFX11_5 +#elif (GFX_VER == 12) +#define GFX(name) name##GFX12 #else #error "Unknown gfx level" #endif @@ -159,11 +161,16 @@ static bool si_update_shaders(struct si_context *sctx) uint32_t vgt_stages = 0; if (HAS_TESS) { - vgt_stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | - S_028B54_HS_EN(1) | - S_028B54_DYNAMIC_HS(1) | - S_028B54_HS_W32_EN(GFX_VERSION >= GFX10 && - sctx->queued.named.hs->wave_size == 32); + if (GFX_VERSION >= GFX12) { + vgt_stages |= S_028A98_HS_EN(1) | + S_028A98_HS_W32_EN(sctx->queued.named.hs->wave_size == 32); + } else { + vgt_stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | + S_028B54_HS_EN(1) | + S_028B54_DYNAMIC_HS(1) | + S_028B54_HS_W32_EN(GFX_VERSION >= GFX10 && + sctx->queued.named.hs->wave_size == 32); + } } if (NGG) { @@ -206,6 +213,9 @@ static bool si_update_shaders(struct si_context *sctx) } else { ge_cntl = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ge_cntl; } + + if (GFX_VERSION >= GFX12) + ge_cntl |= S_03096C_DIS_PG_SIZE_ADJUST_FOR_STRIP(1); } else { unsigned primgroup_size; unsigned vertgroup_size; @@ -265,6 +275,13 @@ static bool si_update_shaders(struct si_context *sctx) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); } + unsigned pa_sc_hisz_control = sctx->shader.ps.current->ps.pa_sc_hisz_control; + if (GFX_VERSION >= GFX12 && sctx->screen->dpbb_allowed && + sctx->ps_pa_sc_hisz_control != pa_sc_hisz_control) { + sctx->ps_pa_sc_hisz_control = pa_sc_hisz_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } + if (si_pm4_state_changed(sctx, ps) || (!NGG && si_pm4_state_changed(sctx, vs)) || (NGG && si_pm4_state_changed(sctx, gs))) { @@ -560,6 +577,9 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, case GFX11_5: si_cp_dma_prefetch_inline(sctx, address, size); break; + case GFX12: + si_cp_dma_prefetch_inline(sctx, address, size); + break; default: break; } @@ -910,12 +930,18 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) bool reset_per_prim = rast_prim == MESA_PRIM_LINES || rast_prim == MESA_PRIM_LINES_ADJACENCY; /* 0 = no reset, 1 = reset per prim, 2 = reset per packet */ - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + if (GFX_VERSION >= GFX12) { + radeon_opt_set_context_reg(sctx, R_028A44_PA_SC_LINE_STIPPLE_RESET, + SI_TRACKED_PA_SC_LINE_STIPPLE_RESET, + S_028A44_AUTO_RESET_CNTL(reset_per_prim ? 1 : 2)); + } else { + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, - SI_TRACKED_PA_SC_LINE_STIPPLE, - rs->pa_sc_line_stipple | - S_028A0C_AUTO_RESET_CNTL(reset_per_prim ? 1 : 2)); + radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, + SI_TRACKED_PA_SC_LINE_STIPPLE, + rs->pa_sc_line_stipple | + S_028A0C_AUTO_RESET_CNTL(reset_per_prim ? 1 : 2)); + } } if (NGG || HAS_GS) { @@ -1077,6 +1103,9 @@ static void si_emit_draw_registers(struct si_context *sctx, if (prim != sctx->last_prim) { unsigned vgt_prim = HAS_TESS ? 
V_008958_DI_PT_PATCH : si_conv_pipe_prim(prim); + if (GFX_VERSION >= GFX12 && HAS_TESS) + vgt_prim |= S_030908_NUM_INPUT_CP(sctx->patch_vertices); + if (GFX_VERSION >= GFX10) radeon_set_uconfig_reg(R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); else if (GFX_VERSION >= GFX7) @@ -1170,12 +1199,28 @@ gfx11_emit_buffered_sh_regs_inline(struct si_context *sctx, unsigned *num_regs, radeon_end(); } +#define gfx12_emit_buffered_sh_regs_inline(num_regs, regs) do { \ + unsigned __reg_count = *(num_regs); \ + if (__reg_count) { \ + radeon_emit(PKT3(PKT3_SET_SH_REG_PAIRS, __reg_count * 2 - 1, 0) | PKT3_RESET_FILTER_CAM_S(1)); \ + radeon_emit_array(regs, __reg_count * 2); \ + *(num_regs) = 0; \ + } \ +} while (0) + #if GFX_VER == 6 /* declare this function only once because there is only one variant. */ void si_emit_buffered_compute_sh_regs(struct si_context *sctx) { - gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_compute_sh_regs, - sctx->gfx11.buffered_compute_sh_regs); + if (sctx->gfx_level >= GFX12) { + radeon_begin(&sctx->gfx_cs); + gfx12_emit_buffered_sh_regs_inline(&sctx->num_buffered_compute_sh_regs, + sctx->gfx12.buffered_compute_sh_regs); + radeon_end(); + } else { + gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_compute_sh_regs, + sctx->gfx11.buffered_compute_sh_regs); + } } #endif @@ -1218,10 +1263,11 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw if (GFX_VERSION >= GFX9) { /* Use PKT3_LOAD_CONTEXT_REG_INDEX instead of si_cp_copy_data to support state shadowing. */ - uint64_t va = t->buf_filled_size->gpu_address + t->buf_filled_size_offset; + uint64_t va = t->buf_filled_size->gpu_address + t->buf_filled_size_draw_count_offset; radeon_begin(cs); + // TODO: GFX12: This may be discarded by PFP if the shadow base address is provided by the MQD. radeon_emit(PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0)); radeon_emit(va); radeon_emit(va >> 32); @@ -1232,7 +1278,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } else { si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM, - t->buf_filled_size, t->buf_filled_size_offset); + t->buf_filled_size, t->buf_filled_size_draw_count_offset); } use_opaque = S_0287F0_USE_OPAQUE(1); indirect = NULL; @@ -1340,7 +1386,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw assert(indirect_va % 8 == 0); - if (HAS_SH_PAIRS_PACKED) { + if (GFX_VERSION >= GFX12) { + gfx12_emit_buffered_sh_regs_inline(&sctx->num_buffered_gfx_sh_regs, + sctx->gfx12.buffered_gfx_sh_regs); + } else if (HAS_SH_PAIRS_PACKED) { radeon_end(); gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_gfx_sh_regs, sctx->gfx11.buffered_gfx_sh_regs); @@ -1424,7 +1473,18 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw if (!is_blit) { /* Prefer SET_SH_REG_PAIRS_PACKED* on Gfx11+. 
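 * The push helpers only append {offset, value} pairs to
 * sctx->gfx1X.buffered_gfx_sh_regs; gfx1X_emit_buffered_sh_regs_inline then
 * flushes them as a single pairs packet right before the draw.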
*/ - if (HAS_SH_PAIRS_PACKED) { + if (GFX_VERSION >= GFX12) { + gfx12_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, + tracked_base_vertex_reg, base_vertex); + if (set_draw_id) { + gfx12_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, + tracked_base_vertex_reg + 1, drawid_base); + } + if (set_base_instance) { + gfx12_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_START_INSTANCE * 4, + tracked_base_vertex_reg + 2, info->start_instance); + } + } else if (HAS_SH_PAIRS_PACKED) { gfx11_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, tracked_base_vertex_reg, base_vertex); if (set_draw_id) { @@ -1450,14 +1510,17 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } } - if (HAS_SH_PAIRS_PACKED) { + if (GFX_VERSION >= GFX12) { + gfx12_emit_buffered_sh_regs_inline(&sctx->num_buffered_gfx_sh_regs, + sctx->gfx12.buffered_gfx_sh_regs); + } else if (HAS_SH_PAIRS_PACKED) { radeon_end(); gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_gfx_sh_regs, sctx->gfx11.buffered_gfx_sh_regs); radeon_begin_again(cs); } - /* Blit SGPRs must be set after gfx11_emit_buffered_sh_regs_inline because they can + /* Blit SGPRs must be set after gfx1X_emit_buffered_sh_regs_inline because they can * overwrite them. */ if (is_blit) { @@ -1575,12 +1638,26 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(va >> 32); radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_DMA | - S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && i < num_draws - 1)); + S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && GFX_VERSION < GFX12 && + i < num_draws - 1)); } } } } else { - if (increment_draw_id) { + if (GFX_VERSION == GFX12 && !IS_DRAW_VERTEX_STATE && + indirect && indirect->count_from_stream_output) { + /* DrawTransformFeedback requires 3 SQ_NON_EVENTs after the packet. */ + assert(num_draws == 1); + + radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); + radeon_emit(0); + radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); + + for (unsigned i = 0; i < 3; i++) { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); + } + } else if (increment_draw_id) { for (unsigned i = 0; i < num_draws; i++) { if (i > 0) { unsigned draw_id = drawid_base + i; @@ -1680,6 +1757,9 @@ void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex case GFX11_5: si_set_vb_descriptor(velems, vb, element_index, out); break; + case GFX12: + si_set_vb_descriptor(velems, vb, element_index, out); + break; default: unreachable("unhandled gfx level"); } @@ -1980,7 +2060,7 @@ static void si_draw(struct pipe_context *ctx, if (GFX_VERSION < GFX11) gfx6_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)); - else + else if (GFX_VERSION < GFX12) gfx11_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)); si_need_gfx_cs_space(sctx, num_draws); @@ -2067,8 +2147,9 @@ static void si_draw(struct pipe_context *ctx, /* info->start will be added by the drawing code */ index_offset -= start_offset; - } else if (GFX_VERSION <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) { - /* GFX8 reads index buffers through TC L2, so it doesn't + } else if ((GFX_VERSION <= GFX7 || GFX_VERSION == GFX12) && + si_resource(indexbuf)->TC_L2_dirty) { + /* GFX8-GFX11 reads index buffers through TC L2, so it doesn't * need this. 
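 * (On Gfx12 the GE fetches indices at system memory scope again, see
 * cp_sdma_ge_use_system_memory_scope, so dirty GL2 data has to be written
 * back first.)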
 */
         sctx->flags |= SI_CONTEXT_WB_L2;
         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
      }
@@ -2080,8 +2161,8 @@ static void si_draw(struct pipe_context *ctx,
    unsigned total_direct_count = 0;

    if (!IS_DRAW_VERTEX_STATE && indirect) {
-      /* Indirect buffers use TC L2 on GFX9, but not older hw. */
-      if (GFX_VERSION <= GFX8) {
+      /* Indirect buffers use TC L2 on GFX9-GFX11, but not other hw. */
+      if (GFX_VERSION <= GFX8 || GFX_VERSION == GFX12) {
          if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
             sctx->flags |= SI_CONTEXT_WB_L2;
             si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
@@ -2287,12 +2368,13 @@ static void si_draw(struct pipe_context *ctx,
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }

-   if (unlikely(sctx->decompression_enabled)) {
+   if (unlikely(GFX_VERSION < GFX12 && sctx->decompression_enabled)) {
       sctx->num_decompress_calls++;
    } else {
       sctx->num_draw_calls += num_draws;
    }

+   /* On Gfx12, this is only used to detect whether a depth texture is in the cleared state. */
    if (sctx->framebuffer.state.zsbuf) {
       struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
       zstex->depth_cleared_level_mask &= ~BITFIELD_BIT(sctx->framebuffer.state.zsbuf->u.tex.level);
@@ -2350,7 +2432,7 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
    struct pipe_context *pipe = util_blitter_get_pipe(blitter);
    struct si_context *sctx = (struct si_context *)pipe;
    uint32_t attribute_ring_address_lo =
-      sctx->gfx_level >= GFX11 ? sctx->screen->attribute_ring->gpu_address : 0;
+      sctx->gfx_level >= GFX11 ? sctx->screen->attribute_pos_prim_ring->gpu_address : 0;

    /* Pack position coordinates as signed int16. */
    sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16);
@@ -2387,7 +2469,8 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
    pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
 }

-template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
+template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+          util_popcnt POPCNT>
 static void si_init_draw_vbo(struct si_context *sctx)
 {
    if (NGG && GFX_VERSION < GFX10)
@@ -2396,42 +2479,43 @@ static void si_init_draw_vbo(struct si_context *sctx)
    if (!NGG && GFX_VERSION >= GFX11)
       return;

-   if (GFX_VERSION >= GFX11 && sctx->screen->info.has_set_sh_pairs_packed) {
+   if (GFX_VERSION >= GFX11 && GFX_VERSION < GFX12 && sctx->screen->info.has_set_sh_pairs_packed) {
       sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
          si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_ON>;
-      if (util_get_cpu_caps()->has_popcnt) {
-         sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
-            si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_YES, HAS_SH_PAIRS_PACKED_ON>;
-      } else {
-         sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
-            si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_NO, HAS_SH_PAIRS_PACKED_ON>;
-      }
+      sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
+         si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT, HAS_SH_PAIRS_PACKED_ON>;
    } else {
       sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
          si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, HAS_SH_PAIRS_PACKED_OFF>;
-      if (util_get_cpu_caps()->has_popcnt) {
-         sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
-            si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_YES, HAS_SH_PAIRS_PACKED_OFF>;
-      } else {
-         sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
-            si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_NO, HAS_SH_PAIRS_PACKED_OFF>;
-      }
+      sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
+         si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT, HAS_SH_PAIRS_PACKED_OFF>;
    }
 }

 template <amd_gfx_level GFX_VERSION>
 static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
 {
-   si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON>(sctx);
-   si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON>(sctx);
+   if (util_get_cpu_caps()->has_popcnt) {
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON, POPCNT_YES>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON, POPCNT_YES>(sctx);
+   } else {
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON, POPCNT_NO>(sctx);
+      si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON, POPCNT_NO>(sctx);
+   }
 }

 static void si_invalid_draw_vbo(struct pipe_context *pipe,
diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c
index 4d2cc64998e..e5706002c30 100644
--- a/src/gallium/drivers/radeonsi/si_state_msaa.c
+++ b/src/gallium/drivers/radeonsi/si_state_msaa.c
@@ -143,9 +143,22 @@ static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_cou
 }

 static void si_emit_max_4_sample_locs(struct si_context *sctx, uint64_t centroid_priority,
-                                      uint32_t sample_locs)
+                                      uint32_t sample_locs, uint32_t max_sample_dist)
 {
-   if (sctx->screen->info.has_set_context_pairs_packed) {
+   if (sctx->gfx_level >= GFX12) {
+      radeon_begin(&sctx->gfx_cs);
+      gfx12_begin_context_regs();
+      gfx12_set_context_reg(R_028BF0_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
+      gfx12_set_context_reg(R_028BF4_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32);
+      gfx12_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
+      gfx12_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
+      gfx12_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
+      gfx12_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
+      gfx12_set_context_reg(R_028C5C_PA_SC_SAMPLE_PROPERTIES,
+                            S_028C5C_MAX_SAMPLE_DIST(max_sample_dist));
+      gfx12_end_context_regs();
+      radeon_end();
+   } else if (sctx->screen->info.has_set_context_pairs_packed) {
       radeon_begin(&sctx->gfx_cs);
       gfx11_begin_packed_context_regs();
       gfx11_set_context_reg(R_028BD4_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
@@ -170,12 +183,24 @@ static void si_emit_max_4_sample_locs(struct si_context *sctx, uint64_t centroid
 }

 static void si_emit_max_16_sample_locs(struct si_context *sctx, uint64_t centroid_priority,
-                                       const uint32_t *sample_locs, unsigned num_samples)
+                                       const uint32_t *sample_locs, unsigned num_samples,
+                                       uint32_t max_sample_dist)
 {
    radeon_begin(&sctx->gfx_cs);
-   radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
-   radeon_emit(centroid_priority);
-   radeon_emit(centroid_priority >> 32);
+
+   if (sctx->gfx_level >= GFX12) {
+      gfx12_begin_context_regs();
+      gfx12_set_context_reg(R_028BF0_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
+      gfx12_set_context_reg(R_028BF4_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32);
+      gfx12_set_context_reg(R_028C5C_PA_SC_SAMPLE_PROPERTIES,
+                            S_028C5C_MAX_SAMPLE_DIST(max_sample_dist));
+      gfx12_end_context_regs();
+   } else {
+      radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+      radeon_emit(centroid_priority);
+      radeon_emit(centroid_priority >> 32);
+   }
+
    radeon_set_context_reg_seq(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, num_samples == 8 ? 14 : 16);
    radeon_emit_array(sample_locs, 4);
@@ -204,24 +229,27 @@ static void si_emit_sample_locations(struct si_context *sctx, unsigned index)
    * to non-MSAA.
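    * sample_locs_num_samples effectively caches the last emitted configuration: it is
    * updated at the end of the switch below, so the registers are rewritten only when
    * the sample count changes. max_sample_dist is threaded through both helpers
    * because Gfx12 programs the maximum sample distance through
    * PA_SC_SAMPLE_PROPERTIES instead of PA_SC_AA_CONFIG.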
*/ if (nr_samples != sctx->sample_locs_num_samples) { + unsigned max_sample_dist = si_msaa_max_distance[util_logbase2(nr_samples)]; + switch (nr_samples) { default: case 1: - si_emit_max_4_sample_locs(sctx, centroid_priority_1x, sample_locs_1x); + si_emit_max_4_sample_locs(sctx, centroid_priority_1x, sample_locs_1x, max_sample_dist); break; case 2: - si_emit_max_4_sample_locs(sctx, centroid_priority_2x, sample_locs_2x); + si_emit_max_4_sample_locs(sctx, centroid_priority_2x, sample_locs_2x, max_sample_dist); break; case 4: - si_emit_max_4_sample_locs(sctx, centroid_priority_4x, sample_locs_4x); + si_emit_max_4_sample_locs(sctx, centroid_priority_4x, sample_locs_4x, max_sample_dist); break; case 8: - si_emit_max_16_sample_locs(sctx, centroid_priority_8x, sample_locs_8x, 8); + si_emit_max_16_sample_locs(sctx, centroid_priority_8x, sample_locs_8x, 8, max_sample_dist); break; case 16: - si_emit_max_16_sample_locs(sctx, centroid_priority_16x, sample_locs_16x, 16); + si_emit_max_16_sample_locs(sctx, centroid_priority_16x, sample_locs_16x, 16, max_sample_dist); break; } + sctx->sample_locs_num_samples = nr_samples; } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 0b061098e5a..3c144c994f4 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -493,7 +493,9 @@ unsigned si_shader_encode_sgprs(struct si_shader *shader) bool si_shader_mem_ordered(struct si_shader *shader) { - if (shader->selector->screen->info.gfx_level < GFX10) + struct si_screen *sscreen = shader->selector->screen; + + if (sscreen->info.gfx_level < GFX10 || sscreen->info.gfx_level >= GFX12) return false; /* Return true if both types of VMEM that return something are used. */ @@ -563,6 +565,9 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | S_028B6C_TOPOLOGY(topology) | S_028B6C_DISTRIBUTION_MODE(distribution_mode); + + if (sscreen->info.gfx_level >= GFX12) + shader->vgt_tf_param |= S_028AA4_TEMPORAL(gfx12_load_last_use_discard); } /* Polaris needs different VTX_REUSE_DEPTH settings depending on @@ -635,12 +640,15 @@ static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_sha * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0, VSPrimID, InstanceID) * GFX10-11 LS (VertexID, RelAutoIndex, UserVGPR1, UserVGPR2 or InstanceID) * GFX10-11 ES,VS (VertexID, UserVGPR1, UserVGPR2 or VSPrimID, UserVGPR3 or InstanceID) + * GFX12 LS,ES (VertexID, InstanceID) */ bool is_ls = shader->selector->stage == MESA_SHADER_TESS_CTRL || shader->key.ge.as_ls; unsigned max = 0; if (shader->info.uses_instanceid) { - if (sscreen->info.gfx_level >= GFX10) + if (sscreen->info.gfx_level >= GFX12) + max = MAX2(max, 1); + else if (sscreen->info.gfx_level >= GFX10) max = MAX2(max, 3); else if (is_ls) max = MAX2(max, 2); /* use (InstanceID / StepRate0) because StepRate0 == 1 */ @@ -653,6 +661,7 @@ static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_sha /* GFX11: We prefer to compute RelAutoIndex using (WaveID * WaveSize + ThreadID). * Older chips didn't have WaveID in LS. + * GFX12 doesn't have RelAutoIndex. 
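+    * With no RelAutoIndex and no extra user VGPRs, a Gfx12 LS/ES invocation only gets
+    * VertexID in VGPR0 and InstanceID in VGPR1, which is why uses_instanceid clamps
+    * the comp count to 1 above instead of 3 as on GFX10-GFX11.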
 */
    if (is_ls && sscreen->info.gfx_level <= GFX10_3)
       max = MAX2(max, 1); /* RelAutoIndex */
@@ -695,7 +704,14 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
                            si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR) :
                            GFX6_TCS_NUM_USER_SGPR;

-   if (sscreen->info.gfx_level >= GFX11) {
+   if (sscreen->info.gfx_level >= GFX12) {
+      si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_RSRC4_HS,
+                     S_00B420_WAVE_LIMIT(0x3ff) |
+                     S_00B420_GLG_FORCE_DISABLE(1) |
+                     S_00B420_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
+
+      si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_LO_LS, va >> 8);
+   } else if (sscreen->info.gfx_level >= GFX11) {
       si_pm4_set_reg_idx3(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
                           ac_apply_cu_en(S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
                                          S_00B404_CU_EN(0xffff),
@@ -715,7 +731,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
    si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
                   S_00B428_VGPRS(si_shader_encode_vgprs(shader)) |
                   S_00B428_SGPRS(si_shader_encode_sgprs(shader)) |
-                  S_00B428_DX10_CLAMP(1) |
+                  S_00B428_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
                   S_00B428_MEM_ORDERED(si_shader_mem_ordered(shader)) |
                   S_00B428_FLOAT_MODE(shader->config.float_mode) |
                   S_00B428_LS_VGPR_COMP_CNT(sscreen->info.gfx_level >= GFX9 ?
@@ -1287,6 +1303,46 @@ static void gfx11_dgpu_emit_shader_ngg(struct si_context *sctx, unsigned index)
    radeon_end();
 }

+template <si_has_tess HAS_TESS>
+static void gfx12_emit_shader_ngg(struct si_context *sctx, unsigned index)
+{
+   struct si_shader *shader = sctx->queued.named.gs;
+
+   if (shader->selector->stage == MESA_SHADER_GEOMETRY)
+      gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);
+
+   radeon_begin(&sctx->gfx_cs);
+   gfx12_begin_context_regs();
+   if (HAS_TESS) {
+      gfx12_opt_set_context_reg(R_028AA4_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                                shader->vgt_tf_param);
+   }
+   gfx12_opt_set_context_reg(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+                             SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
+                             shader->ngg.ge_max_output_per_subgroup);
+   gfx12_opt_set_context_reg(R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL,
+                             shader->ngg.ge_ngg_subgrp_cntl);
+   gfx12_opt_set_context_reg(R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+                             shader->ngg.vgt_gs_max_vert_out);
+   gfx12_opt_set_context_reg(R_028B3C_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
+                             shader->ngg.vgt_gs_instance_cnt);
+   gfx12_opt_set_context_reg(R_02864C_SPI_SHADER_POS_FORMAT, SI_TRACKED_SPI_SHADER_POS_FORMAT,
+                             shader->ngg.spi_shader_pos_format);
+   gfx12_opt_set_context_reg(R_028814_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
+                             shader->ngg.pa_cl_vte_cntl);
+   gfx12_end_context_regs();
+
+   radeon_opt_set_uconfig_reg(sctx, R_030988_VGT_PRIMITIVEID_EN,
+                              SI_TRACKED_VGT_PRIMITIVEID_EN_UCONFIG,
+                              shader->ngg.vgt_primitiveid_en);
+   radeon_end(); /* don't track context rolls on GFX12 */
+
+   assert(!sctx->screen->info.uses_kernel_cu_mask);
+   gfx12_opt_push_gfx_sh_reg(R_00B220_SPI_SHADER_PGM_RSRC4_GS,
+                             SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+                             shader->ngg.spi_shader_pgm_rsrc4_gs);
+}
+
 unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_shader_key *key)
 {
    if (gs->stage == MESA_SHADER_GEOMETRY)
@@ -1359,7 +1415,12 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
    if (!pm4)
       return;

-   if (sscreen->info.has_set_context_pairs_packed) {
+   if (sscreen->info.gfx_level >= GFX12) {
+      if (es_stage == MESA_SHADER_TESS_EVAL)
+         pm4->atom.emit = gfx12_emit_shader_ngg<TESS_ON>;
+      else
+         pm4->atom.emit = gfx12_emit_shader_ngg<TESS_OFF>;
+   } else if (sscreen->info.has_set_context_pairs_packed) {
       if (es_stage == MESA_SHADER_TESS_EVAL)
          pm4->atom.emit = gfx11_dgpu_emit_shader_ngg<TESS_ON>;
       else
          pm4->atom.emit = gfx11_dgpu_emit_shader_ngg<TESS_OFF>;
@@ -1394,29 +1455,44 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
    /* Primitives with adjacency can only occur without tessellation. */
    assert(gs_info->gs_input_verts_per_prim <= 3 || es_stage == MESA_SHADER_VERTEX);

-   /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
-    * VGPR[0:4] are always loaded.
-    *
-    * Vertex shaders always need to load VGPR3, because they need to
-    * pass edge flags for decomposed primitives (such as quads) to the PA
-    * for the GL_LINE polygon mode to skip rendering lines on inner edges.
-    */
-   if (gs_info->uses_invocationid ||
-       (gfx10_edgeflags_have_effect(shader) && !gfx10_is_ngg_passthrough(shader)))
-      gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
-   else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) ||
-            (gs_stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
-      gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
-   else if (input_prim >= MESA_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
-      gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
-   else
-      gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+   if (sscreen->info.gfx_level >= GFX12) {
+      if (gs_info->gs_input_verts_per_prim >= 4)
+         gs_vgpr_comp_cnt = 2; /* VGPR2 contains offsets 3-5 */
+      else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) ||
+               (gs_stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
+         gs_vgpr_comp_cnt = 1; /* VGPR1 contains PrimitiveID */
+      else
+         gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0-2, edgeflags, GS invocation ID. */
+   } else {
+      /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
+       * VGPR[0:4] are always loaded.
+       *
+       * Vertex shaders always need to load VGPR3, because they need to
+       * pass edge flags for decomposed primitives (such as quads) to the PA
+       * for the GL_LINE polygon mode to skip rendering lines on inner edges.
+       */
+      if (gs_info->uses_invocationid ||
+          (gfx10_edgeflags_have_effect(shader) && !gfx10_is_ngg_passthrough(shader)))
+         gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
+      else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) ||
+               (gs_stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
+         gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID.
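+       * (Gfx12, handled in the branch above, packs this more tightly: vertex offsets
+       * 0-2 share VGPR0, so PrimitiveID moves down to VGPR1 and offsets 3-5 to
+       * VGPR2.)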
*/ + else if (input_prim >= MESA_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader)) + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + else + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + } + + if (sscreen->info.gfx_level >= GFX12) { + si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_LO_ES, va >> 8); + } else { + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + } - si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS(si_shader_encode_vgprs(shader)) | S_00B228_FLOAT_MODE(shader->config.float_mode) | - S_00B228_DX10_CLAMP(1) | + S_00B228_DX10_CLAMP(sscreen->info.gfx_level < GFX12) | S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) | S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, @@ -1460,51 +1536,71 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.ge.mono.u.vs_export_prim_id || gs_sel->info.writes_primid); - unsigned late_alloc_wave64, cu_mask; + if (sscreen->info.gfx_level >= GFX12) { + unsigned num_params = shader->info.nr_param_exports; - ac_compute_late_alloc(&sscreen->info, true, shader->key.ge.opt.ngg_culling, - shader->config.scratch_bytes_per_wave > 0, - &late_alloc_wave64, &cu_mask); + /* Since there is no alloc/dealloc mechanism for the 12-bit ordered IDs, they can wrap + * around if there are more than 2^12 workgroups, causing 2 workgroups to get the same + * ordered ID, which would break the streamout algorithm. + * The recommended solution is to use the alloc/dealloc mechanism of the attribute ring, + * which is enough to limit the range of ordered IDs that can be in flight. + */ + if (si_shader_uses_streamout(shader)) + num_params = MAX2(num_params, 8); - /* Oversubscribe PC. This improves performance when there are too many varyings. */ - unsigned oversub_pc_lines, oversub_pc_factor = 1; - - if (shader->key.ge.opt.ngg_culling) { - /* Be more aggressive with NGG culling. */ - if (shader->info.nr_param_exports > 4) - oversub_pc_factor = 4; - else if (shader->info.nr_param_exports > 2) - oversub_pc_factor = 3; - else - oversub_pc_factor = 2; - } - oversub_pc_lines = late_alloc_wave64 ? 
(sscreen->info.pc_lines / 4) * oversub_pc_factor : 0; - shader->ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | - S_030980_NUM_PC_LINES(oversub_pc_lines - 1); - shader->ngg.vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(es_enable_prim_id); - shader->ngg.spi_shader_pgm_rsrc3_gs = - ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) | - S_00B21C_WAVE_LIMIT(0x3F), - C_00B21C_CU_EN, 0, &sscreen->info); - shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); - shader->ngg.spi_vs_out_config = - S_0286C4_VS_EXPORT_COUNT(MAX2(shader->info.nr_param_exports, 1) - 1) | - S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); - - if (sscreen->info.gfx_level >= GFX11) { - shader->ngg.spi_shader_pgm_rsrc4_gs |= - ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) | - S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)), - C_00B204_CU_EN_GFX11, 16, &sscreen->info); + shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B220_SPI_SHADER_LATE_ALLOC_GS(127) | + S_00B220_GLG_FORCE_DISABLE(1) | + S_00B220_WAVE_LIMIT(0x3ff) | + S_00B220_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)); + shader->ngg.spi_vs_out_config = S_00B0C4_VS_EXPORT_COUNT(MAX2(num_params, 1) - 1) | + S_00B0C4_NO_PC_EXPORT(num_params == 0); } else { - shader->ngg.spi_shader_pgm_rsrc4_gs |= - ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff), - C_00B204_CU_EN_GFX10, 16, &sscreen->info); + unsigned late_alloc_wave64, cu_mask; + + ac_compute_late_alloc(&sscreen->info, true, shader->key.ge.opt.ngg_culling, + shader->config.scratch_bytes_per_wave > 0, + &late_alloc_wave64, &cu_mask); + + /* Oversubscribe PC. This improves performance when there are too many varyings. */ + unsigned oversub_pc_lines, oversub_pc_factor = 1; + + if (shader->key.ge.opt.ngg_culling) { + /* Be more aggressive with NGG culling. */ + if (shader->info.nr_param_exports > 4) + oversub_pc_factor = 4; + else if (shader->info.nr_param_exports > 2) + oversub_pc_factor = 3; + else + oversub_pc_factor = 2; + } + oversub_pc_lines = late_alloc_wave64 ? (sscreen->info.pc_lines / 4) * oversub_pc_factor : 0; + shader->ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | + S_030980_NUM_PC_LINES(oversub_pc_lines - 1); + shader->ngg.vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(es_enable_prim_id); + shader->ngg.spi_shader_pgm_rsrc3_gs = + ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F), + C_00B21C_CU_EN, 0, &sscreen->info); + shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); + shader->ngg.spi_vs_out_config = + S_0286C4_VS_EXPORT_COUNT(MAX2(shader->info.nr_param_exports, 1) - 1) | + S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); + + if (sscreen->info.gfx_level >= GFX11) { + shader->ngg.spi_shader_pgm_rsrc4_gs |= + ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) | + S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)), + C_00B204_CU_EN_GFX11, 16, &sscreen->info); + } else { + shader->ngg.spi_shader_pgm_rsrc4_gs |= + ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff), + C_00B204_CU_EN_GFX10, 16, &sscreen->info); + } } if (sscreen->info.gfx_level >= GFX11) { /* This should be <= 252 for performance on Gfx11. 256 works too but is slower. */ - unsigned max_prim_grp_size = 252; + unsigned max_prim_grp_size = sscreen->info.gfx_level >= GFX12 ? 256 : 252; unsigned prim_amp_factor = gs_stage == MESA_SHADER_GEOMETRY ? 
gs_sel->info.base.gs.vertices_out : 1; @@ -1552,17 +1648,25 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); } - shader->ngg.vgt_shader_stages_en = - S_028B54_ES_EN(es_stage == MESA_SHADER_TESS_EVAL ? - V_028B54_ES_STAGE_DS : V_028B54_ES_STAGE_REAL) | - S_028B54_GS_EN(gs_stage == MESA_SHADER_GEOMETRY) | - S_028B54_PRIMGEN_EN(1) | - S_028B54_PRIMGEN_PASSTHRU_EN(gfx10_is_ngg_passthrough(shader)) | - S_028B54_PRIMGEN_PASSTHRU_NO_MSG(gfx10_is_ngg_passthrough(shader) && - sscreen->info.family >= CHIP_NAVI23) | - S_028B54_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader)) | - S_028B54_GS_W32_EN(shader->wave_size == 32) | - S_028B54_MAX_PRIMGRP_IN_WAVE(2); + if (sscreen->info.gfx_level >= GFX12) { + shader->ngg.vgt_shader_stages_en = + S_028A98_GS_EN(gs_stage == MESA_SHADER_GEOMETRY) | + S_028A98_PRIMGEN_PASSTHRU_NO_MSG(gfx10_is_ngg_passthrough(shader)) | + S_028A98_GS_W32_EN(shader->wave_size == 32) | + S_028A98_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader)); + } else { + shader->ngg.vgt_shader_stages_en = + S_028B54_ES_EN(es_stage == MESA_SHADER_TESS_EVAL ? + V_028B54_ES_STAGE_DS : V_028B54_ES_STAGE_REAL) | + S_028B54_GS_EN(gs_stage == MESA_SHADER_GEOMETRY) | + S_028B54_PRIMGEN_EN(1) | + S_028B54_PRIMGEN_PASSTHRU_EN(gfx10_is_ngg_passthrough(shader)) | + S_028B54_PRIMGEN_PASSTHRU_NO_MSG(gfx10_is_ngg_passthrough(shader) && + sscreen->info.family >= CHIP_NAVI23) | + S_028B54_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader)) | + S_028B54_GS_W32_EN(shader->wave_size == 32) | + S_028B54_MAX_PRIMGRP_IN_WAVE(2); + } si_pm4_finalize(pm4); } @@ -1836,6 +1940,32 @@ static void gfx11_dgpu_emit_shader_ps(struct si_context *sctx, unsigned index) radeon_end(); /* don't track context rolls on GFX11 */ } +static void gfx12_emit_shader_ps(struct si_context *sctx, unsigned index) +{ + struct si_shader *shader = sctx->queued.named.ps; + + radeon_begin(&sctx->gfx_cs); + gfx12_begin_context_regs(); + gfx12_opt_set_context_reg(R_028640_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL, + shader->ps.spi_ps_in_control); + gfx12_opt_set_context_reg(R_028650_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT, + shader->ps.spi_shader_z_format); + gfx12_opt_set_context_reg(R_028654_SPI_SHADER_COL_FORMAT, SI_TRACKED_SPI_SHADER_COL_FORMAT, + shader->ps.spi_shader_col_format); + gfx12_opt_set_context_reg(R_028658_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL, + shader->ps.spi_baryc_cntl); + gfx12_opt_set_context_reg(R_02865C_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA, + shader->ps.spi_ps_input_ena); + gfx12_opt_set_context_reg(R_028660_SPI_PS_INPUT_ADDR, SI_TRACKED_SPI_PS_INPUT_ADDR, + shader->ps.spi_ps_input_addr); + gfx12_opt_set_context_reg(R_028854_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK, + shader->ps.cb_shader_mask); + gfx12_opt_set_context_reg(R_028BBC_PA_SC_HISZ_CONTROL, SI_TRACKED_PA_SC_HISZ_CONTROL, + shader->ps.pa_sc_hisz_control); + gfx12_end_context_regs(); + radeon_end(); /* don't track context rolls on GFX12 */ +} + static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) { struct si_shader_info *info = &shader->selector->info; @@ -1884,13 +2014,19 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(info->writes_stencil) | S_02880C_MASK_EXPORT_ENABLE(shader->ps.writes_samplemask) | S_02880C_KILL_ENABLE(si_shader_uses_discard(shader)); + if (sscreen->info.gfx_level >= GFX12) + shader->ps.pa_sc_hisz_control = S_028BBC_ROUND(2); 
/* required minimum value */ switch (info->base.fs.depth_layout) { case FRAG_DEPTH_LAYOUT_GREATER: shader->ps.db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); + if (sscreen->info.gfx_level >= GFX12) + shader->ps.pa_sc_hisz_control |= S_028BBC_CONSERVATIVE_Z_EXPORT(V_028BBC_EXPORT_GREATER_THAN_Z); break; case FRAG_DEPTH_LAYOUT_LESS: shader->ps.db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); + if (sscreen->info.gfx_level >= GFX12) + shader->ps.pa_sc_hisz_control |= S_028BBC_CONSERVATIVE_Z_EXPORT(V_028BBC_EXPORT_LESS_THAN_Z); break; default:; } @@ -2001,22 +2137,30 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) } } - /* Enable PARAM_GEN for point smoothing. - * Gfx11 workaround when there are no PS inputs but LDS is used. - */ - bool param_gen = shader->key.ps.mono.point_smoothing || - (sscreen->info.gfx_level == GFX11 && !shader->ps.num_interp && - shader->config.lds_size); + if (sscreen->info.gfx_level >= GFX12) { + shader->ps.spi_ps_in_control = S_028640_PARAM_GEN(shader->key.ps.mono.point_smoothing) | + S_028640_PS_W32_EN(shader->wave_size == 32); + shader->ps.spi_gs_out_config_ps = S_00B0C4_NUM_INTERP(shader->ps.num_interp); + } else { + /* Enable PARAM_GEN for point smoothing. + * Gfx11 workaround when there are no PS inputs but LDS is used. + */ + bool param_gen = shader->key.ps.mono.point_smoothing || + (sscreen->info.gfx_level == GFX11 && !shader->ps.num_interp && + shader->config.lds_size); - shader->ps.spi_ps_in_control = S_0286D8_NUM_INTERP(shader->ps.num_interp) | - S_0286D8_PARAM_GEN(param_gen) | - S_0286D8_PS_W32_EN(shader->wave_size == 32); + shader->ps.spi_ps_in_control = S_0286D8_NUM_INTERP(shader->ps.num_interp) | + S_0286D8_PARAM_GEN(param_gen) | + S_0286D8_PS_W32_EN(shader->wave_size == 32); + } struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader, NULL); if (!pm4) return; - if (sscreen->info.has_set_context_pairs_packed) + if (sscreen->info.gfx_level >= GFX12) + pm4->atom.emit = gfx12_emit_shader_ps; + else if (sscreen->info.has_set_context_pairs_packed) pm4->atom.emit = gfx11_dgpu_emit_shader_ps; else pm4->atom.emit = gfx6_emit_shader_ps; @@ -2029,7 +2173,12 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } - if (sscreen->info.gfx_level >= GFX11) { + if (sscreen->info.gfx_level >= GFX12) { + si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC4_PS, + S_00B01C_WAVE_LIMIT_GFX12(0x3FF) | + S_00B01C_LDS_GROUP_SIZE_GFX12(1) | + S_00B01C_INST_PREF_SIZE(si_get_shader_prefetch_size(shader))); + } else if (sscreen->info.gfx_level >= GFX11) { unsigned cu_mask_ps = gfx103_get_cu_mask_ps(sscreen); si_pm4_set_reg_idx3(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, @@ -2046,7 +2195,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, S_00B028_VGPRS(si_shader_encode_vgprs(shader)) | S_00B028_SGPRS(si_shader_encode_sgprs(shader)) | - S_00B028_DX10_CLAMP(1) | + S_00B028_DX10_CLAMP(sscreen->info.gfx_level < GFX12) | S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) | S_00B028_FLOAT_MODE(shader->config.float_mode)); si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, @@ -2306,6 +2455,8 @@ static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_sele sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; key->ge.opt.remove_streamout = vs->info.enabled_streamout_buffer_mask && 
!sctx->streamout.enabled_mask; + if (sctx->gfx_level >= GFX12) + key->ge.mono.remove_streamout = key->ge.opt.remove_streamout; } static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, @@ -2316,6 +2467,7 @@ static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_se key->ge.opt.remove_streamout = 0; key->ge.opt.ngg_culling = 0; key->ge.mono.u.vs_export_prim_id = 0; + key->ge.mono.remove_streamout = 0; } void si_ps_key_update_framebuffer(struct si_context *sctx) @@ -3434,8 +3586,8 @@ static void si_update_streamout_state(struct si_context *sctx) sctx->streamout.stride_in_dw = shader_with_so->info.base.xfb_stride; /* GDS must be allocated when any GDS instructions are used, otherwise it hangs. */ - if (sctx->gfx_level >= GFX11 && shader_with_so->info.enabled_streamout_buffer_mask && - !sctx->screen->gds_oa) { + if (sctx->gfx_level >= GFX11 && sctx->gfx_level < GFX12 && + shader_with_so->info.enabled_streamout_buffer_mask && !sctx->screen->gds_oa) { /* Gfx11 only uses GDS OA, not GDS memory. */ simple_mtx_lock(&sctx->screen->gds_mutex); if (!sctx->screen->gds_oa) { @@ -4288,8 +4440,10 @@ static void si_emit_vgt_pipeline_state(struct si_context *sctx, unsigned index) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_opt_set_context_reg(sctx, R_028B54_VGT_SHADER_STAGES_EN, SI_TRACKED_VGT_SHADER_STAGES_EN, - sctx->vgt_shader_stages_en); + radeon_opt_set_context_reg(sctx, sctx->gfx_level >= GFX12 ? + R_028A98_VGT_SHADER_STAGES_EN : + R_028B54_VGT_SHADER_STAGES_EN, + SI_TRACKED_VGT_SHADER_STAGES_EN, sctx->vgt_shader_stages_en); if (sctx->gfx_level == GFX10_3) { /* Legacy Tess+GS should disable reuse to prevent hangs on GFX10.3. */ bool has_legacy_tess_gs = G_028B54_HS_EN(sctx->vgt_shader_stages_en) && @@ -4414,6 +4568,11 @@ static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertic sctx->do_update_shaders = true; } + /* Gfx12 programs patch_vertices in VGT_PRIMITIVE_TYPE.NUM_INPUT_CP. Make sure + * the register is updated. + */ + if (sctx->gfx_level >= GFX12 && sctx->last_prim == MESA_PRIM_PATCHES) + sctx->last_prim = -1; } } @@ -4556,13 +4715,15 @@ void si_update_tess_io_layout_state(struct si_context *sctx) sctx->ls_hs_rsrc2 = ls_hs_rsrc2; sctx->ls_hs_config = S_028B58_NUM_PATCHES(sctx->num_patches_per_workgroup) | - S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); + if (sctx->gfx_level < GFX12) + sctx->ls_hs_config |= S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp); + si_mark_atom_dirty(sctx, &sctx->atoms.s.tess_io_layout); } -static void si_emit_tess_io_layout_state(struct si_context *sctx, unsigned index) +static void gfx6_emit_tess_io_layout_state(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; @@ -4570,7 +4731,20 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx, unsigned index return; radeon_begin(cs); - if (sctx->screen->info.has_set_sh_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + gfx12_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS, + SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2); + + /* Set userdata SGPRs for merged LS-HS. 
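+    * These pushes go through the Gfx12 buffered SH-register path, so they are
+    * coalesced with the draw's other dirty SGPRs into a single SET_SH_REG_PAIRS
+    * packet at draw time rather than being written to the command stream here.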
*/ + gfx12_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT, + sctx->tcs_offchip_layout); + gfx12_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX9_SGPR_TCS_OFFCHIP_ADDR * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR, + sctx->tes_offchip_ring_va_sgpr); + } else if (sctx->screen->info.has_set_sh_pairs_packed) { gfx11_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS, SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2); @@ -4647,6 +4821,46 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx, unsigned index radeon_end_update_context_roll(sctx); } +static void gfx12_emit_tess_io_layout_state(struct si_context *sctx, unsigned index) +{ + struct radeon_cmdbuf *cs = &sctx->gfx_cs; + + if (!sctx->shader.tes.cso || !sctx->shader.tcs.current) + return; + + gfx12_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS, + SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2); + /* Set userdata SGPRs for merged LS-HS. */ + gfx12_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT, + sctx->tcs_offchip_layout); + gfx12_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX9_SGPR_TCS_OFFCHIP_ADDR * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR, + sctx->tes_offchip_ring_va_sgpr); + + /* Set userdata SGPRs for TES. */ + unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; + assert(tes_sh_base); + + /* TES (as ES or VS) reuses the BaseVertex and DrawID user SGPRs that are used when + * tessellation is disabled. We can do that because those user SGPRs are only set in LS + * for tessellation and are unused in TES. + */ + gfx12_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX, + sctx->tcs_offchip_layout); + gfx12_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID, + sctx->tes_offchip_ring_va_sgpr); + + radeon_begin(cs); + radeon_opt_set_context_reg_idx(sctx, R_028B58_VGT_LS_HS_CONFIG, + SI_TRACKED_VGT_LS_HS_CONFIG, 2, sctx->ls_hs_config); + radeon_end(); /* don't track context rolls on GFX12 */ +} + void si_init_screen_live_shader_cache(struct si_screen *sscreen) { util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector, @@ -4657,14 +4871,20 @@ template static void si_emit_spi_map(struct si_context *sctx, unsigned index) { struct si_shader *ps = sctx->shader.ps.current; + struct si_shader *vs = si_get_vs(sctx)->current; unsigned spi_ps_input_cntl[NUM_INTERP]; STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32); + if (sctx->gfx_level >= GFX12) { + gfx12_opt_push_gfx_sh_reg(R_00B0C4_SPI_SHADER_GS_OUT_CONFIG_PS, + SI_TRACKED_SPI_SHADER_GS_OUT_CONFIG_PS, + vs->ngg.spi_vs_out_config | ps->ps.spi_gs_out_config_ps); + } + if (!NUM_INTERP) return; - struct si_shader *vs = si_get_vs(sctx)->current; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; for (unsigned i = 0; i < NUM_INTERP; i++) { @@ -4703,10 +4923,17 @@ static void si_emit_spi_map(struct si_context *sctx, unsigned index) * Dota 2: Only ~16% of SPI map updates set different values. * Talos: Only ~9% of SPI map updates set different values. 
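    * radeon_opt_set_context_regn compares the new values against tracked_regs and
    * emits nothing when they all match, so the redundant updates above only cost a CPU
    * compare. Gfx12 uses the same helper at the relocated 0x28664 offset; the only
    * difference is that no context roll needs to be recorded there.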
*/ - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, - sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP); - radeon_end_update_context_roll(sctx); + if (sctx->gfx_level >= GFX12) { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_regn(sctx, R_028664_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, + sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP); + radeon_end(); /* don't track context rolls on GFX12 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, + sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP); + radeon_end_update_context_roll(sctx); + } } static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index) @@ -4740,9 +4967,11 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index) radeon_set_uconfig_reg_seq(R_030938_VGT_TF_RING_SIZE, 3); radeon_emit(S_030938_SIZE(tf_ring_size_field)); /* R_030938_VGT_TF_RING_SIZE */ radeon_emit(sscreen->hs.hs_offchip_param); /* R_03093C_VGT_HS_OFFCHIP_PARAM */ - radeon_emit(factor_va >> 8); /* R_030940_VGT_TF_MEMORY_BASE */ + radeon_emit(factor_va >> 8); /* R_030940_VGT_TF_MEMORY_BASE */ - if (sctx->gfx_level >= GFX10) + if (sctx->gfx_level >= GFX12) + radeon_set_uconfig_reg(R_03099C_VGT_TF_MEMORY_BASE_HI, S_03099C_BASE_HI(factor_va >> 40)); + else if (sctx->gfx_level >= GFX10) radeon_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(factor_va >> 40)); else if (sctx->gfx_level == GFX9) radeon_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(factor_va >> 40)); @@ -4783,15 +5012,34 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index) radeon_emit(S_585_PWS_ENA(1)); radeon_emit(0); /* GCR_CNTL */ - assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi); + uint64_t attr_address = sscreen->attribute_pos_prim_ring->gpu_address; + assert((attr_address >> 32) == sscreen->info.address32_hi); radeon_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4); radeon_emit(0x12355123); /* SPI_GS_THROTTLE_CNTL1 */ radeon_emit(0x1544D); /* SPI_GS_THROTTLE_CNTL2 */ - radeon_emit(sscreen->attribute_ring->gpu_address >> 16); /* SPI_ATTRIBUTE_RING_BASE */ + radeon_emit(attr_address >> 16); /* SPI_ATTRIBUTE_RING_BASE */ radeon_emit(S_03111C_MEM_SIZE((sscreen->info.attribute_ring_size_per_se >> 16) - 1) | S_03111C_BIG_PAGE(sscreen->info.discardable_allows_big_page) | S_03111C_L1_POLICY(1)); /* SPI_ATTRIBUTE_RING_SIZE */ + + if (sctx->gfx_level >= GFX12) { + uint64_t pos_address = attr_address + sscreen->info.pos_ring_offset; + uint64_t prim_address = attr_address + sscreen->info.prim_ring_offset; + + /* When one of these 4 registers is updated, all 4 must be updated. 
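+       * The BASE registers take the ring address in 64KB granules (address >> 16);
+       * for example, a position ring placed at VA 0x200000000 is programmed as
+       * GE_POS_RING_BASE = 0x20000. Both rings are carved out of the single
+       * attribute/pos/prim allocation via pos_ring_offset and prim_ring_offset.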
*/ + radeon_set_uconfig_reg_seq(R_0309A0_GE_POS_RING_BASE, 4); + radeon_emit(pos_address >> 16); /* R_0309A0_GE_POS_RING_BASE */ + radeon_emit(S_0309A4_MEM_SIZE(sscreen->info.pos_ring_size_per_se >> 5)); /* R_0309A4_GE_POS_RING_SIZE */ + radeon_emit(prim_address >> 16); /* R_0309A8_GE_PRIM_RING_BASE */ + radeon_emit(S_0309AC_MEM_SIZE(sscreen->info.prim_ring_size_per_se >> 5) | + S_0309AC_SCOPE(gfx12_scope_device) | + S_0309AC_PAF_TEMPORAL(gfx12_store_high_temporal_stay_dirty) | + S_0309AC_PAB_TEMPORAL(gfx12_load_last_use_discard) | + S_0309AC_SPEC_DATA_READ(gfx12_spec_read_auto) | + S_0309AC_FORCE_SE_SCOPE(1) | + S_0309AC_PAB_NOFILL(1)); /* R_0309AC_GE_PRIM_RING_SIZE */ + } radeon_end(); } } @@ -4800,9 +5048,13 @@ void si_init_shader_functions(struct si_context *sctx) { sctx->atoms.s.vgt_pipeline_state.emit = si_emit_vgt_pipeline_state; sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; - sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state; sctx->atoms.s.spi_ge_ring_state.emit = si_emit_spi_ge_ring_state; + if (sctx->gfx_level >= GFX12) + sctx->atoms.s.tess_io_layout.emit = gfx12_emit_tess_io_layout_state; + else + sctx->atoms.s.tess_io_layout.emit = gfx6_emit_tess_io_layout_state; + sctx->b.create_vs_state = si_create_shader; sctx->b.create_tcs_state = si_create_shader; sctx->b.create_tes_state = si_create_shader; diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 84ed0c4233f..d42d6318767 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -67,6 +67,9 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ if (!old_num_targets && !num_targets) return; + if (sctx->gfx_level >= GFX12) + si_set_internal_shader_buffer(sctx, SI_STREAMOUT_STATE_BUF, NULL); + /* We are going to unbind the buffers. Mark which caches need to be flushed. */ if (old_num_targets && sctx->streamout.begin_emitted) { /* Stop streamout. */ @@ -97,6 +100,11 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ */ sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; + + /* Make the streamout state buffer available to the CP for resuming and DrawTF. */ + if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) + sctx->flags |= SI_CONTEXT_WB_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } @@ -105,7 +113,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ * spec@ext_transform_feedback@immediate-reuse-index-buffer * spec@ext_transform_feedback@immediate-reuse-uniform-buffer */ - if (sctx->gfx_level >= GFX11 && old_num_targets) + if (sctx->gfx_level >= GFX11 && sctx->gfx_level < GFX12 && old_num_targets) si_flush_gfx_cs(sctx, 0, NULL); /* Streamout buffers must be bound in 2 places: @@ -129,11 +137,55 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ /* Allocate space for the filled buffer size. */ struct si_streamout_target *t = sctx->streamout.targets[i]; - if (!t->buf_filled_size) { - unsigned buf_filled_size_size = sctx->gfx_level >= GFX11 ? 
8 : 4; - u_suballocator_alloc(&sctx->allocator_zeroed_memory, buf_filled_size_size, 4, - &t->buf_filled_size_offset, - (struct pipe_resource **)&t->buf_filled_size); + + if (sctx->gfx_level >= GFX12) { + bool first_target = util_bitcount(enabled_mask) == 1; + + /* The first enabled streamout target will contain the ordered ID/offset buffer for all + * targets. + */ + if (first_target && !append_bitmask) { + /* The layout is: + * struct { + * struct { + * uint32_t ordered_id; // equal for all buffers + * uint32_t dwords_written; + * } buffer[4]; + * }; + * + * The buffer must be initialized to 0 and the address must be aligned to 64 + * because it's faster when the atomic doesn't straddle a 64B block boundary. + */ + unsigned alloc_size = 32; + unsigned alignment = 64; + + si_resource_reference(&t->buf_filled_size, NULL); + u_suballocator_alloc(&sctx->allocator_zeroed_memory, alloc_size, alignment, + &t->buf_filled_size_offset, + (struct pipe_resource **)&t->buf_filled_size); + + /* Offset to dwords_written of the first enabled streamout buffer. */ + t->buf_filled_size_draw_count_offset = t->buf_filled_size_offset + i * 8 + 4; + } + + if (first_target) { + struct pipe_shader_buffer sbuf; + sbuf.buffer = &t->buf_filled_size->b.b; + sbuf.buffer_offset = t->buf_filled_size_offset; + sbuf.buffer_size = 32; /* unused, the shader only uses the low 32 bits of the address */ + + si_set_internal_shader_buffer(sctx, SI_STREAMOUT_STATE_BUF, &sbuf); + } + } else { + /* GFX6-11 */ + if (!t->buf_filled_size) { + unsigned alloc_size = sctx->gfx_level >= GFX11 ? 8 : 4; + + u_suballocator_alloc(&sctx->allocator_zeroed_memory, alloc_size, 4, + &t->buf_filled_size_offset, + (struct pipe_resource **)&t->buf_filled_size); + t->buf_filled_size_draw_count_offset = t->buf_filled_size_offset; + } } /* Bind it to the shader. */ @@ -156,6 +208,11 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); } + /* Either streamout is being resumed for all targets or none. Required by how we implement it + * for GFX12. + */ + assert(!append_bitmask || enabled_mask == append_bitmask); + if (!!sctx->streamout.enabled_mask != !!enabled_mask) sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */ @@ -219,6 +276,7 @@ static void si_emit_streamout_begin(struct si_context *sctx, unsigned index) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct si_streamout_target **t = sctx->streamout.targets; + bool first_target = true; if (sctx->gfx_level < GFX11) si_flush_vgt_streamout(sctx); @@ -229,7 +287,22 @@ static void si_emit_streamout_begin(struct si_context *sctx, unsigned index) t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i]; - if (sctx->gfx_level >= GFX11) { + if (sctx->gfx_level >= GFX12) { + /* Only the first streamout target holds information. */ + if (first_target) { + if (sctx->streamout.append_bitmask & (1 << i)) { + si_cp_copy_data(sctx, cs, COPY_DATA_REG, NULL, + R_0309B0_GE_GS_ORDERED_ID_BASE >> 2, COPY_DATA_SRC_MEM, + t[i]->buf_filled_size, t[i]->buf_filled_size_offset); + } else { + radeon_begin(cs); + radeon_set_uconfig_reg(R_0309B0_GE_GS_ORDERED_ID_BASE, 0); + radeon_end(); + } + + first_target = false; + } + } else if (sctx->gfx_level >= GFX11) { if (sctx->streamout.append_bitmask & (1 << i)) { /* Restore the register value. 
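       * The Gfx12 branch above does the equivalent restore by loading
       * GE_GS_ORDERED_ID_BASE from the ordered_id at offset 0 of the state buffer
       * (it is equal for all buffers); draw counts are instead read from
       * dwords_written, which is why buf_filled_size_draw_count_offset points at
       * base + i * 8 + 4 in the layout described earlier.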
*/ si_cp_copy_data(sctx, cs, COPY_DATA_REG, NULL, @@ -289,6 +362,14 @@ void si_emit_streamout_end(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct si_streamout_target **t = sctx->streamout.targets; + if (sctx->gfx_level >= GFX12) { + /* Nothing to do. The streamout state buffer already contains the next ordered ID, which + * is the only thing we need to restore. + */ + sctx->streamout.begin_emitted = false; + return; + } + if (sctx->gfx_level >= GFX11) { /* Wait for streamout to finish before reading GDS_STRMOUT registers. */ sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 92f0878cafb..18e9302e8ea 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -8,7 +8,8 @@ #include "util/u_upload_mgr.h" #include "util/u_viewport.h" -#define SI_MAX_SCISSOR 16384 +#define GFX6_MAX_VIEWPORT_SIZE 16384 +#define GFX12_MAX_VIEWPORT_SIZE 32768 /* TODO: this should be 64K, but maxx/maxy doesn't have enough bits */ static void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out) { @@ -97,7 +98,11 @@ static void si_emit_cull_state(struct si_context *sctx, unsigned index) radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf, RADEON_USAGE_READ | RADEON_PRIO_CONST_BUFFER); - if (sctx->screen->info.has_set_sh_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + gfx12_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + + GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4, + sctx->small_prim_cull_info_address); + } else if (sctx->screen->info.has_set_sh_pairs_packed) { gfx11_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4, sctx->small_prim_cull_info_address); @@ -195,10 +200,12 @@ static void si_get_scissor_from_viewport(struct si_context *ctx, static void si_clamp_scissor(struct si_context *ctx, struct pipe_scissor_state *out, struct si_signed_scissor *scissor) { - out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR); - out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR); - out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR); - out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR); + unsigned max_scissor = ctx->gfx_level >= GFX12 ? GFX12_MAX_VIEWPORT_SIZE : GFX6_MAX_VIEWPORT_SIZE; + + out->minx = CLAMP(scissor->minx, 0, max_scissor); + out->miny = CLAMP(scissor->miny, 0, max_scissor); + out->maxx = CLAMP(scissor->maxx, 0, max_scissor); + out->maxy = CLAMP(scissor->maxy, 0, max_scissor); } static void si_clip_scissor(struct pipe_scissor_state *out, struct pipe_scissor_state *clip) @@ -226,7 +233,7 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs if (ctx->vs_disables_clipping_viewport) { final.minx = final.miny = 0; - final.maxx = final.maxy = SI_MAX_SCISSOR; + final.maxx = final.maxy = ctx->gfx_level >= GFX12 ? GFX12_MAX_VIEWPORT_SIZE : GFX6_MAX_VIEWPORT_SIZE; } else { si_clamp_scissor(ctx, &final, vp_scissor); } @@ -235,16 +242,27 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs si_clip_scissor(&final, scissor); radeon_begin(cs); - /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and - * any_scissor.BR_X/Y <= 0. 
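    * (Gfx12 does not need this workaround, but its BR_X/BR_Y bounds are inclusive,
    * so an empty scissor is instead expressed in the Gfx12 branch below as TL=(1,1),
    * BR=(0,0).)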
- */ - if (ctx->gfx_level == GFX6 && (final.maxx == 0 || final.maxy == 0)) { - radeon_emit(S_028250_TL_X(1) | S_028250_TL_Y_GFX6(1) | S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(S_028254_BR_X(1) | S_028254_BR_Y(1)); + if (ctx->gfx_level >= GFX12) { + if (final.maxx == 0 || final.maxy == 0) { + /* An empty scissor must be done like this because the bottom-right bounds are inclusive. */ + radeon_emit(S_028250_TL_X(1) | S_028250_TL_Y_GFX12(1)); + radeon_emit(S_028254_BR_X(0) | S_028254_BR_Y(0)); + } else { + radeon_emit(S_028250_TL_X(final.minx) | S_028250_TL_Y_GFX12(final.miny)); + radeon_emit(S_028254_BR_X(final.maxx - 1) | S_028254_BR_Y(final.maxy - 1)); + } } else { - radeon_emit(S_028250_TL_X(final.minx) | S_028250_TL_Y_GFX6(final.miny) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy)); + /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and + * any_scissor.BR_X/Y <= 0. + */ + if (ctx->gfx_level == GFX6 && (final.maxx == 0 || final.maxy == 0)) { + radeon_emit(S_028250_TL_X(1) | S_028250_TL_Y_GFX6(1) | S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(S_028254_BR_X(1) | S_028254_BR_Y(1)); + } else { + radeon_emit(S_028250_TL_X(final.minx) | S_028250_TL_Y_GFX6(final.miny) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy)); + } } radeon_end(); } @@ -284,7 +302,7 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index) const unsigned hw_screen_offset_alignment = sctx->gfx_level >= GFX11 ? 32 : sctx->gfx_level >= GFX8 ? 16 : MAX2(sctx->screen->se_tile_repeat, 16); - const unsigned max_hw_screen_offset = 8176; + const unsigned max_hw_screen_offset = sctx->gfx_level >= GFX12 ? 32752 : 8176; /* Indexed by quantization modes */ static int max_viewport_size[] = {65536, 16384, 4096}; @@ -367,7 +385,21 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index) * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */ - if (sctx->screen->info.has_set_context_pairs_packed) { + if (sctx->gfx_level >= GFX12) { + radeon_begin(&sctx->gfx_cs); + gfx12_begin_context_regs(); + gfx12_opt_set_context_reg(R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL, + pa_su_vtx_cntl); + gfx12_opt_set_context_reg4(R_02842C_PA_CL_GB_VERT_CLIP_ADJ, + SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, + fui(guardband_y), fui(discard_y), + fui(guardband_x), fui(discard_x)); + gfx12_opt_set_context_reg(R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, + SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, + pa_su_hardware_screen_offset); + gfx12_end_context_regs(); + radeon_end(); /* don't track context rolls on GFX12 */ + } else if (sctx->screen->info.has_set_context_pairs_packed) { radeon_begin(&sctx->gfx_cs); gfx11_begin_packed_context_regs(); gfx11_opt_set_context_reg(R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL, @@ -489,7 +521,7 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); } -static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_state *state) +static void gfx6_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_state *state) { struct radeon_cmdbuf *cs = &ctx->gfx_cs; @@ -503,7 +535,7 @@ static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_st radeon_end(); } -static void si_emit_viewports(struct si_context *ctx) +static void 
gfx6_emit_viewports(struct si_context *ctx) { struct radeon_cmdbuf *cs = &ctx->gfx_cs; struct pipe_viewport_state *states = ctx->viewports.states; @@ -514,7 +546,7 @@ static void si_emit_viewports(struct si_context *ctx) radeon_set_context_reg_seq(R_02843C_PA_CL_VPORT_XSCALE, 6); radeon_end(); - si_emit_one_viewport(ctx, &states[0]); + gfx6_emit_one_viewport(ctx, &states[0]); return; } @@ -526,7 +558,7 @@ static void si_emit_viewports(struct si_context *ctx) radeon_end(); for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) - si_emit_one_viewport(ctx, &states[i]); + gfx6_emit_one_viewport(ctx, &states[i]); } static inline void si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, @@ -540,7 +572,7 @@ static inline void si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, b util_viewport_zmin_zmax(vp, halfz, zmin, zmax); } -static void si_emit_depth_ranges(struct si_context *ctx) +static void gfx6_emit_depth_ranges(struct si_context *ctx) { struct radeon_cmdbuf *cs = &ctx->gfx_cs; struct pipe_viewport_state *states = ctx->viewports.states; @@ -573,10 +605,57 @@ static void si_emit_depth_ranges(struct si_context *ctx) radeon_end(); } -static void si_emit_viewport_states(struct si_context *ctx, unsigned index) +static void gfx6_emit_viewport_states(struct si_context *ctx, unsigned index) { - si_emit_viewports(ctx); - si_emit_depth_ranges(ctx); + gfx6_emit_viewports(ctx); + gfx6_emit_depth_ranges(ctx); +} + +static void gfx12_emit_viewport_states(struct si_context *ctx, unsigned index) +{ + struct radeon_cmdbuf *cs = &ctx->gfx_cs; + struct pipe_viewport_state *states = ctx->viewports.states; + bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz; + bool window_space = ctx->vs_disables_clipping_viewport; + float zmin, zmax; + + /* The simple case: Only 1 viewport is active. */ + if (!ctx->vs_writes_viewport_index) { + si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax); + + radeon_begin(cs); + radeon_set_context_reg_seq(R_02843C_PA_CL_VPORT_XSCALE, 8); + radeon_emit(fui(states[0].scale[0])); + radeon_emit(fui(states[0].translate[0])); + radeon_emit(fui(states[0].scale[1])); + radeon_emit(fui(states[0].translate[1])); + radeon_emit(fui(states[0].scale[2])); + radeon_emit(fui(states[0].translate[2])); + radeon_emit(fui(zmin)); + radeon_emit(fui(zmax)); + radeon_end(); + return; + } + + /* All registers in the array need to be updated if any of them is changed. + * This is (or was) a hardware requirement. 
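+    * Each Gfx12 viewport is eight consecutive dwords (x/y/z scale and translate
+    * interleaved, then zmin/zmax), so the single-viewport fast path above writes 8
+    * dwords while this loop rewrites SI_MAX_VIEWPORTS * 8. Folding zmin/zmax into
+    * the same register block is what lets this path drop the separate depth-range
+    * emit that the Gfx6 code still does.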
+ */ + radeon_begin(cs); + radeon_set_context_reg_seq(R_02843C_PA_CL_VPORT_XSCALE, SI_MAX_VIEWPORTS * 8); + + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { + si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax); + + radeon_emit(fui(states[i].scale[0])); + radeon_emit(fui(states[i].translate[0])); + radeon_emit(fui(states[i].scale[1])); + radeon_emit(fui(states[i].translate[1])); + radeon_emit(fui(states[i].scale[2])); + radeon_emit(fui(states[i].translate[2])); + radeon_emit(fui(zmin)); + radeon_emit(fui(zmax)); + } + radeon_end(); } /** @@ -661,20 +740,43 @@ static void si_emit_window_rectangles(struct si_context *sctx, unsigned index) else rule = outside[num_rectangles - 1]; - radeon_begin(cs); - radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE, - rule); - if (num_rectangles == 0) { - radeon_end(); - return; - } + if (sctx->gfx_level >= GFX12) { + radeon_begin(cs); + gfx12_begin_context_regs(); + gfx12_opt_set_context_reg(R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE, rule); - radeon_set_context_reg_seq(R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2); - for (unsigned i = 0; i < num_rectangles; i++) { - radeon_emit(S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny)); - radeon_emit(S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy)); + if (num_rectangles) { + radeon_set_context_reg_seq(R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2); + for (unsigned i = 0; i < num_rectangles; i++) { + gfx12_set_context_reg(R_028210_PA_SC_CLIPRECT_0_TL + i * 8, + S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny)); + gfx12_set_context_reg(R_028214_PA_SC_CLIPRECT_0_BR + i * 8, + S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy)); + } + + for (unsigned i = 0; i < num_rectangles; i++) { + gfx12_set_context_reg(R_028374_PA_SC_CLIPRECT_0_EXT + i * 4, + S_028374_TL_X_EXT(rects[i].minx >> 15) | + S_028374_TL_Y_EXT(rects[i].miny >> 15) | + S_028374_BR_X_EXT(rects[i].maxx >> 15) | + S_028374_BR_Y_EXT(rects[i].maxy >> 15)); + } + } + gfx12_end_context_regs(); + radeon_end(); + } else { + radeon_begin(cs); + radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE, + rule); + if (num_rectangles) { + radeon_set_context_reg_seq(R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2); + for (unsigned i = 0; i < num_rectangles; i++) { + radeon_emit(S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny)); + radeon_emit(S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy)); + } + } + radeon_end(); } - radeon_end(); } static void si_set_window_rectangles(struct pipe_context *ctx, bool include, @@ -696,7 +798,10 @@ void si_init_viewport_functions(struct si_context *ctx) { ctx->atoms.s.guardband.emit = si_emit_guardband; ctx->atoms.s.scissors.emit = si_emit_scissors; - ctx->atoms.s.viewports.emit = si_emit_viewport_states; + if (ctx->gfx_level >= GFX12) + ctx->atoms.s.viewports.emit = gfx12_emit_viewport_states; + else + ctx->atoms.s.viewports.emit = gfx6_emit_viewport_states; ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles; ctx->atoms.s.ngg_cull_state.emit = si_emit_cull_state; diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c index 3abc5a06fbb..a749c06829c 100644 --- a/src/gallium/drivers/radeonsi/si_texture.c +++ b/src/gallium/drivers/radeonsi/si_texture.c @@ -214,115 +214,154 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac 
       assert(util_is_power_of_two_or_zero(bpe));
    }
 
-   if (!is_flushed_depth && is_depth) {
-      flags |= RADEON_SURF_ZBUFFER;
+   if (sscreen->info.gfx_level >= GFX12) {
+      if (!is_flushed_depth && is_depth) {
+         flags |= RADEON_SURF_ZBUFFER;
+         if (is_stencil)
+            flags |= RADEON_SURF_SBUFFER;
 
-      if ((sscreen->debug_flags & DBG(NO_HYPERZ)) ||
-          (ptex->bind & PIPE_BIND_SHARED) || is_imported) {
-         flags |= RADEON_SURF_NO_HTILE;
-      } else if (tc_compatible_htile &&
-                 (sscreen->info.gfx_level >= GFX9 || array_mode == RADEON_SURF_MODE_2D)) {
-         /* TC-compatible HTILE only supports Z32_FLOAT.
-          * GFX9 also supports Z16_UNORM.
-          * On GFX8, promote Z16 to Z32. DB->CB copies will convert
-          * the format for transfers.
-          */
-         if (sscreen->info.gfx_level == GFX8)
-            bpe = 4;
+         if (sscreen->debug_flags & DBG(NO_HYPERZ) ||
+             ptex->flags & PIPE_RESOURCE_FLAG_SPARSE)
+            flags |= RADEON_SURF_NO_HTILE;
+      }
+   } else {
+      /* Gfx6-11 */
+      if (!is_flushed_depth && is_depth) {
+         flags |= RADEON_SURF_ZBUFFER;
 
-         flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+         if ((sscreen->debug_flags & DBG(NO_HYPERZ)) ||
+             (ptex->bind & PIPE_BIND_SHARED) || is_imported) {
+            flags |= RADEON_SURF_NO_HTILE;
+         } else if (tc_compatible_htile &&
+                    (sscreen->info.gfx_level >= GFX9 || array_mode == RADEON_SURF_MODE_2D)) {
+            /* TC-compatible HTILE only supports Z32_FLOAT.
+             * GFX9 also supports Z16_UNORM.
+             * On GFX8, promote Z16 to Z32. DB->CB copies will convert
+             * the format for transfers.
+             */
+            if (sscreen->info.gfx_level == GFX8)
+               bpe = 4;
+
+            flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+         }
+
+         if (is_stencil)
+            flags |= RADEON_SURF_SBUFFER;
       }
 
-      if (is_stencil)
-         flags |= RADEON_SURF_SBUFFER;
-   }
-
-   /* Disable DCC? (it can't be disabled if modifiers are used) */
-   if (sscreen->info.gfx_level >= GFX8 && modifier == DRM_FORMAT_MOD_INVALID && !is_imported) {
-      /* Global options that disable DCC. */
-      if (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC)
-         flags |= RADEON_SURF_DISABLE_DCC;
-
-      if (ptex->nr_samples >= 2 && sscreen->debug_flags & DBG(NO_DCC_MSAA))
-         flags |= RADEON_SURF_DISABLE_DCC;
-
-      /* Shared textures must always set up DCC. If it's not present, it will be disabled by
-       * si_get_opaque_metadata later.
-       */
-      if (!is_imported && sscreen->debug_flags & DBG(NO_DCC))
-         flags |= RADEON_SURF_DISABLE_DCC;
-
-      /* R9G9B9E5 isn't supported for rendering by older generations. */
-      if (sscreen->info.gfx_level < GFX10_3 &&
-          ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
-         flags |= RADEON_SURF_DISABLE_DCC;
-
-      /* If constant (non-data-dependent) format is requested, disable DCC: */
-      if (ptex->bind & PIPE_BIND_CONST_BW)
-         flags |= RADEON_SURF_DISABLE_DCC;
-
-      switch (sscreen->info.gfx_level) {
-      case GFX8:
-         /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
-         if (sscreen->info.family == CHIP_STONEY && bpe == 16 && ptex->nr_samples >= 2)
+      /* Disable DCC? (it can't be disabled if modifiers are used) */
+      if (sscreen->info.gfx_level >= GFX8 && modifier == DRM_FORMAT_MOD_INVALID && !is_imported) {
+         /* Global options that disable DCC. */
+         if (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC)
            flags |= RADEON_SURF_DISABLE_DCC;
 
-      /* DCC clear for 4x and 8x MSAA array textures unimplemented. */
-      if (ptex->nr_storage_samples >= 4 && ptex->array_size > 1)
+         if (ptex->nr_samples >= 2 && sscreen->debug_flags & DBG(NO_DCC_MSAA))
            flags |= RADEON_SURF_DISABLE_DCC;
-      break;
 
-   case GFX9:
-      /* DCC MSAA fails this on Raven:
-       * https://www.khronos.org/registry/webgl/sdk/tests/deqp/functional/gles3/fbomultisample.2_samples.html
-       * and this on Picasso:
-       * https://www.khronos.org/registry/webgl/sdk/tests/deqp/functional/gles3/fbomultisample.4_samples.html
+         /* Shared textures must always set up DCC. If it's not present, it will be disabled by
+          * si_get_opaque_metadata later.
          */
-      if (sscreen->info.family == CHIP_RAVEN && ptex->nr_storage_samples >= 2 && bpe < 4)
+         if (!is_imported && sscreen->debug_flags & DBG(NO_DCC))
            flags |= RADEON_SURF_DISABLE_DCC;
 
-      /* Vega10 fails these 2x and 4x MSAA tests with DCC:
-       * piglit/bin/ext_framebuffer_multisample-formats 2 GL_EXT_texture_snorm
-       * piglit/bin/ext_framebuffer_multisample-formats 4 GL_EXT_texture_snorm
+         /* R9G9B9E5 isn't supported for rendering by older generations. */
+         if (sscreen->info.gfx_level < GFX10_3 &&
+             ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
+            flags |= RADEON_SURF_DISABLE_DCC;
+
+         /* If constant (non-data-dependent) format is requested, disable DCC: */
+         if (ptex->bind & PIPE_BIND_CONST_BW)
+            flags |= RADEON_SURF_DISABLE_DCC;
+
+         switch (sscreen->info.gfx_level) {
+         case GFX8:
+            /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
+            if (sscreen->info.family == CHIP_STONEY && bpe == 16 && ptex->nr_samples >= 2)
+               flags |= RADEON_SURF_DISABLE_DCC;
+
+            /* DCC clear for 4x and 8x MSAA array textures unimplemented. */
+            if (ptex->nr_storage_samples >= 4 && ptex->array_size > 1)
+               flags |= RADEON_SURF_DISABLE_DCC;
+            break;
+
+         case GFX9:
+            /* DCC MSAA fails this on Raven:
+             * https://www.khronos.org/registry/webgl/sdk/tests/deqp/functional/gles3/fbomultisample.2_samples.html
+             * and this on Picasso:
+             * https://www.khronos.org/registry/webgl/sdk/tests/deqp/functional/gles3/fbomultisample.4_samples.html
+             */
+            if (sscreen->info.family == CHIP_RAVEN && ptex->nr_storage_samples >= 2 && bpe < 4)
+               flags |= RADEON_SURF_DISABLE_DCC;
+
+            /* Vega10 fails these 2x and 4x MSAA tests with DCC:
+             * piglit/bin/ext_framebuffer_multisample-formats 2 GL_EXT_texture_snorm
+             * piglit/bin/ext_framebuffer_multisample-formats 4 GL_EXT_texture_snorm
+             */
+            if ((ptex->nr_storage_samples == 2 || ptex->nr_storage_samples == 4) && bpe <= 2 &&
+                util_format_is_snorm(ptex->format))
+               flags |= RADEON_SURF_DISABLE_DCC;
+
+            /* Vega10 fails these MSAA tests with DCC:
+             * piglit/bin/ext_framebuffer_multisample-formats 2 GL_ARB_texture_float
+             * piglit/bin/ext_framebuffer_multisample-formats 2 GL_ARB_texture_rg-float
+             */
+            if (ptex->nr_storage_samples == 2 && bpe == 2 && util_format_is_float(ptex->format))
+               flags |= RADEON_SURF_DISABLE_DCC;
+
+            /* We allow S8_UINT as a color format, and piglit/draw-pixels fails if we enable DCC. */
+            if (ptex->format == PIPE_FORMAT_S8_UINT)
+               flags |= RADEON_SURF_DISABLE_DCC;
+            break;
+
+         case GFX10:
+         case GFX10_3:
+            if (ptex->nr_storage_samples >= 2 && !sscreen->options.dcc_msaa)
+               flags |= RADEON_SURF_DISABLE_DCC;
+
+            /* Navi10 fails these MSAA tests with DCC:
+             * piglit/bin/arb_sample_shading-samplemask 2 all all
+             * piglit/bin/arb_sample_shading-samplemask 4 all all
+             * piglit/bin/ext_framebuffer_multisample-formats 2 GL_ARB_texture_float
+             * piglit/bin/ext_framebuffer_multisample-formats 2 GL_EXT_texture_integer
+             */
+            if (sscreen->info.gfx_level == GFX10 &&
+                (ptex->nr_storage_samples == 2 || ptex->nr_storage_samples == 4))
+               flags |= RADEON_SURF_DISABLE_DCC;
+            break;
+
+         case GFX11:
+         case GFX11_5:
+            break;
+
+         default:
+            assert(0);
+         }
+      }
+
+      if (sscreen->debug_flags & DBG(NO_FMASK))
+         flags |= RADEON_SURF_NO_FMASK;
+
+      if (sscreen->info.gfx_level == GFX9 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) {
+         flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE;
+         surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags);
+      }
+
+      if (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING) {
+         /* GFX11 shouldn't get here because the flag is only used by the CB MSAA resolving
+          * that GFX11 doesn't have.
          */
-      if ((ptex->nr_storage_samples == 2 || ptex->nr_storage_samples == 4) && bpe <= 2 &&
-          util_format_is_snorm(ptex->format))
-         flags |= RADEON_SURF_DISABLE_DCC;
+         assert(sscreen->info.gfx_level <= GFX10_3);
 
-      /* Vega10 fails these MSAA tests with DCC:
-       * piglit/bin/ext_framebuffer_multisample-formats 2 GL_ARB_texture_float
-       * piglit/bin/ext_framebuffer_multisample-formats 2 GL_ARB_texture_rg-float
-       */
-      if (ptex->nr_storage_samples == 2 && bpe == 2 && util_format_is_float(ptex->format))
-         flags |= RADEON_SURF_DISABLE_DCC;
+         flags |= RADEON_SURF_FORCE_SWIZZLE_MODE;
 
-      /* We allow S8_UINT as a color format, and piglit/draw-pixels fails if we enable DCC. */
-      if (ptex->format == PIPE_FORMAT_S8_UINT)
-         flags |= RADEON_SURF_DISABLE_DCC;
-      break;
+         if (sscreen->info.gfx_level >= GFX10)
+            surface->u.gfx9.swizzle_mode = ADDR_SW_64KB_R_X;
+      }
 
-   case GFX10:
-   case GFX10_3:
-      if (ptex->nr_storage_samples >= 2 && !sscreen->options.dcc_msaa)
-         flags |= RADEON_SURF_DISABLE_DCC;
-
-      /* Navi10 fails these MSAA tests with DCC:
-       * piglit/bin/arb_sample_shading-samplemask 2 all all
-       * piglit/bin/arb_sample_shading-samplemask 4 all all
-       * piglit/bin/ext_framebuffer_multisample-formats 2 GL_ARB_texture_float
-       * piglit/bin/ext_framebuffer_multisample-formats 2 GL_EXT_texture_integer
-       */
-      if (sscreen->info.gfx_level == GFX10 &&
-          (ptex->nr_storage_samples == 2 || ptex->nr_storage_samples == 4))
-         flags |= RADEON_SURF_DISABLE_DCC;
-      break;
-
-   case GFX11:
-   case GFX11_5:
-      break;
-
-   default:
-      assert(0);
+      if (ptex->flags & PIPE_RESOURCE_FLAG_SPARSE) {
+         flags |= RADEON_SURF_NO_FMASK |
+                  RADEON_SURF_NO_HTILE |
+                  RADEON_SURF_DISABLE_DCC;
+      }
    }
@@ -336,35 +375,12 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac
    if (ptex->bind & PIPE_BIND_SHARED)
       flags |= RADEON_SURF_SHAREABLE;
+
    if (is_imported)
       flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
 
-   if (sscreen->debug_flags & DBG(NO_FMASK))
-      flags |= RADEON_SURF_NO_FMASK;
-
-   if (sscreen->info.gfx_level == GFX9 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) {
-      flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE;
-      surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags);
-   }
-
-   if (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING) {
-      /* GFX11 shouldn't get here because the flag is only used by the CB MSAA resolving
-       * that GFX11 doesn't have.
-       */
-      assert(sscreen->info.gfx_level <= GFX10_3);
-
-      flags |= RADEON_SURF_FORCE_SWIZZLE_MODE;
-
-      if (sscreen->info.gfx_level >= GFX10)
-         surface->u.gfx9.swizzle_mode = ADDR_SW_64KB_R_X;
-   }
-
-   if (ptex->flags & PIPE_RESOURCE_FLAG_SPARSE) {
-      flags |=
-         RADEON_SURF_PRT |
-         RADEON_SURF_NO_FMASK |
-         RADEON_SURF_NO_HTILE |
-         RADEON_SURF_DISABLE_DCC;
-   }
+   if (ptex->flags & PIPE_RESOURCE_FLAG_SPARSE)
+      flags |= RADEON_SURF_PRT;
 
    surface->modifier = modifier;
 
@@ -1064,6 +1080,26 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen,
       u_log_context_destroy(&log);
    }
 
+   if (sscreen->info.gfx_level >= GFX12) {
+      if (tex->is_depth) {
+         /* Z24 is no longer supported. We should use Z32_FLOAT instead. */
+         if (base->format == PIPE_FORMAT_Z16_UNORM) {
+            tex->db_render_format = base->format;
+         } else {
+            tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+            tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
+                                  base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
+         }
+
+         tex->db_compatible = true;
+         tex->can_sample_z = true;
+         tex->can_sample_s = true;
+      }
+      return tex;
+   }
+
+   /* Everything below is for GFX6-11. */
+
    /* Use 1.0 as the default clear value to get optimal ZRANGE_PRECISION if we don't
     * get a fast clear.
     */
@@ -1326,7 +1362,7 @@ si_texture_create_with_modifier(struct pipe_screen *screen,
    bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH ||
                            templ->flags & SI_RESOURCE_FLAG_FORCE_LINEAR;
    bool tc_compatible_htile =
-      sscreen->info.gfx_level >= GFX8 &&
+      sscreen->info.gfx_level >= GFX8 && sscreen->info.gfx_level < GFX12 &&
       /* There are issues with TC-compatible HTILE on Tonga (and
        * Iceland is the same design), and documented bug workarounds
        * don't help. For example, this fails: