diff --git a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c index eae44acf414..0319ae0bf1b 100644 --- a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c +++ b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c @@ -78,6 +78,7 @@ void si_init_cp_reg_shadowing(struct si_context *sctx) ac_emulate_clear_state(&sctx->screen->info, &sctx->gfx_cs, si_set_context_reg_array); /* TODO: Gfx11 fails GLCTS if we don't re-emit the preamble at the beginning of every IB. */ + /* TODO: Skipping this may have made register shadowing slower on Gfx11. */ if (sctx->gfx_level < GFX11) { si_pm4_emit_commands(sctx, sctx->cs_preamble_state); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index bd2ceceef98..87a576f6011 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -2134,7 +2134,7 @@ si_update_ngg_prim_state_sgpr(struct si_context *sctx, struct si_shader *hw_vs, /* Set the primitive type seen by the rasterizer. GS and tessellation affect this. * It's expected that hw_vs and ngg are inline constants in draw_vbo after optimizations. */ -static inline void +static ALWAYS_INLINE void si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim, struct si_shader *hw_vs, bool ngg) { diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 133bdce92df..b099aed6c66 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5090,7 +5090,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, if (sscreen->info.gfx_level >= GFX10) { const struct gfx10_format *fmt = &ac_get_gfx10_format_table(&sscreen->info)[elements[i].src_format]; - unsigned last_vertex_format = sscreen->info.gfx_level >= GFX11 ? 64 : 128; + ASSERTED unsigned last_vertex_format = sscreen->info.gfx_level >= GFX11 ? 
64 : 128; assert(fmt->img_format != 0 && fmt->img_format < last_vertex_format); v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_RESOURCE_LEVEL(sscreen->info.gfx_level < GFX11); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 9b9858091a2..adf04519298 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1074,7 +1074,7 @@ static void si_emit_draw_registers(struct si_context *sctx, radeon_begin(cs); if (prim != sctx->last_prim) { - unsigned vgt_prim = si_conv_pipe_prim(prim); + unsigned vgt_prim = HAS_TESS ? V_008958_DI_PT_PATCH : si_conv_pipe_prim(prim); if (GFX_VERSION >= GFX10) radeon_set_uconfig_reg(R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index eb2c136e2a8..7e023fa3524 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1289,7 +1289,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader gs_info->base.vs.window_space_position : 0; bool es_enable_prim_id = shader->key.ge.mono.u.vs_export_prim_id || es_info->uses_primid; unsigned gs_num_invocations = gs_sel->stage == MESA_SHADER_GEOMETRY ? - MAX2(gs_info->base.gs.invocations, 1) : 0; + CLAMP(gs_info->base.gs.invocations, 1, 32) : 0; unsigned input_prim = si_get_input_prim(gs_sel, &shader->key); bool break_wave_at_eoi = false; @@ -1322,6 +1322,9 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader break_wave_at_eoi = true; } + /* Primitives with adjacency can only occur without tessellation. */ + assert(gs_info->gs_input_verts_per_prim <= 3 || es_stage == MESA_SHADER_VERTEX); + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and * VGPR[0:4] are always loaded. 
* @@ -4650,9 +4653,10 @@ static void si_emit_spi_map(struct si_context *sctx, unsigned index) spi_ps_input_cntl[i] = ps_input_cntl; } - /* R_028644_SPI_PS_INPUT_CNTL_0 */ - /* Dota 2: Only ~16% of SPI map updates set different values. */ - /* Talos: Only ~9% of SPI map updates set different values. */ + /* Performance notes: + * Dota 2: Only ~16% of SPI map updates set different values. + * Talos: Only ~9% of SPI map updates set different values. + */ radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP); diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 4fa0119135f..9e34b202d98 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -321,8 +321,6 @@ void si_emit_streamout_end(struct si_context *sctx) if (!t[i]) continue; - uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - if (sctx->gfx_level >= GFX11) { si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, t[i]->buf_filled_size, t[i]->buf_filled_size_offset, @@ -331,6 +329,8 @@ void si_emit_streamout_end(struct si_context *sctx) sctx->flags |= SI_CONTEXT_PFP_SYNC_ME; si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } else { + uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + radeon_begin(cs); radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 1210cf3995e..2c262aa3ed0 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -229,9 +229,8 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs 
si_clip_scissor(&final, scissor); radeon_begin(cs); - - /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_- - * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. + /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and + * any_scissor.BR_X/Y <= 0. */ if (ctx->gfx_level == GFX6 && (final.maxx == 0 || final.maxy == 0)) { radeon_emit(S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1)); @@ -246,8 +245,6 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs radeon_end(); } -#define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176 - static void si_emit_guardband(struct si_context *sctx, unsigned index) { const struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; @@ -284,6 +281,7 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index) const unsigned hw_screen_offset_alignment = sctx->gfx_level >= GFX11 ? 32 : sctx->gfx_level >= GFX8 ? 16 : MAX2(sctx->screen->se_tile_repeat, 16); + const unsigned max_hw_screen_offset = 8176; /* Indexed by quantization modes */ static int max_viewport_size[] = {65535, 16383, 4095}; @@ -295,8 +293,8 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index) assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] && vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]); - hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); - hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); + hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, max_hw_screen_offset); + hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, max_hw_screen_offset); /* Align the screen offset by dropping the low bits. 
*/ hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1); diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c index 71fb2298088..b13b8a80e1d 100644 --- a/src/gallium/drivers/radeonsi/si_texture.c +++ b/src/gallium/drivers/radeonsi/si_texture.c @@ -201,14 +201,12 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac bool is_flushed_depth, bool tc_compatible_htile) { const struct util_format_description *desc = util_format_description(ptex->format); - bool is_depth, is_stencil; + bool is_depth = util_format_has_depth(desc); + bool is_stencil = util_format_has_stencil(desc); int r; unsigned bpe; uint64_t flags = 0; - is_depth = util_format_has_depth(desc); - is_stencil = util_format_has_stencil(desc); - if (!is_flushed_depth && ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { bpe = 4; /* stencil is allocated separately */ } else {