diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
index 9d0506a5ccd..ab145e4943c 100644
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -118,6 +118,32 @@
    radeon_emit(value); \
 } while (0)
 
+#define radeon_push_gfx_sh_reg(reg, value) do { \
+   unsigned __i = sctx->num_buffered_gfx_sh_regs++; \
+   assert(__i / 2 < ARRAY_SIZE(sctx->buffered_gfx_sh_regs)); \
+   sctx->buffered_gfx_sh_regs[__i / 2].reg_offset[__i % 2] = ((reg) - SI_SH_REG_OFFSET) >> 2; \
+   sctx->buffered_gfx_sh_regs[__i / 2].reg_value[__i % 2] = value; \
+} while (0)
+
+#define radeon_set_or_push_gfx_sh_reg(reg, value) do { \
+   if (GFX_VERSION >= GFX11) { \
+      radeon_push_gfx_sh_reg(reg, value); \
+   } else { \
+      radeon_set_sh_reg_seq(reg, 1); \
+      radeon_emit(value); \
+   } \
+} while (0)
+
+#define radeon_opt_push_gfx_sh_reg(offset, reg, val) do { \
+   unsigned __value = val; \
+   if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \
+       sctx->tracked_regs.other_reg_value[reg] != __value) { \
+      radeon_push_gfx_sh_reg(offset, __value); \
+      sctx->tracked_regs.other_reg_saved_mask |= BITFIELD64_BIT(reg); \
+      sctx->tracked_regs.other_reg_value[reg] = __value; \
+   } \
+} while (0)
+
 #define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \
    assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \
    radeon_emit(PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \
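(Aside, not part of the patch itself: a minimal standalone sketch of the buffering that radeon_push_gfx_sh_reg performs. The struct mirrors si_sh_reg_pair from si_pipe.h further down with the union collapsed, SI_SH_REG_OFFSET is assumed to be the 0xB000 from sid.h, and the registers and values in main() are arbitrary examples.)

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   #define SI_SH_REG_OFFSET 0x0000B000 /* assumed start of the SH register range */
   #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

   /* Same layout as one pair in SET_SH_REG_PAIRS_PACKED*: 3 dwords. */
   struct si_sh_reg_pair {
      uint16_t reg_offset[2];
      uint32_t reg_value[2];
   };

   static unsigned num_buffered_gfx_sh_regs;
   static struct si_sh_reg_pair buffered_gfx_sh_regs[32];

   /* Stand-in for radeon_push_gfx_sh_reg: buffer one SH register write,
    * two registers per pair, instead of emitting a SET_SH_REG packet. */
   static void push_gfx_sh_reg(unsigned reg, uint32_t value)
   {
      unsigned i = num_buffered_gfx_sh_regs++;

      assert(i / 2 < ARRAY_SIZE(buffered_gfx_sh_regs));
      buffered_gfx_sh_regs[i / 2].reg_offset[i % 2] = (reg - SI_SH_REG_OFFSET) >> 2;
      buffered_gfx_sh_regs[i / 2].reg_value[i % 2] = value;
   }

   int main(void)
   {
      push_gfx_sh_reg(0xB030, 0x11111111); /* example: SPI_SHADER_USER_DATA_PS_0 */
      push_gfx_sh_reg(0xB034, 0x22222222); /* example: SPI_SHADER_USER_DATA_PS_1 */

      /* Both offsets end up in one dword; the two values follow it. */
      printf("offsets dword: 0x%08x\n",
             (unsigned)(buffered_gfx_sh_regs[0].reg_offset[0] |
                        ((uint32_t)buffered_gfx_sh_regs[0].reg_offset[1] << 16)));
      printf("values: 0x%08x 0x%08x\n",
             (unsigned)buffered_gfx_sh_regs[0].reg_value[0],
             (unsigned)buffered_gfx_sh_regs[0].reg_value[1]);
      return 0;
   }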
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 75136c4e2b1..8397e67de12 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -2160,16 +2160,25 @@ void si_shader_change_notify(struct si_context *sctx)
    if (sh_reg_base) { \
       unsigned mask = sctx->shader_pointers_dirty & (pointer_mask); \
 \
-      while (mask) { \
-         int start, count; \
-         u_bit_scan_consecutive_range(&mask, &start, &count); \
- \
-         struct si_descriptors *descs = &sctx->descriptors[start]; \
-         unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \
- \
-         radeon_set_sh_reg_seq(sh_offset, count); \
-         for (int i = 0; i < count; i++) \
-            radeon_emit_32bit_pointer(sctx->screen, descs[i].gpu_address); \
+      if (sctx->gfx_level >= GFX11 && sh_reg_base != R_00B900_COMPUTE_USER_DATA_0) { \
+         u_foreach_bit(i, mask) { \
+            struct si_descriptors *descs = &sctx->descriptors[i]; \
+            unsigned sh_reg = sh_reg_base + descs->shader_userdata_offset; \
+ \
+            radeon_push_gfx_sh_reg(sh_reg, descs->gpu_address); \
+         } \
+      } else { \
+         while (mask) { \
+            int start, count; \
+            u_bit_scan_consecutive_range(&mask, &start, &count); \
+ \
+            struct si_descriptors *descs = &sctx->descriptors[start]; \
+            unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \
+ \
+            radeon_set_sh_reg_seq(sh_offset, count); \
+            for (int i = 0; i < count; i++) \
+               radeon_emit_32bit_pointer(sctx->screen, descs[i].gpu_address); \
+         } \
       } \
    } \
 } while (0)
@@ -2179,9 +2188,12 @@ static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_de
    radeon_begin(&sctx->gfx_cs);
 
    if (sctx->gfx_level >= GFX11) {
-      radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
-      radeon_emit_one_32bit_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
-      radeon_emit_one_32bit_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+      radeon_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + descs->shader_userdata_offset,
+                             descs->gpu_address);
+      radeon_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + descs->shader_userdata_offset,
+                             descs->gpu_address);
+      radeon_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + descs->shader_userdata_offset,
+                             descs->gpu_address);
    } else if (sctx->gfx_level >= GFX10) {
       radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
       /* HW VS stage only used in non-NGG mode. */
@@ -2231,8 +2243,9 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
 
    if (sctx->gs_attribute_ring_pointer_dirty) {
       assert(sctx->gfx_level >= GFX11);
-      radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
-                        sctx->screen->attribute_ring->gpu_address);
+      radeon_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
+                             GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
+                             sctx->screen->attribute_ring->gpu_address);
       sctx->gs_attribute_ring_pointer_dirty = false;
    }
    radeon_end();
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 0f463af8063..4aca0e00e82 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -545,6 +545,9 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    ctx->last_tes_sh_base = -1;
    ctx->last_num_tcs_input_cp = -1;
 
+   assert(ctx->num_buffered_gfx_sh_regs == 0);
+   ctx->num_buffered_gfx_sh_regs = 0;
+
    if (ctx->scratch_buffer) {
       si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
       si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b125b1a41e7..8805508b502 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -910,6 +910,20 @@ struct si_vertex_state {
    uint32_t descriptors[4 * SI_MAX_ATTRIBS];
 };
 
+/* The structure layout is identical to a pair of registers in SET_*_REG_PAIRS_PACKED. */
+struct si_sh_reg_pair {
+   union {
+      /* A pair of register offsets. */
+      struct {
+         uint16_t reg_offset[2];
+      };
+      /* The same pair of register offsets as a dword. */
+      uint32_t reg_offsets;
+   };
+   /* A pair of register values for the register offsets above. */
+   uint32_t reg_value[2];
+};
+
 typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
                                    const struct pipe_draw_info *info,
                                    unsigned drawid_offset,
@@ -1017,6 +1031,9 @@ struct si_context {
    unsigned dirty_states;
    union si_state queued;
    union si_state emitted;
+   /* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_PACKED*. */
+   unsigned num_buffered_gfx_sh_regs;
+   struct si_sh_reg_pair buffered_gfx_sh_regs[32];
 
    /* Atom declarations. */
    struct si_framebuffer framebuffer;
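(Another aside, not part of the patch: the radeon_opt_push_gfx_sh_reg macro above skips redundant writes by shadowing each tracked register's last value. A rough model of that check; the field names follow the macro, but the mask width and array size are simplified here.)

   #include <stdbool.h>
   #include <stdint.h>
   #include <stdio.h>

   /* "reg" below is a small SI_TRACKED_* index, not a register offset. */
   struct tracked_regs {
      uint64_t other_reg_saved_mask;
      uint32_t other_reg_value[64];
   };

   static bool must_push(struct tracked_regs *t, unsigned reg, uint32_t value)
   {
      if ((t->other_reg_saved_mask & ((uint64_t)1 << reg)) &&
          t->other_reg_value[reg] == value)
         return false; /* same value was already emitted, skip the register */

      t->other_reg_saved_mask |= (uint64_t)1 << reg;
      t->other_reg_value[reg] = value;
      return true;
   }

   int main(void)
   {
      struct tracked_regs t = {0};

      printf("%d\n", must_push(&t, 5, 0xdeadbeef)); /* 1: first write goes through */
      printf("%d\n", must_push(&t, 5, 0xdeadbeef)); /* 0: redundant, skipped */
      printf("%d\n", must_push(&t, 5, 0xcafef00d)); /* 1: value changed */
      return 0;
   }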
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index bdb75c1232d..c7dcfae516d 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -855,7 +855,20 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx)
    if (!sctx->shader.tes.cso || !sctx->shader.tcs.current)
       return;
 
-   if (sctx->gfx_level >= GFX9) {
+   if (sctx->gfx_level >= GFX11) {
+      radeon_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
+                                 SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);
+
+      /* Set userdata SGPRs for merged LS-HS. */
+      radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+                                 GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
+                                 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
+                                 sctx->tcs_offchip_layout);
+      radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+                                 GFX9_SGPR_TCS_OFFCHIP_ADDR * 4,
+                                 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR,
+                                 sctx->tes_offchip_ring_va_sgpr);
+   } else if (sctx->gfx_level >= GFX9) {
       radeon_opt_set_sh_reg(sctx, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
                             SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);
 
@@ -890,9 +903,16 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx)
 
    /* These can't be optimized because the user data SGPRs may have different meaning
    * without tessellation. (they are VS and ES/GS user data SGPRs) */
-   radeon_set_sh_reg_seq(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
-   radeon_emit(sctx->tcs_offchip_layout);
-   radeon_emit(sctx->tes_offchip_ring_va_sgpr);
+   if (sctx->gfx_level >= GFX11) {
+      radeon_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
+                             sctx->tcs_offchip_layout);
+      radeon_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4,
+                             sctx->tes_offchip_ring_va_sgpr);
+   } else {
+      radeon_set_sh_reg_seq(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
+      radeon_emit(sctx->tcs_offchip_layout);
+      radeon_emit(sctx->tes_offchip_ring_va_sgpr);
+   }
    radeon_end();
 
    radeon_begin_again(cs);
@@ -1238,23 +1258,23 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
 
    radeon_begin(cs);
    if (HAS_GS) {
-      radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
+      radeon_set_or_push_gfx_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
 
       /* GS always uses the state bits for emulating VGT_ESGS_RING_ITEMSIZE on Gfx9
       * (via nir_load_esgs_vertex_stride_amd) and for emulating GS pipeline statistics
       * on gfx10.x. NGG GS also has lots of states in there.
       */
      if (GFX_VERSION >= GFX9)
-         radeon_set_sh_reg(gs_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
+         radeon_set_or_push_gfx_sh_reg(gs_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
 
       /* The GS copy shader (for legacy GS) always uses the state bits. */
       if (!NGG)
-         radeon_set_sh_reg(gs_copy_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
+         radeon_set_or_push_gfx_sh_reg(gs_copy_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
    } else if (HAS_TESS) {
-      radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
-      radeon_set_sh_reg(tes_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
+      radeon_set_or_push_gfx_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
+      radeon_set_or_push_gfx_sh_reg(tes_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
    } else {
-      radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
+      radeon_set_or_push_gfx_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
    }
    radeon_end();
 
@@ -1454,6 +1474,49 @@ static void si_emit_draw_registers(struct si_context *sctx,
    radeon_end();
 }
 
+static ALWAYS_INLINE void
+gfx11_emit_buffered_sh_regs_inline(struct si_context *sctx, unsigned *num_regs,
+                                   struct si_sh_reg_pair *reg_pairs)
+{
+   unsigned reg_count = *num_regs;
+
+   if (!reg_count)
+      return;
+
+   *num_regs = 0;
+
+   /* If there is only one register, we can't use the packed SET packet. */
+   if (reg_count == 1) {
+      radeon_begin(&sctx->gfx_cs);
+      radeon_emit(PKT3(PKT3_SET_SH_REG, 1, 0));
+      radeon_emit(reg_pairs[0].reg_offset[0]);
+      radeon_emit(reg_pairs[0].reg_value[0]);
+      radeon_end();
+      return;
+   }
+
+   unsigned packet = reg_count <= 14 ? PKT3_SET_SH_REG_PAIRS_PACKED_N :
+                                       PKT3_SET_SH_REG_PAIRS_PACKED;
+   unsigned padded_reg_count = align(reg_count, 2);
+
+   radeon_begin(&sctx->gfx_cs);
+   radeon_emit(PKT3(packet, (padded_reg_count / 2) * 3, 0) | PKT3_RESET_FILTER_CAM_S(1));
+   radeon_emit(padded_reg_count);
+   radeon_emit_array(reg_pairs, (reg_count / 2) * 3);
+
+   if (reg_count % 2 == 1) {
+      unsigned i = reg_count / 2;
+
+      /* Pad the packet by setting the first register again at the end because the register
+       * count must be even and 2 consecutive offsets must not be equal.
+       */
+      radeon_emit(reg_pairs[i].reg_offset[0] | ((uint32_t)reg_pairs[0].reg_offset[0] << 16));
+      radeon_emit(reg_pairs[i].reg_value[0]);
+      radeon_emit(reg_pairs[0].reg_value[0]);
+   }
+   radeon_end();
+}
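(One more aside, not part of the patch: what the payload built by gfx11_emit_buffered_sh_regs_inline looks like for an odd register count. The PKT3 header and the PAIRS_PACKED vs. PAIRS_PACKED_N opcode choice are omitted, the offsets and values are made up, and the pair layout is the one from si_pipe.h.)

   #include <stdint.h>
   #include <stdio.h>

   /* Layout of one packed pair, as in si_pipe.h. */
   struct sh_reg_pair {
      uint16_t reg_offset[2];
      uint32_t reg_value[2];
   };

   /* Print the payload dwords the way gfx11_emit_buffered_sh_regs_inline emits them:
    * the register count padded to an even number, then 3 dwords per pair.
    * The PKT3 header is left out here. */
   static void print_packed_payload(const struct sh_reg_pair *pairs, unsigned reg_count)
   {
      unsigned padded = (reg_count + 1) & ~1u;

      printf("0x%08x (padded register count)\n", padded);

      for (unsigned i = 0; i < reg_count / 2; i++) {
         printf("0x%08x (offsets of regs %u and %u)\n",
                (unsigned)(pairs[i].reg_offset[0] | ((uint32_t)pairs[i].reg_offset[1] << 16)),
                i * 2, i * 2 + 1);
         printf("0x%08x\n0x%08x\n",
                (unsigned)pairs[i].reg_value[0], (unsigned)pairs[i].reg_value[1]);
      }

      if (reg_count % 2 == 1) {
         unsigned i = reg_count / 2;

         /* Pad with the first register again: the count must be even and two
          * consecutive offsets must not be equal (same rule as in the patch). */
         printf("0x%08x (last offset + first offset as padding)\n",
                (unsigned)(pairs[i].reg_offset[0] | ((uint32_t)pairs[0].reg_offset[0] << 16)));
         printf("0x%08x\n0x%08x\n",
                (unsigned)pairs[i].reg_value[0], (unsigned)pairs[0].reg_value[0]);
      }
   }

   int main(void)
   {
      struct sh_reg_pair pairs[2] = {
         { { 0x0C, 0x0D }, { 0x11111111, 0x22222222 } },
         { { 0x0E, 0 },    { 0x33333333, 0 } },
      };

      print_packed_payload(pairs, 3); /* 3 registers -> padded to 4 */
      return 0;
   }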
+
 #define EMIT_SQTT_END_DRAW \
    do { \
       if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled)) { \
@@ -1605,6 +1668,13 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
 
       assert(indirect_va % 8 == 0);
 
+      if (GFX_VERSION >= GFX11) {
+         radeon_end();
+         gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_gfx_sh_regs,
+                                            sctx->buffered_gfx_sh_regs);
+         radeon_begin_again(cs);
+      }
+
       si_invalidate_draw_constants(sctx);
 
       radeon_emit(PKT3(PKT3_SET_BASE, 2, 0));
@@ -1674,43 +1744,80 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
    bool set_draw_id = !IS_DRAW_VERTEX_STATE && sctx->vs_uses_draw_id;
    bool set_base_instance = sctx->vs_uses_base_instance;
+   bool is_blit = !IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs;
 
-   if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) {
+   if (!is_blit) {
+      /* Prefer SET_SH_REG_PAIRS_PACKED* on Gfx11+. */
+      if (GFX_VERSION >= GFX11) {
+         if (base_vertex != sctx->last_base_vertex ||
+             sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN) {
+            radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex);
+            sctx->last_base_vertex = base_vertex;
+         }
+
+         if (set_draw_id &&
+             (drawid_base != sctx->last_drawid ||
+              sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) {
+            radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, drawid_base);
+            sctx->last_drawid = drawid_base;
+         }
+
+         if (set_base_instance &&
+             (info->start_instance != sctx->last_start_instance ||
+              sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) {
+            radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_START_INSTANCE * 4,
+                                   info->start_instance);
+            sctx->last_start_instance = info->start_instance;
+         }
+      } else if (base_vertex != sctx->last_base_vertex ||
+                 sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
+                 (set_base_instance &&
+                  (info->start_instance != sctx->last_start_instance ||
+                   sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) ||
+                 (set_draw_id &&
+                  (drawid_base != sctx->last_drawid ||
+                   sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) ||
+                 sh_base_reg != sctx->last_sh_base_reg) {
+         if (set_base_instance) {
+            radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
+            radeon_emit(base_vertex);
+            radeon_emit(drawid_base);
+            radeon_emit(info->start_instance);
+
+            sctx->last_start_instance = info->start_instance;
+            sctx->last_drawid = drawid_base;
+         } else if (set_draw_id) {
+            radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
+            radeon_emit(base_vertex);
+            radeon_emit(drawid_base);
+
+            sctx->last_drawid = drawid_base;
+         } else {
+            radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex);
+         }
+
+         sctx->last_base_vertex = base_vertex;
+         sctx->last_sh_base_reg = sh_base_reg;
+      }
+   }
+
+   if (GFX_VERSION >= GFX11) {
+      radeon_end();
+      gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_gfx_sh_regs,
+                                         sctx->buffered_gfx_sh_regs);
+      radeon_begin_again(cs);
+   }
+
+   /* Blit SGPRs must be set after gfx11_emit_buffered_sh_regs_inline because the buffered
+    * registers could overwrite them.
+    */
+   if (is_blit) {
       /* Re-emit draw constants after we leave u_blitter. */
       si_invalidate_draw_sh_constants(sctx);
 
       /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */
       radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs);
       radeon_emit_array(sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs);
-   } else if (base_vertex != sctx->last_base_vertex ||
-              sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
-              (set_base_instance &&
-               (info->start_instance != sctx->last_start_instance ||
-                sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) ||
-              (set_draw_id &&
-               (drawid_base != sctx->last_drawid ||
-                sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) ||
-              sh_base_reg != sctx->last_sh_base_reg) {
-      if (set_base_instance) {
-         radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
-         radeon_emit(base_vertex);
-         radeon_emit(drawid_base);
-         radeon_emit(info->start_instance);
-
-         sctx->last_start_instance = info->start_instance;
-         sctx->last_drawid = drawid_base;
-      } else if (set_draw_id) {
-         radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
-         radeon_emit(base_vertex);
-         radeon_emit(drawid_base);
-
-         sctx->last_drawid = drawid_base;
-      } else {
-         radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex);
-      }
-
-      sctx->last_base_vertex = base_vertex;
-      sctx->last_sh_base_reg = sh_base_reg;
    }
 
    /* Don't update draw_id in the following code if it doesn't increment. */
@@ -2018,7 +2125,7 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
 
          unsigned vb_desc_offset = sh_base + get_vb_descriptor_sgpr_ptr_offset();
 
-         radeon_set_sh_reg(vb_desc_offset, vb_descriptors_address);
+         radeon_set_or_push_gfx_sh_reg(vb_desc_offset, vb_descriptors_address);
 
          /* the first iteration always executes */
          do {
@@ -2072,7 +2179,7 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
       unsigned vb_desc_ptr_offset = sh_base + get_vb_descriptor_sgpr_ptr_offset();
 
       radeon_begin(&sctx->gfx_cs);
-      radeon_set_sh_reg(vb_desc_ptr_offset, vb_descriptors_address);
+      radeon_set_or_push_gfx_sh_reg(vb_desc_ptr_offset, vb_descriptors_address);
       radeon_end();
    }
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 0b6e522b87c..b6ac8445c75 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -1186,12 +1186,22 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
    radeon_begin_again(&sctx->gfx_cs);
    radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
                               shader->ngg.ge_pc_alloc);
-   radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                              SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
-                              shader->ngg.spi_shader_pgm_rsrc3_gs);
-   radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                              SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
-                              shader->ngg.spi_shader_pgm_rsrc4_gs);
+   if (sctx->gfx_level >= GFX11) {
+      assert(!sctx->screen->info.uses_kernel_cu_mask);
+      radeon_opt_push_gfx_sh_reg(R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                                 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+                                 shader->ngg.spi_shader_pgm_rsrc3_gs);
+      radeon_opt_push_gfx_sh_reg(R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                                 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+                                 shader->ngg.spi_shader_pgm_rsrc4_gs);
+   } else {
+      radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                                 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+                                 shader->ngg.spi_shader_pgm_rsrc3_gs);
+      radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                                 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+                                 shader->ngg.spi_shader_pgm_rsrc4_gs);
+   }
    radeon_end();
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index 88dd30e205a..d924c69f026 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -90,10 +90,17 @@ static void si_emit_cull_state(struct si_context *sctx)
    /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf,
                              RADEON_USAGE_READ | RADEON_PRIO_CONST_BUFFER);
-   radeon_begin(&sctx->gfx_cs);
-   radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4,
-                     sctx->small_prim_cull_info_address);
-   radeon_end();
+
+   if (sctx->gfx_level >= GFX11) {
+      radeon_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
+                             GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4,
+                             sctx->small_prim_cull_info_address);
+   } else {
+      radeon_begin(&sctx->gfx_cs);
+      radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4,
+                        sctx->small_prim_cull_info_address);
+      radeon_end();
+   }
 
    /* Better subpixel precision increases the efficiency of small
    * primitive culling. (more precision means a tighter bounding box