radv: remove precomputed registers from radv_shader_binary

It is sufficient to compute them after the shader has been uploaded.
This saves some disk space and eliminates an unlikely bug that could
occur when the shader cache is shared between two GPUs with the same
chip but a different number of enabled CUs.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38970>
Daniel Schürmann 2025-12-16 12:39:50 +01:00 committed by Marge Bot
parent 61287b00f3
commit 125ac1626d
5 changed files with 344 additions and 339 deletions
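For orientation, here is a minimal sketch of the flow after this change. It is simplified from the diff below: the signature is approximate, unrelated fields and error handling are omitted, and alloc_and_upload() is a hypothetical placeholder for the real allocation/upload path. The point is that the precomputed register values now live on the runtime radv_shader object and are derived only after the binary has been uploaded, instead of being stored in (and cached with) radv_shader_binary.

struct radv_shader {
   /* ... */
   struct radv_shader_info info; /* carried in radv_shader_binary, i.e. in the shader cache */
   struct radv_shader_regs regs; /* precomputed per device at load time, never cached */
   /* ... */
};

static VkResult
radv_shader_create_uncached(struct radv_device *device, const struct radv_shader_binary *binary,
                            /* ... */ struct radv_shader **out_shader)
{
   /* hypothetical helper standing in for the real allocation + upload code */
   struct radv_shader *shader = alloc_and_upload(device, binary);

   /* Precompute register values for faster emission. Previously this ran in
    * radv_postprocess_binary_config(), i.e. before the binary was written to
    * the on-disk shader cache. */
   radv_precompute_registers(device, shader);

   *out_shader = shader;
   return VK_SUCCESS;
}

Because shader->regs is now derived on the device that actually loads the binary, values that depend on the enabled-CU configuration (the CU masks applied via ac_apply_cu_en()) can no longer be taken from a cache entry produced on a GPU with a different number of enabled CUs.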


@ -35,9 +35,9 @@ radv_sqtt_emit_relocated_shaders(struct radv_cmd_buffer *cmd_buffer, struct radv
/* Shaders are allocated in the 32-bit addr space and high bits are already configured. */
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(shader->info.regs.pgm_lo, reloc->va[s] >> 8);
gfx12_push_sh_reg(shader->regs.pgm_lo, reloc->va[s] >> 8);
} else {
radeon_set_sh_reg(shader->info.regs.pgm_lo, reloc->va[s] >> 8);
radeon_set_sh_reg(shader->regs.pgm_lo, reloc->va[s] >> 8);
}
radeon_end();
}
@ -49,9 +49,9 @@ radv_sqtt_emit_relocated_shaders(struct radv_cmd_buffer *cmd_buffer, struct radv
radeon_begin(ace_cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(task_shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(task_shader->regs.pgm_lo, va >> 8);
} else {
radeon_set_sh_reg(task_shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg(task_shader->regs.pgm_lo, va >> 8);
}
radeon_end();
}


@ -2874,11 +2874,11 @@ radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer)
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
if (pgm_rsrc1)
gfx12_push_sh_reg(ps_shader->info.regs.pgm_rsrc1, pgm_rsrc1);
gfx12_push_sh_reg(ps_shader->regs.pgm_rsrc1, pgm_rsrc1);
gfx12_push_32bit_pointer(epilog_pc_offset, ps_epilog->va, &pdev->info);
} else {
if (pgm_rsrc1)
radeon_set_sh_reg(ps_shader->info.regs.pgm_rsrc1, pgm_rsrc1);
radeon_set_sh_reg(ps_shader->regs.pgm_rsrc1, pgm_rsrc1);
radeon_emit_32bit_pointer(epilog_pc_offset, ps_epilog->va, &pdev->info);
}
radeon_end();
@ -2892,27 +2892,27 @@ radv_emit_compute_shader(const struct radv_physical_device *pdev, struct radv_cm
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
gfx12_push_sh_reg(shader->info.regs.pgm_rsrc2, shader->config.rsrc2);
gfx12_push_sh_reg(shader->info.regs.pgm_rsrc3, shader->config.rsrc3);
gfx12_push_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS, shader->info.regs.cs.compute_resource_limits);
gfx12_push_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X, shader->info.regs.cs.compute_num_thread_x);
gfx12_push_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y, shader->info.regs.cs.compute_num_thread_y);
gfx12_push_sh_reg(R_00B824_COMPUTE_NUM_THREAD_Z, shader->info.regs.cs.compute_num_thread_z);
gfx12_push_sh_reg(shader->regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(shader->regs.pgm_rsrc1, shader->config.rsrc1);
gfx12_push_sh_reg(shader->regs.pgm_rsrc2, shader->config.rsrc2);
gfx12_push_sh_reg(shader->regs.pgm_rsrc3, shader->config.rsrc3);
gfx12_push_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS, shader->regs.cs.compute_resource_limits);
gfx12_push_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X, shader->regs.cs.compute_num_thread_x);
gfx12_push_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y, shader->regs.cs.compute_num_thread_y);
gfx12_push_sh_reg(R_00B824_COMPUTE_NUM_THREAD_Z, shader->regs.cs.compute_num_thread_z);
} else {
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg_seq(shader->info.regs.pgm_rsrc1, 2);
radeon_set_sh_reg(shader->regs.pgm_lo, va >> 8);
radeon_set_sh_reg_seq(shader->regs.pgm_rsrc1, 2);
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
if (pdev->info.gfx_level >= GFX10)
radeon_set_sh_reg(shader->info.regs.pgm_rsrc3, shader->config.rsrc3);
radeon_set_sh_reg(shader->regs.pgm_rsrc3, shader->config.rsrc3);
radeon_set_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS, shader->info.regs.cs.compute_resource_limits);
radeon_set_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS, shader->regs.cs.compute_resource_limits);
radeon_set_sh_reg_seq(R_00B81C_COMPUTE_NUM_THREAD_X, 3);
radeon_emit(shader->info.regs.cs.compute_num_thread_x);
radeon_emit(shader->info.regs.cs.compute_num_thread_y);
radeon_emit(shader->info.regs.cs.compute_num_thread_z);
radeon_emit(shader->regs.cs.compute_num_thread_x);
radeon_emit(shader->regs.cs.compute_num_thread_y);
radeon_emit(shader->regs.cs.compute_num_thread_z);
}
radeon_end();
}
@ -2952,34 +2952,32 @@ radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *sh
const uint64_t va = radv_shader_get_va(shader);
radeon_begin(cs);
radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4);
radeon_set_sh_reg_seq(shader->regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B124_MEM_BASE(va >> 40));
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
radeon_opt_set_context_reg(R_0286C4_SPI_VS_OUT_CONFIG, RADV_TRACKED_SPI_VS_OUT_CONFIG,
shader->info.regs.spi_vs_out_config);
shader->regs.spi_vs_out_config);
radeon_opt_set_context_reg(R_02870C_SPI_SHADER_POS_FORMAT, RADV_TRACKED_SPI_SHADER_POS_FORMAT,
shader->info.regs.spi_shader_pos_format);
shader->regs.spi_shader_pos_format);
radeon_opt_set_context_reg(R_02881C_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
shader->info.regs.pa_cl_vs_out_cntl);
shader->regs.pa_cl_vs_out_cntl);
if (pdev->info.gfx_level <= GFX8)
radeon_opt_set_context_reg(R_028AB4_VGT_REUSE_OFF, RADV_TRACKED_VGT_REUSE_OFF,
shader->info.regs.vs.vgt_reuse_off);
radeon_opt_set_context_reg(R_028AB4_VGT_REUSE_OFF, RADV_TRACKED_VGT_REUSE_OFF, shader->regs.vs.vgt_reuse_off);
if (pdev->info.gfx_level >= GFX7) {
radeon_set_sh_reg_idx(&pdev->info, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
shader->info.regs.vs.spi_shader_pgm_rsrc3_vs);
radeon_set_sh_reg(R_00B11C_SPI_SHADER_LATE_ALLOC_VS, shader->info.regs.vs.spi_shader_late_alloc_vs);
radeon_set_sh_reg_idx(&pdev->info, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3, shader->regs.vs.spi_shader_pgm_rsrc3_vs);
radeon_set_sh_reg(R_00B11C_SPI_SHADER_LATE_ALLOC_VS, shader->regs.vs.spi_shader_late_alloc_vs);
if (pdev->info.gfx_level >= GFX10) {
radeon_set_uconfig_reg(R_030980_GE_PC_ALLOC, shader->info.regs.ge_pc_alloc);
radeon_set_uconfig_reg(R_030980_GE_PC_ALLOC, shader->regs.ge_pc_alloc);
if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
radeon_opt_set_context_reg(R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
shader->info.regs.vgt_gs_onchip_cntl);
shader->regs.vgt_gs_onchip_cntl);
}
}
}
@ -2998,7 +2996,7 @@ radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *sh
assert(pdev->info.gfx_level < GFX11);
radeon_begin(cs);
radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4);
radeon_set_sh_reg_seq(shader->regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B324_MEM_BASE(va >> 40));
radeon_emit(shader->config.rsrc1);
@ -3016,11 +3014,11 @@ radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *sh
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
gfx12_push_sh_reg(shader->regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(shader->regs.pgm_rsrc1, shader->config.rsrc1);
} else {
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
radeon_set_sh_reg(shader->regs.pgm_lo, va >> 8);
radeon_set_sh_reg(shader->regs.pgm_rsrc1, shader->config.rsrc1);
}
radeon_end();
}
@ -3048,13 +3046,13 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e
if (!shader->info.merged_shader_compiled_separately) {
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
gfx12_push_sh_reg(shader->info.regs.pgm_rsrc2, shader->config.rsrc2);
gfx12_push_sh_reg(R_00B220_SPI_SHADER_PGM_RSRC4_GS, shader->info.regs.spi_shader_pgm_rsrc4_gs);
gfx12_push_sh_reg(shader->regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(shader->regs.pgm_rsrc1, shader->config.rsrc1);
gfx12_push_sh_reg(shader->regs.pgm_rsrc2, shader->config.rsrc2);
gfx12_push_sh_reg(R_00B220_SPI_SHADER_PGM_RSRC4_GS, shader->regs.spi_shader_pgm_rsrc4_gs);
} else {
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg_seq(shader->info.regs.pgm_rsrc1, 2);
radeon_set_sh_reg(shader->regs.pgm_lo, va >> 8);
radeon_set_sh_reg_seq(shader->regs.pgm_rsrc1, 2);
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
}
@ -3075,58 +3073,58 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e
radeon_begin(cs);
gfx12_begin_context_regs();
gfx12_opt_set_context_reg(R_028818_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
shader->info.regs.pa_cl_vs_out_cntl);
shader->regs.pa_cl_vs_out_cntl);
gfx12_opt_set_context_reg(R_028B3C_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
shader->info.regs.vgt_gs_instance_cnt);
shader->regs.vgt_gs_instance_cnt);
gfx12_opt_set_context_reg2(R_028648_SPI_SHADER_IDX_FORMAT, RADV_TRACKED_SPI_SHADER_IDX_FORMAT,
shader->info.regs.ngg.spi_shader_idx_format, shader->info.regs.spi_shader_pos_format);
shader->regs.ngg.spi_shader_idx_format, shader->regs.spi_shader_pos_format);
gfx12_opt_set_context_reg(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, RADV_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
shader->info.regs.ngg.ge_max_output_per_subgroup);
shader->regs.ngg.ge_max_output_per_subgroup);
gfx12_opt_set_context_reg(R_028B4C_GE_NGG_SUBGRP_CNTL, RADV_TRACKED_GE_NGG_SUBGRP_CNTL,
shader->info.regs.ngg.ge_ngg_subgrp_cntl);
shader->regs.ngg.ge_ngg_subgrp_cntl);
gfx12_end_context_regs();
radeon_end();
} else if (pdev->info.has_set_context_pairs_packed) {
radeon_begin(cs);
gfx11_begin_packed_context_regs();
gfx11_opt_set_context_reg(R_02881C_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
shader->info.regs.pa_cl_vs_out_cntl);
shader->regs.pa_cl_vs_out_cntl);
gfx11_opt_set_context_reg(R_028B90_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
shader->info.regs.vgt_gs_instance_cnt);
shader->regs.vgt_gs_instance_cnt);
gfx11_opt_set_context_reg(R_028A84_VGT_PRIMITIVEID_EN, RADV_TRACKED_VGT_PRIMITIVEID_EN,
shader->info.regs.ngg.vgt_primitiveid_en | S_028A84_PRIMITIVEID_EN(es_enable_prim_id));
shader->regs.ngg.vgt_primitiveid_en | S_028A84_PRIMITIVEID_EN(es_enable_prim_id));
gfx11_opt_set_context_reg2(R_028708_SPI_SHADER_IDX_FORMAT, RADV_TRACKED_SPI_SHADER_IDX_FORMAT,
shader->info.regs.ngg.spi_shader_idx_format, shader->info.regs.spi_shader_pos_format);
shader->regs.ngg.spi_shader_idx_format, shader->regs.spi_shader_pos_format);
gfx11_opt_set_context_reg(R_0286C4_SPI_VS_OUT_CONFIG, RADV_TRACKED_SPI_VS_OUT_CONFIG,
shader->info.regs.spi_vs_out_config);
shader->regs.spi_vs_out_config);
gfx11_opt_set_context_reg(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, RADV_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
shader->info.regs.ngg.ge_max_output_per_subgroup);
shader->regs.ngg.ge_max_output_per_subgroup);
gfx11_opt_set_context_reg(R_028B4C_GE_NGG_SUBGRP_CNTL, RADV_TRACKED_GE_NGG_SUBGRP_CNTL,
shader->info.regs.ngg.ge_ngg_subgrp_cntl);
shader->regs.ngg.ge_ngg_subgrp_cntl);
gfx11_end_packed_context_regs();
radeon_end();
} else {
radeon_begin(cs);
radeon_opt_set_context_reg(R_02881C_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
shader->info.regs.pa_cl_vs_out_cntl);
shader->regs.pa_cl_vs_out_cntl);
radeon_opt_set_context_reg(R_028B90_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
shader->info.regs.vgt_gs_instance_cnt);
shader->regs.vgt_gs_instance_cnt);
radeon_opt_set_context_reg(R_028A84_VGT_PRIMITIVEID_EN, RADV_TRACKED_VGT_PRIMITIVEID_EN,
shader->info.regs.ngg.vgt_primitiveid_en | S_028A84_PRIMITIVEID_EN(es_enable_prim_id));
shader->regs.ngg.vgt_primitiveid_en | S_028A84_PRIMITIVEID_EN(es_enable_prim_id));
radeon_opt_set_context_reg2(R_028708_SPI_SHADER_IDX_FORMAT, RADV_TRACKED_SPI_SHADER_IDX_FORMAT,
shader->info.regs.ngg.spi_shader_idx_format, shader->info.regs.spi_shader_pos_format);
shader->regs.ngg.spi_shader_idx_format, shader->regs.spi_shader_pos_format);
radeon_opt_set_context_reg(R_0286C4_SPI_VS_OUT_CONFIG, RADV_TRACKED_SPI_VS_OUT_CONFIG,
shader->info.regs.spi_vs_out_config);
shader->regs.spi_vs_out_config);
radeon_opt_set_context_reg(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, RADV_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
shader->info.regs.ngg.ge_max_output_per_subgroup);
shader->regs.ngg.ge_max_output_per_subgroup);
radeon_opt_set_context_reg(R_028B4C_GE_NGG_SUBGRP_CNTL, RADV_TRACKED_GE_NGG_SUBGRP_CNTL,
shader->info.regs.ngg.ge_ngg_subgrp_cntl);
shader->regs.ngg.ge_ngg_subgrp_cntl);
radeon_end();
}
radeon_begin(cs);
uint32_t ge_cntl = shader->info.regs.ngg.ge_cntl;
uint32_t ge_cntl = shader->regs.ngg.ge_cntl;
if (pdev->info.gfx_level >= GFX11) {
ge_cntl |= S_03096C_BREAK_PRIMGRP_AT_EOI(break_wave_at_eoi);
} else {
@ -3146,7 +3144,7 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e
}
radeon_opt_set_context_reg(R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
shader->info.regs.vgt_gs_onchip_cntl);
shader->regs.vgt_gs_onchip_cntl);
}
radeon_set_uconfig_reg(R_03096C_GE_CNTL, ge_cntl);
@ -3156,19 +3154,17 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e
assert(!(shader->info.ngg_info.esgs_ring_size & 0xffff0000));
if (pdev->info.gfx_level >= GFX12) {
radeon_set_uconfig_reg(R_030988_VGT_PRIMITIVEID_EN, shader->info.regs.ngg.vgt_primitiveid_en);
radeon_set_uconfig_reg(R_030988_VGT_PRIMITIVEID_EN, shader->regs.ngg.vgt_primitiveid_en);
gfx12_push_sh_reg(ngg_lds_layout_offset,
SET_SGPR_FIELD(NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE, shader->info.ngg_info.esgs_ring_size));
} else {
if (pdev->info.gfx_level >= GFX7) {
radeon_set_sh_reg_idx(&pdev->info, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
shader->info.regs.spi_shader_pgm_rsrc3_gs);
radeon_set_sh_reg_idx(&pdev->info, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, shader->regs.spi_shader_pgm_rsrc3_gs);
}
radeon_set_sh_reg_idx(&pdev->info, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
shader->info.regs.spi_shader_pgm_rsrc4_gs);
radeon_set_sh_reg_idx(&pdev->info, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, shader->regs.spi_shader_pgm_rsrc4_gs);
radeon_set_uconfig_reg(R_030980_GE_PC_ALLOC, shader->info.regs.ge_pc_alloc);
radeon_set_uconfig_reg(R_030980_GE_PC_ALLOC, shader->regs.ge_pc_alloc);
radeon_set_sh_reg(ngg_lds_layout_offset,
SET_SGPR_FIELD(NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE, shader->info.ngg_info.esgs_ring_size));
@ -3187,14 +3183,14 @@ radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *sh
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
gfx12_push_sh_reg(shader->regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(shader->regs.pgm_rsrc1, shader->config.rsrc1);
} else {
if (pdev->info.gfx_level >= GFX9) {
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
radeon_set_sh_reg(shader->regs.pgm_lo, va >> 8);
radeon_set_sh_reg(shader->regs.pgm_rsrc1, shader->config.rsrc1);
} else {
radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4);
radeon_set_sh_reg_seq(shader->regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B424_MEM_BASE(va >> 40));
radeon_emit(shader->config.rsrc1);
@ -3233,23 +3229,23 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer)
gfx12_push_32bit_pointer(next_stage_pc_offset, next_stage->va, &pdev->info);
if (!vs->info.vs.has_prolog) {
gfx12_push_sh_reg(vs->info.regs.pgm_lo, vs->va >> 8);
gfx12_push_sh_reg(vs->regs.pgm_lo, vs->va >> 8);
if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
gfx12_push_sh_reg(vs->info.regs.pgm_rsrc1, rsrc1);
gfx12_push_sh_reg(vs->regs.pgm_rsrc1, rsrc1);
} else {
gfx12_push_sh_reg(vs->info.regs.pgm_rsrc1, rsrc1);
gfx12_push_sh_reg(vs->info.regs.pgm_rsrc2, rsrc2);
gfx12_push_sh_reg(vs->regs.pgm_rsrc1, rsrc1);
gfx12_push_sh_reg(vs->regs.pgm_rsrc2, rsrc2);
}
}
} else {
radeon_emit_32bit_pointer(next_stage_pc_offset, next_stage->va, &pdev->info);
if (!vs->info.vs.has_prolog) {
radeon_set_sh_reg(vs->info.regs.pgm_lo, vs->va >> 8);
radeon_set_sh_reg(vs->regs.pgm_lo, vs->va >> 8);
if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
radeon_set_sh_reg(vs->info.regs.pgm_rsrc1, rsrc1);
radeon_set_sh_reg(vs->regs.pgm_rsrc1, rsrc1);
} else {
radeon_set_sh_reg_seq(vs->info.regs.pgm_rsrc1, 2);
radeon_set_sh_reg_seq(vs->regs.pgm_rsrc1, 2);
radeon_emit(rsrc1);
radeon_emit(rsrc2);
}
@ -3304,13 +3300,13 @@ radv_emit_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer)
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(tes->info.regs.pgm_lo, tes->va >> 8);
gfx12_push_sh_reg(tes->info.regs.pgm_rsrc1, rsrc1);
gfx12_push_sh_reg(tes->info.regs.pgm_rsrc2, rsrc2);
gfx12_push_sh_reg(tes->regs.pgm_lo, tes->va >> 8);
gfx12_push_sh_reg(tes->regs.pgm_rsrc1, rsrc1);
gfx12_push_sh_reg(tes->regs.pgm_rsrc2, rsrc2);
gfx12_push_32bit_pointer(next_stage_pc_offset, gs->va, &pdev->info);
} else {
radeon_set_sh_reg(tes->info.regs.pgm_lo, tes->va >> 8);
radeon_set_sh_reg_seq(tes->info.regs.pgm_rsrc1, 2);
radeon_set_sh_reg(tes->regs.pgm_lo, tes->va >> 8);
radeon_set_sh_reg_seq(tes->regs.pgm_rsrc1, 2);
radeon_emit(rsrc1);
radeon_emit(rsrc2);
radeon_emit_32bit_pointer(next_stage_pc_offset, gs->va, &pdev->info);
@ -3340,38 +3336,38 @@ radv_emit_hw_gs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *gs
radeon_begin(cs);
radeon_opt_set_context_reg3(R_028A60_VGT_GSVS_RING_OFFSET_1, RADV_TRACKED_VGT_GSVS_RING_OFFSET_1,
gs->info.regs.gs.vgt_gsvs_ring_offset[0], gs->info.regs.gs.vgt_gsvs_ring_offset[1],
gs->info.regs.gs.vgt_gsvs_ring_offset[2]);
gs->regs.gs.vgt_gsvs_ring_offset[0], gs->regs.gs.vgt_gsvs_ring_offset[1],
gs->regs.gs.vgt_gsvs_ring_offset[2]);
radeon_opt_set_context_reg(R_028AB0_VGT_GSVS_RING_ITEMSIZE, RADV_TRACKED_VGT_GSVS_RING_ITEMSIZE,
gs->info.regs.gs.vgt_gsvs_ring_itemsize);
gs->regs.gs.vgt_gsvs_ring_itemsize);
radeon_opt_set_context_reg4(R_028B5C_VGT_GS_VERT_ITEMSIZE, RADV_TRACKED_VGT_GS_VERT_ITEMSIZE,
gs->info.regs.gs.vgt_gs_vert_itemsize[0], gs->info.regs.gs.vgt_gs_vert_itemsize[1],
gs->info.regs.gs.vgt_gs_vert_itemsize[2], gs->info.regs.gs.vgt_gs_vert_itemsize[3]);
gs->regs.gs.vgt_gs_vert_itemsize[0], gs->regs.gs.vgt_gs_vert_itemsize[1],
gs->regs.gs.vgt_gs_vert_itemsize[2], gs->regs.gs.vgt_gs_vert_itemsize[3]);
radeon_opt_set_context_reg(R_028B90_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
gs->info.regs.gs.vgt_gs_instance_cnt);
gs->regs.gs.vgt_gs_instance_cnt);
if (pdev->info.gfx_level >= GFX9) {
if (!gs->info.merged_shader_compiled_separately) {
radeon_set_sh_reg(gs->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg(gs->regs.pgm_lo, va >> 8);
radeon_set_sh_reg_seq(gs->info.regs.pgm_rsrc1, 2);
radeon_set_sh_reg_seq(gs->regs.pgm_rsrc1, 2);
radeon_emit(gs->config.rsrc1);
radeon_emit(gs->config.rsrc2 | S_00B22C_LDS_SIZE(ac_shader_encode_lds_size(
gs_state->lds_size, pdev->info.gfx_level, MESA_SHADER_GEOMETRY)));
}
radeon_opt_set_context_reg(R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
gs->info.regs.vgt_gs_onchip_cntl);
gs->regs.vgt_gs_onchip_cntl);
if (pdev->info.gfx_level == GFX9) {
radeon_opt_set_context_reg(R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, RADV_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
gs->info.regs.gs.vgt_gs_max_prims_per_subgroup);
gs->regs.gs.vgt_gs_max_prims_per_subgroup);
}
} else {
radeon_set_sh_reg_seq(gs->info.regs.pgm_lo, 4);
radeon_set_sh_reg_seq(gs->regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B224_MEM_BASE(va >> 40));
radeon_emit(gs->config.rsrc1);
@ -3381,15 +3377,15 @@ radv_emit_hw_gs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *gs
* GFX9+: Only used to set the GS input VGPRs, emulated in shaders.
*/
radeon_opt_set_context_reg(R_028AAC_VGT_ESGS_RING_ITEMSIZE, RADV_TRACKED_VGT_ESGS_RING_ITEMSIZE,
gs->info.regs.gs.vgt_esgs_ring_itemsize);
gs->regs.gs.vgt_esgs_ring_itemsize);
}
if (pdev->info.gfx_level >= GFX7) {
radeon_set_sh_reg_idx(&pdev->info, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, gs->info.regs.spi_shader_pgm_rsrc3_gs);
radeon_set_sh_reg_idx(&pdev->info, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, gs->regs.spi_shader_pgm_rsrc3_gs);
}
if (pdev->info.gfx_level >= GFX10) {
radeon_set_sh_reg_idx(&pdev->info, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, gs->info.regs.spi_shader_pgm_rsrc4_gs);
radeon_set_sh_reg_idx(&pdev->info, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, gs->regs.spi_shader_pgm_rsrc4_gs);
}
radeon_end();
@ -3414,7 +3410,7 @@ radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer)
radeon_begin(cs);
radeon_opt_set_context_reg(R_028B38_VGT_GS_MAX_VERT_OUT, RADV_TRACKED_VGT_GS_MAX_VERT_OUT,
gs->info.regs.vgt_gs_max_vert_out);
gs->regs.vgt_gs_max_vert_out);
if (gs->info.merged_shader_compiled_separately) {
const uint32_t vgt_esgs_ring_itemsize_offset = radv_get_user_sgpr_loc(gs, AC_UD_VGT_ESGS_RING_ITEMSIZE);
@ -3454,13 +3450,13 @@ radv_gfx11_emit_meshlet(struct radv_cmd_buffer *cmd_buffer, const struct radv_sh
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, ms->info.regs.ngg.ms.spi_shader_gs_meshlet_dim);
gfx12_push_sh_reg(R_00B2B4_SPI_SHADER_GS_MESHLET_EXP_ALLOC, ms->info.regs.ngg.ms.spi_shader_gs_meshlet_exp_alloc);
gfx12_push_sh_reg(R_00B2B8_SPI_SHADER_GS_MESHLET_CTRL, ms->info.regs.ngg.ms.spi_shader_gs_meshlet_ctrl);
gfx12_push_sh_reg(R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, ms->regs.ngg.ms.spi_shader_gs_meshlet_dim);
gfx12_push_sh_reg(R_00B2B4_SPI_SHADER_GS_MESHLET_EXP_ALLOC, ms->regs.ngg.ms.spi_shader_gs_meshlet_exp_alloc);
gfx12_push_sh_reg(R_00B2B8_SPI_SHADER_GS_MESHLET_CTRL, ms->regs.ngg.ms.spi_shader_gs_meshlet_ctrl);
} else {
radeon_set_sh_reg_seq(R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, 2);
radeon_emit(ms->info.regs.ngg.ms.spi_shader_gs_meshlet_dim);
radeon_emit(ms->info.regs.ngg.ms.spi_shader_gs_meshlet_exp_alloc);
radeon_emit(ms->regs.ngg.ms.spi_shader_gs_meshlet_dim);
radeon_emit(ms->regs.ngg.ms.spi_shader_gs_meshlet_exp_alloc);
}
radeon_end();
}
@ -3479,7 +3475,7 @@ radv_emit_mesh_shader(struct radv_cmd_buffer *cmd_buffer)
radeon_begin(cs);
radeon_opt_set_context_reg(R_028B38_VGT_GS_MAX_VERT_OUT, RADV_TRACKED_VGT_GS_MAX_VERT_OUT,
ms->info.regs.vgt_gs_max_vert_out);
ms->regs.vgt_gs_max_vert_out);
radeon_set_uconfig_reg_idx(&pdev->info, R_030908_VGT_PRIMITIVE_TYPE, 1, V_008958_DI_PT_POINTLIST);
radeon_end();
@ -3628,7 +3624,7 @@ radv_emit_ps_inputs(struct radv_cmd_buffer *cmd_buffer)
*/
const unsigned num_per_vertex_params = ps->info.ps.num_inputs - num_per_primitive_params;
radeon_opt_set_context_reg(R_0286D8_SPI_PS_IN_CONTROL, RADV_TRACKED_SPI_PS_IN_CONTROL,
ps->info.regs.ps.spi_ps_in_control | S_0286D8_NUM_INTERP(num_per_vertex_params) |
ps->regs.ps.spi_ps_in_control | S_0286D8_NUM_INTERP(num_per_vertex_params) |
S_0286D8_NUM_PRIM_INTERP(num_per_primitive_params));
}
@ -3646,12 +3642,12 @@ radv_emit_fragment_shader_state(struct radv_cmd_buffer *cmd_buffer, const struct
const struct radv_physical_device *pdev = radv_device_physical(device);
const uint32_t spi_ps_input_ena = ps ? ps->config.spi_ps_input_ena : 0;
const uint32_t spi_ps_input_addr = ps ? ps->config.spi_ps_input_addr : 0;
const uint32_t spi_ps_in_control = ps ? ps->info.regs.ps.spi_ps_in_control : 0;
const uint32_t spi_ps_in_control = ps ? ps->regs.ps.spi_ps_in_control : 0;
struct radv_cmd_stream *cs = cmd_buffer->cs;
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
const uint32_t pa_sc_hisz_control = ps ? ps->info.regs.ps.pa_sc_hisz_control : 0;
const uint32_t pa_sc_hisz_control = ps ? ps->regs.ps.pa_sc_hisz_control : 0;
gfx12_begin_context_regs();
gfx12_opt_set_context_reg2(R_02865C_SPI_PS_INPUT_ENA, RADV_TRACKED_SPI_PS_INPUT_ENA, spi_ps_input_ena,
@ -3668,7 +3664,7 @@ radv_emit_fragment_shader_state(struct radv_cmd_buffer *cmd_buffer, const struct
gfx11_opt_set_context_reg(R_0286D8_SPI_PS_IN_CONTROL, RADV_TRACKED_SPI_PS_IN_CONTROL, spi_ps_in_control);
gfx11_end_packed_context_regs();
} else {
const uint32_t pa_sc_shader_control = ps ? ps->info.regs.ps.pa_sc_shader_control : 0;
const uint32_t pa_sc_shader_control = ps ? ps->regs.ps.pa_sc_shader_control : 0;
radeon_opt_set_context_reg2(R_0286CC_SPI_PS_INPUT_ENA, RADV_TRACKED_SPI_PS_INPUT_ENA, spi_ps_input_ena,
spi_ps_input_addr);
@ -3709,11 +3705,11 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer)
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(ps->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(ps->info.regs.pgm_rsrc1, ps->config.rsrc1);
gfx12_push_sh_reg(ps->info.regs.pgm_rsrc2, ps->config.rsrc2);
gfx12_push_sh_reg(ps->regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(ps->regs.pgm_rsrc1, ps->config.rsrc1);
gfx12_push_sh_reg(ps->regs.pgm_rsrc2, ps->config.rsrc2);
} else {
radeon_set_sh_reg_seq(ps->info.regs.pgm_lo, 4);
radeon_set_sh_reg_seq(ps->regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B024_MEM_BASE(va >> 40));
radeon_emit(ps->config.rsrc1);
@ -3956,10 +3952,10 @@ radv_emit_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
if (pdev->info.gfx_level >= GFX12) {
const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
uint32_t gs_out_config_ps = last_vgt_shader->info.regs.spi_vs_out_config;
uint32_t gs_out_config_ps = last_vgt_shader->regs.spi_vs_out_config;
if (ps) {
gs_out_config_ps |= ps->info.regs.ps.spi_gs_out_config_ps;
gs_out_config_ps |= ps->regs.ps.spi_gs_out_config_ps;
} else {
/* GFX12 seems to require a dummy FS state otherwise it might just hang. */
radv_emit_fragment_shader_state(cmd_buffer, NULL);
@ -6084,15 +6080,15 @@ emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(vs_shader->info.regs.pgm_lo, prolog->va >> 8);
gfx12_push_sh_reg(vs_shader->info.regs.pgm_rsrc1, rsrc1);
gfx12_push_sh_reg(vs_shader->regs.pgm_lo, prolog->va >> 8);
gfx12_push_sh_reg(vs_shader->regs.pgm_rsrc1, rsrc1);
if (vs_shader->info.merged_shader_compiled_separately)
gfx12_push_sh_reg(vs_shader->info.regs.pgm_rsrc2, rsrc2);
gfx12_push_sh_reg(vs_shader->regs.pgm_rsrc2, rsrc2);
} else {
radeon_set_sh_reg(vs_shader->info.regs.pgm_lo, prolog->va >> 8);
radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc1, rsrc1);
radeon_set_sh_reg(vs_shader->regs.pgm_lo, prolog->va >> 8);
radeon_set_sh_reg(vs_shader->regs.pgm_rsrc1, rsrc1);
if (vs_shader->info.merged_shader_compiled_separately)
radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc2, rsrc2);
radeon_set_sh_reg(vs_shader->regs.pgm_rsrc2, rsrc2);
}
radeon_end();
@ -8378,7 +8374,7 @@ radv_bind_fragment_output_state(struct radv_cmd_buffer *cmd_buffer, const struct
if (ps) {
col_format = ps_epilog ? ps_epilog->spi_shader_col_format : ps->info.ps.spi_shader_col_format;
z_format = ps_epilog && ps->info.ps.exports_mrtz_via_epilog ? ps_epilog->spi_shader_z_format
: ps->info.regs.ps.spi_shader_z_format;
: ps->regs.ps.spi_shader_z_format;
cb_shader_mask = ps_epilog ? ps_epilog->cb_shader_mask : ps->info.ps.cb_shader_mask;
}
@ -8636,7 +8632,7 @@ radv_bind_fragment_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_BINNING_STATE;
}
if (!previous_ps || previous_ps->info.regs.ps.db_shader_control != ps->info.regs.ps.db_shader_control ||
if (!previous_ps || previous_ps->regs.ps.db_shader_control != ps->regs.ps.db_shader_control ||
previous_ps->info.ps.pops_is_per_sample != ps->info.ps.pops_is_per_sample)
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
@ -11582,18 +11578,18 @@ radv_emit_tcs_tes_state(struct radv_cmd_buffer *cmd_buffer)
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2);
gfx12_push_sh_reg(tcs->regs.pgm_rsrc2, pgm_hs_rsrc2);
if (tcs_offchip_layout || tes_offchip_layout) {
gfx12_push_sh_reg(tcs_offchip_layout_offset, tcs_offchip_layout);
gfx12_push_sh_reg(tes_offchip_layout_offset, tes_offchip_layout);
}
} else {
if (pdev->info.gfx_level >= GFX9) {
radeon_set_sh_reg(tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2);
radeon_set_sh_reg(tcs->regs.pgm_rsrc2, pgm_hs_rsrc2);
} else {
const uint32_t ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(lds_alloc);
radeon_set_sh_reg(vs->info.regs.pgm_rsrc2, ls_rsrc2);
radeon_set_sh_reg(vs->regs.pgm_rsrc2, ls_rsrc2);
}
if (tcs_offchip_layout || tes_offchip_layout) {
@ -11705,7 +11701,7 @@ radv_emit_db_shader_control(struct radv_cmd_buffer *cmd_buffer)
uint32_t db_shader_control;
if (ps) {
db_shader_control = ps->info.regs.ps.db_shader_control;
db_shader_control = ps->regs.ps.db_shader_control;
} else {
db_shader_control = S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z) |
S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
@ -12787,10 +12783,10 @@ radv_bind_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
if (cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.is_ngg) {
gfx10_ngg_set_esgs_ring_itemsize(device, &es->info, &gs->info, &gs->info.ngg_info);
gfx10_get_ngg_info(device, &es->info, &gs->info, &gs->info.ngg_info);
radv_precompute_registers_hw_ngg(device, &gs->config, &gs->info);
radv_precompute_registers_hw_ngg(device, gs);
} else {
radv_get_legacy_gs_info(device, &es->info, &gs->info);
radv_precompute_registers_hw_gs(device, &es->info, &gs->info);
radv_precompute_registers_hw_gs(device, &es->info, gs);
cmd_buffer->esgs_ring_size_needed =
MAX2(cmd_buffer->esgs_ring_size_needed, gs->info.legacy_gs_info.esgs_ring_size);
@ -13824,9 +13820,9 @@ radv_emit_rt_stack_size(struct radv_cmd_buffer *cmd_buffer)
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(rt_prolog->info.regs.pgm_rsrc2, rsrc2);
gfx12_push_sh_reg(rt_prolog->regs.pgm_rsrc2, rsrc2);
} else {
radeon_set_sh_reg(rt_prolog->info.regs.pgm_rsrc2, rsrc2);
radeon_set_sh_reg(rt_prolog->regs.pgm_rsrc2, rsrc2);
}
radeon_end();
}


@ -1567,7 +1567,7 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader_binar
#endif
static unsigned
radv_get_num_pos_exports(struct radv_shader_info *info, unsigned *clip_dist_mask, unsigned *cull_dist_mask)
radv_get_num_pos_exports(const struct radv_shader_info *info, unsigned *clip_dist_mask, unsigned *cull_dist_mask)
{
unsigned num = 1;
@ -1592,21 +1592,22 @@ radv_get_num_pos_exports(struct radv_shader_info *info, unsigned *clip_dist_mask
}
static void
radv_precompute_registers_hw_vs(struct radv_device *device, struct radv_shader_binary *binary)
radv_precompute_registers_hw_vs(struct radv_device *device, struct radv_shader *shader)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_shader_info *info = &binary->info;
const struct radv_shader_info *info = &shader->info;
struct radv_shader_regs *regs = &shader->regs;
unsigned clip_dist_mask, cull_dist_mask;
unsigned num_pos_exports = radv_get_num_pos_exports(info, &clip_dist_mask, &cull_dist_mask);
/* VS is required to export at least one param. */
const uint32_t nparams = MAX2(info->outinfo.param_exports, 1);
info->regs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
regs->spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
if (pdev->info.gfx_level >= GFX10) {
info->regs.spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(info->outinfo.param_exports == 0);
regs->spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(info->outinfo.param_exports == 0);
}
info->regs.spi_shader_pos_format =
regs->spi_shader_pos_format =
S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
S_02870C_POS1_EXPORT_FORMAT(num_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE) |
S_02870C_POS2_EXPORT_FORMAT(num_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE) |
@ -1616,7 +1617,7 @@ radv_precompute_registers_hw_vs(struct radv_device *device, struct radv_shader_b
info->outinfo.writes_viewport_index || info->outinfo.writes_primitive_shading_rate;
const unsigned total_mask = clip_dist_mask | cull_dist_mask;
info->regs.pa_cl_vs_out_cntl =
regs->pa_cl_vs_out_cntl =
S_02881C_USE_VTX_POINT_SIZE(info->outinfo.writes_pointsize) |
S_02881C_USE_VTX_RENDER_TARGET_INDX(info->outinfo.writes_layer) |
S_02881C_USE_VTX_VIEWPORT_INDX(info->outinfo.writes_viewport_index) |
@ -1627,26 +1628,25 @@ radv_precompute_registers_hw_vs(struct radv_device *device, struct radv_shader_b
S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) | total_mask << 8 | clip_dist_mask;
if (pdev->info.gfx_level <= GFX8)
info->regs.vs.vgt_reuse_off = info->outinfo.writes_viewport_index;
regs->vs.vgt_reuse_off = info->outinfo.writes_viewport_index;
unsigned late_alloc_wave64, cu_mask;
ac_compute_late_alloc(&pdev->info, false, false, binary->config.scratch_bytes_per_wave > 0, &late_alloc_wave64,
ac_compute_late_alloc(&pdev->info, false, false, shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64,
&cu_mask);
if (pdev->info.gfx_level >= GFX7) {
info->regs.vs.spi_shader_pgm_rsrc3_vs =
regs->vs.spi_shader_pgm_rsrc3_vs =
ac_apply_cu_en(S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F), C_00B118_CU_EN, 0, &pdev->info);
info->regs.vs.spi_shader_late_alloc_vs = S_00B11C_LIMIT(late_alloc_wave64);
regs->vs.spi_shader_late_alloc_vs = S_00B11C_LIMIT(late_alloc_wave64);
if (pdev->info.gfx_level >= GFX10) {
const uint32_t oversub_pc_lines = late_alloc_wave64 ? pdev->info.pc_lines / 4 : 0;
info->regs.ge_pc_alloc =
S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
regs->ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
/* Required programming for tessellation (legacy pipeline only). */
if (binary->info.stage == MESA_SHADER_TESS_EVAL) {
info->regs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(250) | S_028A44_GS_PRIMS_PER_SUBGRP(126) |
if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
regs->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(250) | S_028A44_GS_PRIMS_PER_SUBGRP(126) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(126);
}
}
@ -1654,16 +1654,19 @@ radv_precompute_registers_hw_vs(struct radv_device *device, struct radv_shader_b
}
void
radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_info *es_info, struct radv_shader_info *gs_info)
radv_precompute_registers_hw_gs(struct radv_device *device, const struct radv_shader_info *es_info,
struct radv_shader *shader)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_shader_info *gs_info = &shader->info;
struct radv_shader_regs *regs = &shader->regs;
gs_info->regs.gs.vgt_esgs_ring_itemsize = es_info ? es_info->esgs_itemsize / 4 : gs_info->legacy_gs_info.esgs_itemsize /4;
regs->gs.vgt_esgs_ring_itemsize = es_info ? es_info->esgs_itemsize / 4 : gs_info->legacy_gs_info.esgs_itemsize / 4;
gs_info->regs.gs.vgt_gs_max_prims_per_subgroup =
regs->gs.vgt_gs_max_prims_per_subgroup =
S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info->legacy_gs_info.gs_inst_prims_in_subgroup);
gs_info->regs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(gs_info->legacy_gs_info.es_verts_per_subgroup) |
regs->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(gs_info->legacy_gs_info.es_verts_per_subgroup) |
S_028A44_GS_PRIMS_PER_SUBGRP(gs_info->legacy_gs_info.gs_prims_per_subgroup) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info->legacy_gs_info.gs_inst_prims_in_subgroup);
@ -1675,44 +1678,45 @@ radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_i
const uint8_t *num_components = gs_info->gs.num_components_per_stream;
uint32_t offset = num_components[0] * gs_max_out_vertices;
gs_info->regs.gs.vgt_gsvs_ring_offset[0] = offset;
regs->gs.vgt_gsvs_ring_offset[0] = offset;
if (max_stream >= 1)
offset += num_components[1] * gs_max_out_vertices;
gs_info->regs.gs.vgt_gsvs_ring_offset[1] = offset;
regs->gs.vgt_gsvs_ring_offset[1] = offset;
if (max_stream >= 2)
offset += num_components[2] * gs_max_out_vertices;
gs_info->regs.gs.vgt_gsvs_ring_offset[2] = offset;
regs->gs.vgt_gsvs_ring_offset[2] = offset;
if (max_stream >= 3)
offset += num_components[3] * gs_max_out_vertices;
gs_info->regs.gs.vgt_gsvs_ring_itemsize = offset;
regs->gs.vgt_gsvs_ring_itemsize = offset;
for (uint32_t i = 0; i < 4; i++)
gs_info->regs.gs.vgt_gs_vert_itemsize[i] = (max_stream >= i) ? num_components[i] : 0;
regs->gs.vgt_gs_vert_itemsize[i] = (max_stream >= i) ? num_components[i] : 0;
const uint32_t gs_num_invocations = gs_info->gs.invocations;
gs_info->regs.gs.vgt_gs_instance_cnt =
S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
regs->gs.vgt_gs_instance_cnt = S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
gs_info->regs.spi_shader_pgm_rsrc3_gs =
regs->spi_shader_pgm_rsrc3_gs =
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F), C_00B21C_CU_EN, 0, &pdev->info);
if (pdev->info.gfx_level >= GFX10) {
gs_info->regs.spi_shader_pgm_rsrc4_gs =
regs->spi_shader_pgm_rsrc4_gs =
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0), C_00B204_CU_EN_GFX10,
16, &pdev->info);
}
gs_info->regs.vgt_gs_max_vert_out = gs_info->gs.vertices_out;
regs->vgt_gs_max_vert_out = gs_info->gs.vertices_out;
}
void
radv_precompute_registers_hw_ngg(struct radv_device *device, const struct ac_shader_config *config,
struct radv_shader_info *info)
radv_precompute_registers_hw_ngg(struct radv_device *device, struct radv_shader *shader)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_shader_info *info = &shader->info;
const struct ac_shader_config *config = &shader->config;
struct radv_shader_regs *regs = &shader->regs;
const bool no_pc_export = info->outinfo.param_exports == 0 && info->outinfo.prim_param_exports == 0;
const unsigned num_prim_params = info->outinfo.prim_param_exports;
@ -1720,30 +1724,30 @@ radv_precompute_registers_hw_ngg(struct radv_device *device, const struct ac_sha
if (pdev->info.gfx_level >= GFX12) {
const unsigned num_params = MAX2(info->outinfo.param_exports, 1);
info->regs.spi_vs_out_config = S_00B0C4_VS_EXPORT_COUNT(num_params - 1) |
S_00B0C4_PRIM_EXPORT_COUNT(num_prim_params) | S_00B0C4_NO_PC_EXPORT(no_pc_export);
regs->spi_vs_out_config = S_00B0C4_VS_EXPORT_COUNT(num_params - 1) | S_00B0C4_PRIM_EXPORT_COUNT(num_prim_params) |
S_00B0C4_NO_PC_EXPORT(no_pc_export);
info->regs.spi_shader_pgm_rsrc4_gs =
regs->spi_shader_pgm_rsrc4_gs =
S_00B220_SPI_SHADER_LATE_ALLOC_GS(127) | S_00B220_GLG_FORCE_DISABLE(1) | S_00B220_WAVE_LIMIT(0x3ff);
} else {
const unsigned num_params = MAX2(info->outinfo.param_exports, 1);
info->regs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(num_params - 1) |
S_0286C4_PRIM_EXPORT_COUNT(num_prim_params) | S_0286C4_NO_PC_EXPORT(no_pc_export);
regs->spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(num_params - 1) | S_0286C4_PRIM_EXPORT_COUNT(num_prim_params) |
S_0286C4_NO_PC_EXPORT(no_pc_export);
unsigned late_alloc_wave64, cu_mask;
ac_compute_late_alloc(&pdev->info, true, info->has_ngg_culling, config->scratch_bytes_per_wave > 0,
&late_alloc_wave64, &cu_mask);
info->regs.spi_shader_pgm_rsrc3_gs =
regs->spi_shader_pgm_rsrc3_gs =
ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F), C_00B21C_CU_EN, 0, &pdev->info);
if (pdev->info.gfx_level >= GFX11) {
info->regs.spi_shader_pgm_rsrc4_gs =
regs->spi_shader_pgm_rsrc4_gs =
ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
C_00B204_CU_EN_GFX11, 16, &pdev->info);
} else {
info->regs.spi_shader_pgm_rsrc4_gs =
regs->spi_shader_pgm_rsrc4_gs =
ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
C_00B204_CU_EN_GFX10, 16, &pdev->info);
}
@ -1760,7 +1764,7 @@ radv_precompute_registers_hw_ngg(struct radv_device *device, const struct ac_sha
oversub_pc_lines *= oversub_factor;
}
info->regs.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
regs->ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
}
unsigned idx_format = V_028708_SPI_SHADER_1COMP;
@ -1770,9 +1774,9 @@ radv_precompute_registers_hw_ngg(struct radv_device *device, const struct ac_sha
unsigned clip_dist_mask, cull_dist_mask;
unsigned num_pos_exports = radv_get_num_pos_exports(info, &clip_dist_mask, &cull_dist_mask);
info->regs.ngg.spi_shader_idx_format = S_028708_IDX0_EXPORT_FORMAT(idx_format);
regs->ngg.spi_shader_idx_format = S_028708_IDX0_EXPORT_FORMAT(idx_format);
info->regs.spi_shader_pos_format =
regs->spi_shader_pos_format =
S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
S_02870C_POS1_EXPORT_FORMAT(num_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE) |
S_02870C_POS2_EXPORT_FORMAT(num_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE) |
@ -1782,7 +1786,7 @@ radv_precompute_registers_hw_ngg(struct radv_device *device, const struct ac_sha
info->outinfo.writes_viewport_index || info->outinfo.writes_primitive_shading_rate;
const unsigned total_mask = clip_dist_mask | cull_dist_mask;
info->regs.pa_cl_vs_out_cntl =
regs->pa_cl_vs_out_cntl =
S_02881C_USE_VTX_POINT_SIZE(info->outinfo.writes_pointsize) |
S_02881C_USE_VTX_RENDER_TARGET_INDX(info->outinfo.writes_layer) |
S_02881C_USE_VTX_VIEWPORT_INDX(info->outinfo.writes_viewport_index) |
@ -1792,71 +1796,72 @@ radv_precompute_registers_hw_ngg(struct radv_device *device, const struct ac_sha
S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) | total_mask << 8 | clip_dist_mask;
info->regs.ngg.vgt_primitiveid_en =
regs->ngg.vgt_primitiveid_en =
S_028A84_NGG_DISABLE_PROVOK_REUSE(info->stage == MESA_SHADER_VERTEX && info->outinfo.export_prim_id);
const uint32_t gs_num_invocations = info->stage == MESA_SHADER_GEOMETRY ? info->gs.invocations : 1;
info->regs.ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(info->ngg_info.max_out_verts);
regs->ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(info->ngg_info.max_out_verts);
info->regs.ngg.ge_ngg_subgrp_cntl =
regs->ngg.ge_ngg_subgrp_cntl =
S_028B4C_PRIM_AMP_FACTOR(info->ngg_info.prim_amp_factor) | S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */
info->regs.vgt_gs_instance_cnt =
S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
regs->vgt_gs_instance_cnt = S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(info->ngg_info.max_vert_out_per_gs_instance);
if (pdev->info.gfx_level >= GFX11) {
/* This should be <= 252 for performance on Gfx11. 256 works too but is slower. */
const uint32_t max_prim_grp_size = pdev->info.gfx_level >= GFX12 ? 256 : 252;
info->regs.ngg.ge_cntl = S_03096C_PRIMS_PER_SUBGRP(info->ngg_info.max_gsprims) |
regs->ngg.ge_cntl = S_03096C_PRIMS_PER_SUBGRP(info->ngg_info.max_gsprims) |
S_03096C_VERTS_PER_SUBGRP(info->ngg_info.hw_max_esverts) |
S_03096C_PRIM_GRP_SIZE_GFX11(max_prim_grp_size) |
S_03096C_DIS_PG_SIZE_ADJUST_FOR_STRIP(pdev->info.gfx_level >= GFX12);
} else {
info->regs.ngg.ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(info->ngg_info.max_gsprims) |
regs->ngg.ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(info->ngg_info.max_gsprims) |
S_03096C_VERT_GRP_SIZE(info->ngg_info.hw_max_esverts);
info->regs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(info->ngg_info.hw_max_esverts) |
regs->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(info->ngg_info.hw_max_esverts) |
S_028A44_GS_PRIMS_PER_SUBGRP(info->ngg_info.max_gsprims) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(info->ngg_info.max_gsprims * gs_num_invocations);
}
info->regs.vgt_gs_max_vert_out = info->gs.vertices_out;
regs->vgt_gs_max_vert_out = info->gs.vertices_out;
}
static void
radv_precompute_registers_hw_ms(struct radv_device *device, struct radv_shader_binary *binary)
radv_precompute_registers_hw_ms(struct radv_device *device, struct radv_shader *shader)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_shader_info *info = &binary->info;
const struct radv_shader_info *info = &shader->info;
struct radv_shader_regs *regs = &shader->regs;
radv_precompute_registers_hw_ngg(device, &binary->config, &binary->info);
radv_precompute_registers_hw_ngg(device, shader);
info->regs.vgt_gs_max_vert_out = pdev->info.mesh_fast_launch_2 ? info->ngg_info.max_out_verts : info->workgroup_size;
regs->vgt_gs_max_vert_out = pdev->info.mesh_fast_launch_2 ? info->ngg_info.max_out_verts : info->workgroup_size;
info->regs.ngg.ms.spi_shader_gs_meshlet_dim = S_00B2B0_MESHLET_NUM_THREAD_X(info->cs.block_size[0] - 1) |
regs->ngg.ms.spi_shader_gs_meshlet_dim = S_00B2B0_MESHLET_NUM_THREAD_X(info->cs.block_size[0] - 1) |
S_00B2B0_MESHLET_NUM_THREAD_Y(info->cs.block_size[1] - 1) |
S_00B2B0_MESHLET_NUM_THREAD_Z(info->cs.block_size[2] - 1) |
S_00B2B0_MESHLET_THREADGROUP_SIZE(info->workgroup_size - 1);
info->regs.ngg.ms.spi_shader_gs_meshlet_exp_alloc =
regs->ngg.ms.spi_shader_gs_meshlet_exp_alloc =
S_00B2B4_MAX_EXP_VERTS(info->ngg_info.max_out_verts) | S_00B2B4_MAX_EXP_PRIMS(info->ngg_info.prim_amp_factor);
if (pdev->info.gfx_level >= GFX12) {
const bool derivative_group_quads = info->cs.derivative_group == DERIVATIVE_GROUP_QUADS;
info->regs.ngg.ms.spi_shader_gs_meshlet_ctrl =
regs->ngg.ms.spi_shader_gs_meshlet_ctrl =
S_00B2B8_INTERLEAVE_BITS_X(derivative_group_quads) | S_00B2B8_INTERLEAVE_BITS_Y(derivative_group_quads);
}
}
static void
radv_precompute_registers_hw_fs(struct radv_device *device, struct radv_shader_binary *binary)
radv_precompute_registers_hw_fs(struct radv_device *device, struct radv_shader *shader)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_shader_info *info = &binary->info;
const struct radv_shader_info *info = &shader->info;
struct radv_shader_regs *regs = &shader->regs;
unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
if (info->ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
@ -1873,7 +1878,7 @@ radv_precompute_registers_hw_fs(struct radv_device *device, struct radv_shader_b
const bool mask_export_enable = info->ps.writes_sample_mask;
const bool disable_rbplus = pdev->info.has_rbplus && !pdev->info.rbplus_allowed;
info->regs.ps.db_shader_control =
regs->ps.db_shader_control =
S_02880C_Z_EXPORT_ENABLE(info->ps.writes_z) | S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(info->ps.writes_stencil) |
S_02880C_KILL_ENABLE(info->ps.can_discard) | S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) |
@ -1883,62 +1888,64 @@ radv_precompute_registers_hw_fs(struct radv_device *device, struct radv_shader_b
S_02880C_DUAL_QUAD_DISABLE(disable_rbplus) | S_02880C_PRIMITIVE_ORDERED_PIXEL_SHADER(info->ps.pops);
if (pdev->info.gfx_level >= GFX12) {
info->regs.ps.spi_ps_in_control = S_028640_PS_W32_EN(info->wave_size == 32);
info->regs.ps.spi_gs_out_config_ps = S_00B0C4_NUM_INTERP(info->ps.num_inputs);
regs->ps.spi_ps_in_control = S_028640_PS_W32_EN(info->wave_size == 32);
regs->ps.spi_gs_out_config_ps = S_00B0C4_NUM_INTERP(info->ps.num_inputs);
info->regs.ps.pa_sc_hisz_control = S_028BBC_ROUND(2); /* required minimum value */
regs->ps.pa_sc_hisz_control = S_028BBC_ROUND(2); /* required minimum value */
if (info->ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
info->regs.ps.pa_sc_hisz_control |= S_028BBC_CONSERVATIVE_Z_EXPORT(V_028BBC_EXPORT_GREATER_THAN_Z);
regs->ps.pa_sc_hisz_control |= S_028BBC_CONSERVATIVE_Z_EXPORT(V_028BBC_EXPORT_GREATER_THAN_Z);
else if (info->ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
info->regs.ps.pa_sc_hisz_control |= S_028BBC_CONSERVATIVE_Z_EXPORT(V_028BBC_EXPORT_LESS_THAN_Z);
regs->ps.pa_sc_hisz_control |= S_028BBC_CONSERVATIVE_Z_EXPORT(V_028BBC_EXPORT_LESS_THAN_Z);
} else {
/* GFX11 workaround when there are no PS inputs but LDS is used. */
const bool param_gen = pdev->info.gfx_level == GFX11 && !info->ps.num_inputs && binary->config.lds_size;
const bool param_gen = pdev->info.gfx_level == GFX11 && !info->ps.num_inputs && shader->config.lds_size;
info->regs.ps.spi_ps_in_control = S_0286D8_PS_W32_EN(info->wave_size == 32) | S_0286D8_PARAM_GEN(param_gen);
regs->ps.spi_ps_in_control = S_0286D8_PS_W32_EN(info->wave_size == 32) | S_0286D8_PARAM_GEN(param_gen);
/* Can't precompute NUM_INTERP on GFX10.3 because per-primititve attributes
* are tracked separately in NUM_PRIM_INTERP.
*/
if (pdev->info.gfx_level != GFX10_3) {
info->regs.ps.spi_ps_in_control |= S_0286D8_NUM_INTERP(info->ps.num_inputs);
regs->ps.spi_ps_in_control |= S_0286D8_NUM_INTERP(info->ps.num_inputs);
}
if (pdev->info.gfx_level >= GFX9 && pdev->info.gfx_level < GFX11)
info->regs.ps.pa_sc_shader_control = S_028C40_LOAD_COLLISION_WAVEID(info->ps.pops);
regs->ps.pa_sc_shader_control = S_028C40_LOAD_COLLISION_WAVEID(info->ps.pops);
}
info->regs.ps.spi_shader_z_format = ac_get_spi_shader_z_format(
info->ps.writes_z, info->ps.writes_stencil, info->ps.writes_sample_mask, info->ps.writes_mrt0_alpha);
regs->ps.spi_shader_z_format = ac_get_spi_shader_z_format(info->ps.writes_z, info->ps.writes_stencil,
info->ps.writes_sample_mask, info->ps.writes_mrt0_alpha);
}
static void
radv_precompute_registers_hw_cs(struct radv_device *device, struct radv_shader_binary *binary)
radv_precompute_registers_hw_cs(struct radv_device *device, struct radv_shader *shader)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_shader_info *info = &binary->info;
const struct radv_shader_info *info = &shader->info;
struct radv_shader_regs *regs = &shader->regs;
info->regs.cs.compute_resource_limits = radv_get_compute_resource_limits(pdev, info);
regs->cs.compute_resource_limits = radv_get_compute_resource_limits(pdev, info);
if (pdev->info.gfx_level >= GFX12) {
info->regs.cs.compute_num_thread_x = S_00B81C_NUM_THREAD_FULL_GFX12(info->cs.block_size[0]);
info->regs.cs.compute_num_thread_y = S_00B820_NUM_THREAD_FULL_GFX12(info->cs.block_size[1]);
regs->cs.compute_num_thread_x = S_00B81C_NUM_THREAD_FULL_GFX12(info->cs.block_size[0]);
regs->cs.compute_num_thread_y = S_00B820_NUM_THREAD_FULL_GFX12(info->cs.block_size[1]);
if (info->cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
info->regs.cs.compute_num_thread_x |= S_00B81C_INTERLEAVE_BITS_X(1);
info->regs.cs.compute_num_thread_y |= S_00B820_INTERLEAVE_BITS_Y(1);
regs->cs.compute_num_thread_x |= S_00B81C_INTERLEAVE_BITS_X(1);
regs->cs.compute_num_thread_y |= S_00B820_INTERLEAVE_BITS_Y(1);
}
} else {
info->regs.cs.compute_num_thread_x = S_00B81C_NUM_THREAD_FULL_GFX6(info->cs.block_size[0]);
info->regs.cs.compute_num_thread_y = S_00B820_NUM_THREAD_FULL_GFX6(info->cs.block_size[1]);
regs->cs.compute_num_thread_x = S_00B81C_NUM_THREAD_FULL_GFX6(info->cs.block_size[0]);
regs->cs.compute_num_thread_y = S_00B820_NUM_THREAD_FULL_GFX6(info->cs.block_size[1]);
}
info->regs.cs.compute_num_thread_z = S_00B824_NUM_THREAD_FULL(info->cs.block_size[2]);
regs->cs.compute_num_thread_z = S_00B824_NUM_THREAD_FULL(info->cs.block_size[2]);
}
static void
radv_precompute_registers_pgm(const struct radv_device *device, struct radv_shader_info *info)
radv_precompute_registers_pgm(const struct radv_device *device, struct radv_shader *shader)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
const struct radv_shader_info *info = &shader->info;
enum ac_hw_stage hw_stage = radv_select_hw_stage(info, gfx_level);
/* Special case for merged shaders compiled separately with ESO on GFX9+. */
@ -1951,73 +1958,74 @@ radv_precompute_registers_pgm(const struct radv_device *device, struct radv_shad
}
}
struct radv_shader_regs *regs = &shader->regs;
switch (hw_stage) {
case AC_HW_NEXT_GEN_GEOMETRY_SHADER:
assert(gfx_level >= GFX10);
if (gfx_level >= GFX12) {
info->regs.pgm_lo = R_00B224_SPI_SHADER_PGM_LO_ES;
regs->pgm_lo = R_00B224_SPI_SHADER_PGM_LO_ES;
} else {
info->regs.pgm_lo = R_00B320_SPI_SHADER_PGM_LO_ES;
regs->pgm_lo = R_00B320_SPI_SHADER_PGM_LO_ES;
}
info->regs.pgm_rsrc1 = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
info->regs.pgm_rsrc2 = R_00B22C_SPI_SHADER_PGM_RSRC2_GS;
regs->pgm_rsrc1 = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
regs->pgm_rsrc2 = R_00B22C_SPI_SHADER_PGM_RSRC2_GS;
break;
case AC_HW_LEGACY_GEOMETRY_SHADER:
assert(gfx_level < GFX11);
if (gfx_level >= GFX10) {
info->regs.pgm_lo = R_00B320_SPI_SHADER_PGM_LO_ES;
regs->pgm_lo = R_00B320_SPI_SHADER_PGM_LO_ES;
} else if (gfx_level >= GFX9) {
info->regs.pgm_lo = R_00B210_SPI_SHADER_PGM_LO_ES;
regs->pgm_lo = R_00B210_SPI_SHADER_PGM_LO_ES;
} else {
info->regs.pgm_lo = R_00B220_SPI_SHADER_PGM_LO_GS;
regs->pgm_lo = R_00B220_SPI_SHADER_PGM_LO_GS;
}
info->regs.pgm_rsrc1 = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
info->regs.pgm_rsrc2 = R_00B22C_SPI_SHADER_PGM_RSRC2_GS;
regs->pgm_rsrc1 = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
regs->pgm_rsrc2 = R_00B22C_SPI_SHADER_PGM_RSRC2_GS;
break;
case AC_HW_EXPORT_SHADER:
assert(gfx_level < GFX9);
info->regs.pgm_lo = R_00B320_SPI_SHADER_PGM_LO_ES;
info->regs.pgm_rsrc1 = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
info->regs.pgm_rsrc2 = R_00B32C_SPI_SHADER_PGM_RSRC2_ES;
regs->pgm_lo = R_00B320_SPI_SHADER_PGM_LO_ES;
regs->pgm_rsrc1 = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
regs->pgm_rsrc2 = R_00B32C_SPI_SHADER_PGM_RSRC2_ES;
break;
case AC_HW_LOCAL_SHADER:
assert(gfx_level < GFX9);
info->regs.pgm_lo = R_00B520_SPI_SHADER_PGM_LO_LS;
info->regs.pgm_rsrc1 = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
info->regs.pgm_rsrc2 = R_00B52C_SPI_SHADER_PGM_RSRC2_LS;
regs->pgm_lo = R_00B520_SPI_SHADER_PGM_LO_LS;
regs->pgm_rsrc1 = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
regs->pgm_rsrc2 = R_00B52C_SPI_SHADER_PGM_RSRC2_LS;
break;
case AC_HW_HULL_SHADER:
if (gfx_level >= GFX12) {
info->regs.pgm_lo = R_00B424_SPI_SHADER_PGM_LO_LS;
regs->pgm_lo = R_00B424_SPI_SHADER_PGM_LO_LS;
} else if (gfx_level >= GFX10) {
info->regs.pgm_lo = R_00B520_SPI_SHADER_PGM_LO_LS;
regs->pgm_lo = R_00B520_SPI_SHADER_PGM_LO_LS;
} else if (gfx_level >= GFX9) {
info->regs.pgm_lo = R_00B410_SPI_SHADER_PGM_LO_LS;
regs->pgm_lo = R_00B410_SPI_SHADER_PGM_LO_LS;
} else {
info->regs.pgm_lo = R_00B420_SPI_SHADER_PGM_LO_HS;
regs->pgm_lo = R_00B420_SPI_SHADER_PGM_LO_HS;
}
info->regs.pgm_rsrc1 = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
info->regs.pgm_rsrc2 = R_00B42C_SPI_SHADER_PGM_RSRC2_HS;
regs->pgm_rsrc1 = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
regs->pgm_rsrc2 = R_00B42C_SPI_SHADER_PGM_RSRC2_HS;
break;
case AC_HW_VERTEX_SHADER:
assert(gfx_level < GFX11);
info->regs.pgm_lo = R_00B120_SPI_SHADER_PGM_LO_VS;
info->regs.pgm_rsrc1 = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
info->regs.pgm_rsrc2 = R_00B12C_SPI_SHADER_PGM_RSRC2_VS;
regs->pgm_lo = R_00B120_SPI_SHADER_PGM_LO_VS;
regs->pgm_rsrc1 = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
regs->pgm_rsrc2 = R_00B12C_SPI_SHADER_PGM_RSRC2_VS;
break;
case AC_HW_PIXEL_SHADER:
info->regs.pgm_lo = R_00B020_SPI_SHADER_PGM_LO_PS;
info->regs.pgm_rsrc1 = R_00B028_SPI_SHADER_PGM_RSRC1_PS;
info->regs.pgm_rsrc2 = R_00B02C_SPI_SHADER_PGM_RSRC2_PS;
regs->pgm_lo = R_00B020_SPI_SHADER_PGM_LO_PS;
regs->pgm_rsrc1 = R_00B028_SPI_SHADER_PGM_RSRC1_PS;
regs->pgm_rsrc2 = R_00B02C_SPI_SHADER_PGM_RSRC2_PS;
break;
case AC_HW_COMPUTE_SHADER:
info->regs.pgm_lo = R_00B830_COMPUTE_PGM_LO;
info->regs.pgm_rsrc1 = R_00B848_COMPUTE_PGM_RSRC1;
info->regs.pgm_rsrc2 = R_00B84C_COMPUTE_PGM_RSRC2;
info->regs.pgm_rsrc3 = R_00B8A0_COMPUTE_PGM_RSRC3;
regs->pgm_lo = R_00B830_COMPUTE_PGM_LO;
regs->pgm_rsrc1 = R_00B848_COMPUTE_PGM_RSRC1;
regs->pgm_rsrc2 = R_00B84C_COMPUTE_PGM_RSRC2;
regs->pgm_rsrc3 = R_00B8A0_COMPUTE_PGM_RSRC3;
break;
default:
UNREACHABLE("invalid hw stage");
@ -2026,47 +2034,47 @@ radv_precompute_registers_pgm(const struct radv_device *device, struct radv_shad
}
static void
radv_precompute_registers(struct radv_device *device, struct radv_shader_binary *binary)
radv_precompute_registers(struct radv_device *device, struct radv_shader *shader)
{
struct radv_shader_info *info = &binary->info;
const struct radv_shader_info *info = &shader->info;
radv_precompute_registers_pgm(device, info);
radv_precompute_registers_pgm(device, shader);
switch (info->stage) {
case MESA_SHADER_VERTEX:
if (!info->vs.as_ls && !info->vs.as_es) {
if (info->is_ngg) {
radv_precompute_registers_hw_ngg(device, &binary->config, &binary->info);
radv_precompute_registers_hw_ngg(device, shader);
} else {
radv_precompute_registers_hw_vs(device, binary);
radv_precompute_registers_hw_vs(device, shader);
}
}
break;
case MESA_SHADER_TESS_EVAL:
if (!info->tes.as_es) {
if (info->is_ngg) {
radv_precompute_registers_hw_ngg(device, &binary->config, &binary->info);
radv_precompute_registers_hw_ngg(device, shader);
} else {
radv_precompute_registers_hw_vs(device, binary);
radv_precompute_registers_hw_vs(device, shader);
}
}
break;
case MESA_SHADER_GEOMETRY:
if (info->is_ngg) {
radv_precompute_registers_hw_ngg(device, &binary->config, &binary->info);
radv_precompute_registers_hw_ngg(device, shader);
} else {
radv_precompute_registers_hw_gs(device, NULL, &binary->info);
radv_precompute_registers_hw_gs(device, NULL, shader);
}
break;
case MESA_SHADER_MESH:
radv_precompute_registers_hw_ms(device, binary);
radv_precompute_registers_hw_ms(device, shader);
break;
case MESA_SHADER_FRAGMENT:
radv_precompute_registers_hw_fs(device, binary);
radv_precompute_registers_hw_fs(device, shader);
break;
case MESA_SHADER_COMPUTE:
case MESA_SHADER_TASK:
radv_precompute_registers_hw_cs(device, binary);
radv_precompute_registers_hw_cs(device, shader);
break;
default:
break;
@ -2420,9 +2428,6 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
config->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
}
/* Precompute register values for faster emission. */
radv_precompute_registers(device, binary);
return true;
}
@ -2916,6 +2921,9 @@ radv_shader_create_uncached(struct radv_device *device, const struct radv_shader
goto out;
}
/* Precompute register values for faster emission. */
radv_precompute_registers(device, shader);
*out_shader = shader;
out:


@ -422,6 +422,7 @@ struct radv_shader {
uint32_t code_size;
uint32_t exec_size;
struct radv_shader_info info;
struct radv_shader_regs regs;
uint32_t max_waves;
blake3_hash hash;
@ -709,10 +710,10 @@ radv_shader_need_push_constants_upload(const struct radv_shader *shader)
return loc->sgpr_idx != -1;
}
void radv_precompute_registers_hw_gs(struct radv_device *device, struct radv_shader_info *es_info, struct radv_shader_info *gs_info);
void radv_precompute_registers_hw_gs(struct radv_device *device, const struct radv_shader_info *es_info,
struct radv_shader *shader);
void radv_precompute_registers_hw_ngg(struct radv_device *device, const struct ac_shader_config *config,
struct radv_shader_info *info);
void radv_precompute_registers_hw_ngg(struct radv_device *device, struct radv_shader *shader);
void radv_set_stage_key_robustness(const struct vk_pipeline_robustness_state *rs, mesa_shader_stage stage,
struct radv_shader_stage_key *key);


@ -259,9 +259,10 @@ struct radv_shader_info {
struct radv_legacy_gs_info legacy_gs_info;
struct gfx10_ngg_info ngg_info;
};
};
/* Precomputed register values. */
struct {
/* Precomputed register values. */
struct radv_shader_regs {
uint32_t pgm_lo;
uint32_t pgm_rsrc1;
uint32_t pgm_rsrc2;
@ -323,7 +324,6 @@ struct radv_shader_info {
uint32_t spi_vs_out_config;
uint32_t spi_shader_pos_format;
uint32_t vgt_gs_instance_cnt;
} regs;
};
void radv_nir_shader_info_init(mesa_shader_stage stage, mesa_shader_stage next_stage, struct radv_shader_info *info);