diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 0911407d3b6..5e93c6ebfac 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -631,309 +631,6 @@ static void si_prefetch_shaders(struct si_context *sctx) sctx->prefetch_L2_mask = 0; } -#if GFX_VER == 6 /* declare these functions only once because they support all chips. */ - -/** - * This calculates the LDS size for tessellation shaders (VS, TCS, TES). - * LS.LDS_SIZE is shared by all 3 shader stages. - * - * The information about LDS and other non-compile-time parameters is then - * written to userdata SGPRs. - * - * This depends on: - * - patch_vertices - * - VS and the currently selected shader variant (called by si_update_shaders) - * - TCS and the currently selected shader variant (called by si_update_shaders) - * - tess_uses_prim_id (called by si_update_shaders) - * - sh_base[TESS_EVAL] depending on GS on/off (called by si_update_shaders) - */ -void si_update_tess_io_layout_state(struct si_context *sctx) -{ - struct si_shader *ls_current; - struct si_shader_selector *ls; - struct si_shader_selector *tcs = sctx->shader.tcs.cso; - unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; - bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1; - unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; - uint8_t num_tcs_input_cp = sctx->patch_vertices; - - assert(sctx->shader.tcs.current); - - /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */ - if (sctx->gfx_level >= GFX9) { - ls_current = sctx->shader.tcs.current; - ls = ls_current->key.ge.part.tcs.ls; - } else { - ls_current = sctx->shader.vs.current; - ls = sctx->shader.vs.cso; - } - - if (sctx->last_ls == ls_current && sctx->last_tcs == tcs && - sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp && - (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) - return; - - sctx->last_ls = ls_current; - sctx->last_tcs = tcs; - sctx->last_tes_sh_base = tes_sh_base; - sctx->last_num_tcs_input_cp = num_tcs_input_cp; - sctx->last_tess_uses_primid = tess_uses_primid; - - /* This calculates how shader inputs and outputs among VS, TCS, and TES - * are laid out in LDS. */ - unsigned num_tcs_outputs = util_last_bit64(tcs->info.outputs_written); - unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out; - unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written); - - unsigned input_vertex_size = ls->info.lshs_vertex_stride; - unsigned output_vertex_size = num_tcs_outputs * 16; - unsigned input_patch_size; - - /* Allocate LDS for TCS inputs only if it's used. */ - if (!ls_current->key.ge.opt.same_patch_vertices || - tcs->info.base.inputs_read & ~tcs->info.tcs_vgpr_only_inputs) - input_patch_size = num_tcs_input_cp * input_vertex_size; - else - input_patch_size = 0; - - unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; - unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - unsigned lds_per_patch; - - /* Compute the LDS size per patch. - * - * LDS is used to store TCS outputs if they are read, and to store tess - * factors if they are not defined in all invocations. - */ - if (tcs->info.base.outputs_read || - tcs->info.base.patch_outputs_read || - !tcs->info.tessfactors_are_def_in_all_invocs) { - lds_per_patch = input_patch_size + output_patch_size; - } else { - /* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */ - lds_per_patch = MAX2(input_patch_size, output_patch_size); - } - - /* Ensure that we only need 4 waves per CU, so that we don't need to check - * resource usage (such as whether we have enough VGPRs to fit the whole - * threadgroup into the CU). It also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256, which is the hw limit. - */ - unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); - unsigned num_patches = 256 / max_verts_per_patch; - - /* Not necessary for correctness, but higher numbers are slower. - * The hardware can do more, but the radeonsi shader constant is - * limited to 6 bits. - */ - num_patches = MIN2(num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */ - - /* When distributed tessellation is unsupported, switch between SEs - * at a higher frequency to manually balance the workload between SEs. - */ - if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1) - num_patches = MIN2(num_patches, 16); /* recommended */ - - /* Make sure the output data fits in the offchip buffer */ - num_patches = - MIN2(num_patches, (sctx->screen->hs.tess_offchip_block_dw_size * 4) / output_patch_size); - - /* Make sure that the data fits in LDS. This assumes the shaders only - * use LDS for the inputs and outputs. - * - * The maximum allowed LDS size is 32K. Higher numbers can hang. - * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU. - */ - ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */ - unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */ - num_patches = MIN2(num_patches, target_lds_size / lds_per_patch); - num_patches = MAX2(num_patches, 1); - assert(num_patches * lds_per_patch <= max_lds_size); - - /* Make sure that vector lanes are fully occupied by cutting off the last wave - * if it's only partially filled. - */ - unsigned temp_verts_per_tg = num_patches * max_verts_per_patch; - unsigned wave_size = ls_current->wave_size; - - if (temp_verts_per_tg > wave_size && - (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8))) - num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; - - if (sctx->gfx_level == GFX6) { - /* GFX6 bug workaround, related to power management. Limit LS-HS - * threadgroups to only one wave. - */ - unsigned one_wave = wave_size / max_verts_per_patch; - num_patches = MIN2(num_patches, one_wave); - } - - /* The VGT HS block increments the patch ID unconditionally - * within a single threadgroup. This results in incorrect - * patch IDs when instanced draws are used. - * - * The intended solution is to restrict threadgroups to - * a single instance by setting SWITCH_ON_EOI, which - * should cause IA to split instances up. However, this - * doesn't work correctly on GFX6 when there is no other - * SE to switch to. - */ - if (has_primid_instancing_bug && tess_uses_primid) - num_patches = 1; - - sctx->num_patches_per_workgroup = num_patches; - - unsigned output_patch0_offset = input_patch_size * num_patches; - unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; - - /* Compute userdata SGPRs. */ - assert(((input_vertex_size / 4) & ~0xff) == 0); - assert(((perpatch_output_offset / 4) & ~0xffff) == 0); - assert(num_tcs_input_cp <= 32); - assert(num_tcs_output_cp <= 32); - assert(num_patches <= 64); - assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0); - - uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ? - si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address; - assert((ring_va & u_bit_consecutive(0, 19)) == 0); - - sctx->tes_offchip_ring_va_sgpr = ring_va; - sctx->tcs_offchip_layout = - (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) | - ((pervertex_output_patch_size * num_patches) << 16); - - /* Compute the LDS size. */ - unsigned lds_size = lds_per_patch * num_patches; - - if (sctx->gfx_level >= GFX7) { - assert(lds_size <= 65536); - lds_size = align(lds_size, 512) / 512; - } else { - assert(lds_size <= 32768); - lds_size = align(lds_size, 256) / 256; - } - - /* Set SI_SGPR_VS_STATE_BITS. */ - SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_VERTEX_SIZE, input_vertex_size / 4); - SET_FIELD(sctx->current_vs_state, VS_STATE_TCS_OUT_PATCH0_OFFSET, perpatch_output_offset / 4); - - /* We should be able to support in-shader LDS use with LLVM >= 9 - * by just adding the lds_sizes together, but it has never - * been tested. */ - assert(ls_current->config.lds_size == 0); - - unsigned ls_hs_rsrc2; - - if (sctx->gfx_level >= GFX9) { - ls_hs_rsrc2 = sctx->shader.tcs.current->config.rsrc2; - - if (sctx->gfx_level >= GFX10) - ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size); - else - ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size); - } else { - ls_hs_rsrc2 = sctx->shader.vs.current->config.rsrc2; - - si_multiwave_lds_size_workaround(sctx->screen, &lds_size); - ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_size); - } - - sctx->ls_hs_rsrc2 = ls_hs_rsrc2; - sctx->ls_hs_config = - S_028B58_NUM_PATCHES(sctx->num_patches_per_workgroup) | - S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | - S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); - - si_mark_atom_dirty(sctx, &sctx->atoms.s.tess_io_layout); -} - -static void si_emit_tess_io_layout_state(struct si_context *sctx) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - radeon_begin(cs); - - if (!sctx->shader.tes.cso || !sctx->shader.tcs.current) - return; - - if (sctx->screen->info.has_set_pairs_packets) { - radeon_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS, - SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2); - - /* Set userdata SGPRs for merged LS-HS. */ - radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + - GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, - SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT, - sctx->tcs_offchip_layout); - radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + - GFX9_SGPR_TCS_OFFCHIP_ADDR * 4, - SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR, - sctx->tes_offchip_ring_va_sgpr); - } else if (sctx->gfx_level >= GFX9) { - radeon_opt_set_sh_reg(sctx, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, - SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2); - - /* Set userdata SGPRs for merged LS-HS. */ - radeon_opt_set_sh_reg2(sctx, - R_00B430_SPI_SHADER_USER_DATA_HS_0 + - GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, - SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT, - sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr); - } else { - /* Due to a hw bug, RSRC2_LS must be written twice with another - * LS register written in between. */ - if (sctx->gfx_level == GFX7 && sctx->family != CHIP_HAWAII) - radeon_set_sh_reg(R_00B52C_SPI_SHADER_PGM_RSRC2_LS, sctx->ls_hs_rsrc2); - radeon_set_sh_reg_seq(R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); - radeon_emit(sctx->shader.vs.current->config.rsrc1); - radeon_emit(sctx->ls_hs_rsrc2); - - /* Set userdata SGPRs for TCS. */ - radeon_opt_set_sh_reg3(sctx, - R_00B430_SPI_SHADER_USER_DATA_HS_0 + - GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, - SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT, - sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr, - sctx->current_vs_state); - } - - /* Set userdata SGPRs for TES. */ - unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; - assert(tes_sh_base); - - /* TES (as ES or VS) reuses the BaseVertex and DrawID user SGPRs that are used when - * tessellation is disabled. That's because those user SGPRs are only set in LS - * for tessellation. - */ - if (sctx->screen->info.has_set_pairs_packets) { - radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, - SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX, - sctx->tcs_offchip_layout); - radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4, - SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID, - sctx->tes_offchip_ring_va_sgpr); - } else { - bool has_gs = sctx->ngg || sctx->shader.gs.cso; - - radeon_opt_set_sh_reg2(sctx, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, - has_gs ? SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX - : SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX, - sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr); - } - radeon_end(); - - radeon_begin_again(cs); - if (sctx->gfx_level >= GFX7) { - radeon_opt_set_context_reg_idx(sctx, R_028B58_VGT_LS_HS_CONFIG, - SI_TRACKED_VGT_LS_HS_CONFIG, 2, sctx->ls_hs_config); - } else { - radeon_opt_set_context_reg(sctx, R_028B58_VGT_LS_HS_CONFIG, - SI_TRACKED_VGT_LS_HS_CONFIG, sctx->ls_hs_config); - } - radeon_end_update_context_roll(sctx); -} -#endif - static unsigned si_num_prims_for_vertices(enum mesa_prim prim, unsigned count, unsigned vertices_per_patch) { @@ -2906,8 +2603,6 @@ void si_init_spi_map_functions(struct si_context *sctx) sctx->emit_spi_map[30] = si_emit_spi_map<30>; sctx->emit_spi_map[31] = si_emit_spi_map<31>; sctx->emit_spi_map[32] = si_emit_spi_map<32>; - - sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state; } #endif diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index ca49937abd8..b2d47499ed9 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4298,6 +4298,306 @@ static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertic } } +/** + * This calculates the LDS size for tessellation shaders (VS, TCS, TES). + * LS.LDS_SIZE is shared by all 3 shader stages. + * + * The information about LDS and other non-compile-time parameters is then + * written to userdata SGPRs. + * + * This depends on: + * - patch_vertices + * - VS and the currently selected shader variant (called by si_update_shaders) + * - TCS and the currently selected shader variant (called by si_update_shaders) + * - tess_uses_prim_id (called by si_update_shaders) + * - sh_base[TESS_EVAL] depending on GS on/off (called by si_update_shaders) + */ +void si_update_tess_io_layout_state(struct si_context *sctx) +{ + struct si_shader *ls_current; + struct si_shader_selector *ls; + struct si_shader_selector *tcs = sctx->shader.tcs.cso; + unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; + bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1; + unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; + uint8_t num_tcs_input_cp = sctx->patch_vertices; + + assert(sctx->shader.tcs.current); + + /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */ + if (sctx->gfx_level >= GFX9) { + ls_current = sctx->shader.tcs.current; + ls = ls_current->key.ge.part.tcs.ls; + } else { + ls_current = sctx->shader.vs.current; + ls = sctx->shader.vs.cso; + } + + if (sctx->last_ls == ls_current && sctx->last_tcs == tcs && + sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp && + (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) + return; + + sctx->last_ls = ls_current; + sctx->last_tcs = tcs; + sctx->last_tes_sh_base = tes_sh_base; + sctx->last_num_tcs_input_cp = num_tcs_input_cp; + sctx->last_tess_uses_primid = tess_uses_primid; + + /* This calculates how shader inputs and outputs among VS, TCS, and TES + * are laid out in LDS. */ + unsigned num_tcs_outputs = util_last_bit64(tcs->info.outputs_written); + unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out; + unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written); + + unsigned input_vertex_size = ls->info.lshs_vertex_stride; + unsigned output_vertex_size = num_tcs_outputs * 16; + unsigned input_patch_size; + + /* Allocate LDS for TCS inputs only if it's used. */ + if (!ls_current->key.ge.opt.same_patch_vertices || + tcs->info.base.inputs_read & ~tcs->info.tcs_vgpr_only_inputs) + input_patch_size = num_tcs_input_cp * input_vertex_size; + else + input_patch_size = 0; + + unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; + unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + unsigned lds_per_patch; + + /* Compute the LDS size per patch. + * + * LDS is used to store TCS outputs if they are read, and to store tess + * factors if they are not defined in all invocations. + */ + if (tcs->info.base.outputs_read || + tcs->info.base.patch_outputs_read || + !tcs->info.tessfactors_are_def_in_all_invocs) { + lds_per_patch = input_patch_size + output_patch_size; + } else { + /* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */ + lds_per_patch = MAX2(input_patch_size, output_patch_size); + } + + /* Ensure that we only need 4 waves per CU, so that we don't need to check + * resource usage (such as whether we have enough VGPRs to fit the whole + * threadgroup into the CU). It also ensures that the number of tcs in and out + * vertices per threadgroup are at most 256, which is the hw limit. + */ + unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); + unsigned num_patches = 256 / max_verts_per_patch; + + /* Not necessary for correctness, but higher numbers are slower. + * The hardware can do more, but the radeonsi shader constant is + * limited to 6 bits. + */ + num_patches = MIN2(num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */ + + /* When distributed tessellation is unsupported, switch between SEs + * at a higher frequency to manually balance the workload between SEs. + */ + if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1) + num_patches = MIN2(num_patches, 16); /* recommended */ + + /* Make sure the output data fits in the offchip buffer */ + num_patches = + MIN2(num_patches, (sctx->screen->hs.tess_offchip_block_dw_size * 4) / output_patch_size); + + /* Make sure that the data fits in LDS. This assumes the shaders only + * use LDS for the inputs and outputs. + * + * The maximum allowed LDS size is 32K. Higher numbers can hang. + * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU. + */ + ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */ + unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */ + num_patches = MIN2(num_patches, target_lds_size / lds_per_patch); + num_patches = MAX2(num_patches, 1); + assert(num_patches * lds_per_patch <= max_lds_size); + + /* Make sure that vector lanes are fully occupied by cutting off the last wave + * if it's only partially filled. + */ + unsigned temp_verts_per_tg = num_patches * max_verts_per_patch; + unsigned wave_size = ls_current->wave_size; + + if (temp_verts_per_tg > wave_size && + (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8))) + num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; + + if (sctx->gfx_level == GFX6) { + /* GFX6 bug workaround, related to power management. Limit LS-HS + * threadgroups to only one wave. + */ + unsigned one_wave = wave_size / max_verts_per_patch; + num_patches = MIN2(num_patches, one_wave); + } + + /* The VGT HS block increments the patch ID unconditionally + * within a single threadgroup. This results in incorrect + * patch IDs when instanced draws are used. + * + * The intended solution is to restrict threadgroups to + * a single instance by setting SWITCH_ON_EOI, which + * should cause IA to split instances up. However, this + * doesn't work correctly on GFX6 when there is no other + * SE to switch to. + */ + if (has_primid_instancing_bug && tess_uses_primid) + num_patches = 1; + + sctx->num_patches_per_workgroup = num_patches; + + unsigned output_patch0_offset = input_patch_size * num_patches; + unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; + + /* Compute userdata SGPRs. */ + assert(((input_vertex_size / 4) & ~0xff) == 0); + assert(((perpatch_output_offset / 4) & ~0xffff) == 0); + assert(num_tcs_input_cp <= 32); + assert(num_tcs_output_cp <= 32); + assert(num_patches <= 64); + assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0); + + uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ? + si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address; + assert((ring_va & u_bit_consecutive(0, 19)) == 0); + + sctx->tes_offchip_ring_va_sgpr = ring_va; + sctx->tcs_offchip_layout = + (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) | + ((pervertex_output_patch_size * num_patches) << 16); + + /* Compute the LDS size. */ + unsigned lds_size = lds_per_patch * num_patches; + + if (sctx->gfx_level >= GFX7) { + assert(lds_size <= 65536); + lds_size = align(lds_size, 512) / 512; + } else { + assert(lds_size <= 32768); + lds_size = align(lds_size, 256) / 256; + } + + /* Set SI_SGPR_VS_STATE_BITS. */ + SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_VERTEX_SIZE, input_vertex_size / 4); + SET_FIELD(sctx->current_vs_state, VS_STATE_TCS_OUT_PATCH0_OFFSET, perpatch_output_offset / 4); + + /* We should be able to support in-shader LDS use with LLVM >= 9 + * by just adding the lds_sizes together, but it has never + * been tested. */ + assert(ls_current->config.lds_size == 0); + + unsigned ls_hs_rsrc2; + + if (sctx->gfx_level >= GFX9) { + ls_hs_rsrc2 = sctx->shader.tcs.current->config.rsrc2; + + if (sctx->gfx_level >= GFX10) + ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size); + else + ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size); + } else { + ls_hs_rsrc2 = sctx->shader.vs.current->config.rsrc2; + + si_multiwave_lds_size_workaround(sctx->screen, &lds_size); + ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_size); + } + + sctx->ls_hs_rsrc2 = ls_hs_rsrc2; + sctx->ls_hs_config = + S_028B58_NUM_PATCHES(sctx->num_patches_per_workgroup) | + S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | + S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); + + si_mark_atom_dirty(sctx, &sctx->atoms.s.tess_io_layout); +} + +static void si_emit_tess_io_layout_state(struct si_context *sctx) +{ + struct radeon_cmdbuf *cs = &sctx->gfx_cs; + radeon_begin(cs); + + if (!sctx->shader.tes.cso || !sctx->shader.tcs.current) + return; + + if (sctx->screen->info.has_set_pairs_packets) { + radeon_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS, + SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2); + + /* Set userdata SGPRs for merged LS-HS. */ + radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT, + sctx->tcs_offchip_layout); + radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX9_SGPR_TCS_OFFCHIP_ADDR * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR, + sctx->tes_offchip_ring_va_sgpr); + } else if (sctx->gfx_level >= GFX9) { + radeon_opt_set_sh_reg(sctx, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, + SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2); + + /* Set userdata SGPRs for merged LS-HS. */ + radeon_opt_set_sh_reg2(sctx, + R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT, + sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr); + } else { + /* Due to a hw bug, RSRC2_LS must be written twice with another + * LS register written in between. */ + if (sctx->gfx_level == GFX7 && sctx->family != CHIP_HAWAII) + radeon_set_sh_reg(R_00B52C_SPI_SHADER_PGM_RSRC2_LS, sctx->ls_hs_rsrc2); + radeon_set_sh_reg_seq(R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); + radeon_emit(sctx->shader.vs.current->config.rsrc1); + radeon_emit(sctx->ls_hs_rsrc2); + + /* Set userdata SGPRs for TCS. */ + radeon_opt_set_sh_reg3(sctx, + R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT, + sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr, + sctx->current_vs_state); + } + + /* Set userdata SGPRs for TES. */ + unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; + assert(tes_sh_base); + + /* TES (as ES or VS) reuses the BaseVertex and DrawID user SGPRs that are used when + * tessellation is disabled. That's because those user SGPRs are only set in LS + * for tessellation. + */ + if (sctx->screen->info.has_set_pairs_packets) { + radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX, + sctx->tcs_offchip_layout); + radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID, + sctx->tes_offchip_ring_va_sgpr); + } else { + bool has_gs = sctx->ngg || sctx->shader.gs.cso; + + radeon_opt_set_sh_reg2(sctx, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, + has_gs ? SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX + : SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX, + sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr); + } + radeon_end(); + + radeon_begin_again(cs); + if (sctx->gfx_level >= GFX7) { + radeon_opt_set_context_reg_idx(sctx, R_028B58_VGT_LS_HS_CONFIG, + SI_TRACKED_VGT_LS_HS_CONFIG, 2, sctx->ls_hs_config); + } else { + radeon_opt_set_context_reg(sctx, R_028B58_VGT_LS_HS_CONFIG, + SI_TRACKED_VGT_LS_HS_CONFIG, sctx->ls_hs_config); + } + radeon_end_update_context_roll(sctx); +} + void si_init_screen_live_shader_cache(struct si_screen *sscreen) { util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector, @@ -4308,6 +4608,7 @@ void si_init_shader_functions(struct si_context *sctx) { sctx->atoms.s.vgt_pipeline_state.emit = si_emit_vgt_pipeline_state; sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; + sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state; sctx->b.create_vs_state = si_create_shader; sctx->b.create_tcs_state = si_create_shader;