radv: Use new NIR lowering of NGG GS when ACO is used.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10740>
This commit is contained in:
Timur Kristóf 2021-04-22 14:44:28 +02:00 committed by Marge Bot
parent 60ac5dda82
commit bb127c2130
4 changed files with 16 additions and 705 deletions

View file

@ -4290,44 +4290,6 @@ Temp thread_id_in_threadgroup(isel_context *ctx)
return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
}
Temp wave_count_in_threadgroup(isel_context *ctx)
{
Builder bld(ctx->program, ctx->block);
return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(28u | (4u << 16)));
}
Temp ngg_gs_vertex_lds_addr(isel_context *ctx, Temp vertex_idx)
{
Builder bld(ctx->program, ctx->block);
unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1;
/* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
if (write_stride_2exp) {
Temp row = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(5u), vertex_idx);
Temp swizzle = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand((1u << write_stride_2exp) - 1), row);
vertex_idx = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), vertex_idx, swizzle);
}
Temp vertex_idx_bytes = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->ngg_gs_emit_vtx_bytes);
return bld.vadd32(bld.def(v1), vertex_idx_bytes, Operand(ctx->ngg_gs_emit_addr));
}
Temp ngg_gs_emit_vertex_lds_addr(isel_context *ctx, Temp emit_vertex_idx)
{
/* Should be used by GS threads only (not by the NGG GS epilogue).
* Returns the LDS address of the given vertex index as emitted by the current GS thread.
*/
Builder bld(ctx->program, ctx->block);
Temp thread_id_in_tg = thread_id_in_threadgroup(ctx);
Temp thread_vertices_addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, ctx->shader->info.gs.vertices_out);
Temp vertex_idx = bld.vadd32(bld.def(v1), thread_vertices_addr, emit_vertex_idx);
return ngg_gs_vertex_lds_addr(ctx, vertex_idx);
}
Temp get_tess_rel_patch_id(isel_context *ctx)
{
Builder bld(ctx->program, ctx->block);
@ -6988,115 +6950,6 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
}
}
unsigned gs_outprim_vertices(unsigned outprim)
{
switch (outprim) {
case 0: /* GL_POINTS */
return 1;
case 3: /* GL_LINE_STRIP */
return 2;
case 5: /* GL_TRIANGLE_STRIP */
return 3;
default:
unreachable("Unsupported GS output primitive type.");
}
}
void ngg_visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
Temp emit_vertex_idx = get_ssa_temp(ctx, instr->src[0].ssa);
Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, emit_vertex_idx);
unsigned stream = nir_intrinsic_stream_id(instr);
unsigned out_idx = 0;
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
if (ctx->program->info->gs.output_streams[i] != stream) {
continue;
} else if (!ctx->outputs.mask[i] && ctx->program->info->gs.output_usage_mask[i]) {
/* The GS can write this output, but it's empty for the current vertex. */
out_idx++;
continue;
}
uint32_t wrmask = ctx->program->info->gs.output_usage_mask[i] &
ctx->outputs.mask[i];
/* Clear output for the next vertex. */
ctx->outputs.mask[i] = 0;
if (!wrmask)
continue;
for (unsigned j = 0; j < 4; j++) {
if (wrmask & (1 << j)) {
Temp elem = ctx->outputs.temps[i * 4u + j];
store_lds(ctx, elem.bytes(), elem, 0x1u, emit_vertex_addr, out_idx * 4u, 4u);
}
out_idx++;
}
}
/* Calculate per-vertex primitive flags based on current and total vertex count per primitive:
* bit 0: whether this vertex finishes a primitive
* bit 1: whether the primitive is odd (if we are emitting triangle strips, otherwise always 0)
* bit 2: always 1 (so that we can use it for determining vertex liveness)
*/
unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
bool calc_odd = stream == 0 && total_vtx_per_prim == 3;
Temp prim_flag;
if (nir_src_is_const(instr->src[1])) {
uint8_t current_vtx_per_prim = nir_src_as_uint(instr->src[1]);
uint8_t completes_prim = (current_vtx_per_prim >= (total_vtx_per_prim - 1)) ? 1 : 0;
uint8_t odd = (uint8_t)calc_odd & current_vtx_per_prim;
uint8_t flag = completes_prim | (odd << 1) | (1 << 2);
prim_flag = bld.copy(bld.def(v1b), Operand(flag));
} else if (!instr->src[1].ssa->divergent) {
Temp current_vtx_per_prim = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
Temp completes_prim = bld.sopc(aco_opcode::s_cmp_le_u32, bld.def(s1, scc), Operand(total_vtx_per_prim - 1), current_vtx_per_prim);
prim_flag = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0b101u), Operand(0b100u), bld.scc(completes_prim));
if (calc_odd) {
Temp odd = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), current_vtx_per_prim, Operand(0u));
prim_flag = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1), bld.def(s1, scc), odd, prim_flag);
}
} else {
Temp current_vtx_per_prim = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
Temp completes_prim = bld.vopc(aco_opcode::v_cmp_le_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(total_vtx_per_prim - 1), current_vtx_per_prim);
prim_flag = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0b100u), Operand(0b101u), Operand(completes_prim));
if (calc_odd) {
Temp odd = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), current_vtx_per_prim);
prim_flag = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), odd, Operand(1u), prim_flag);
}
}
/* Store the per-vertex primitive flags at the end of the vertex data */
prim_flag = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), as_vgpr(ctx, prim_flag), Operand(0u));
unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream;
store_lds(ctx, 1, prim_flag, 1u, emit_vertex_addr, primflag_offset, 1);
}
void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream);
void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr);
void ngg_visit_set_vertex_and_primitive_count(isel_context *ctx, nir_intrinsic_instr *instr)
{
unsigned stream = nir_intrinsic_stream_id(instr);
if (stream > 0 && !ctx->args->shader_info->gs.num_stream_output_components[stream])
return;
ctx->ngg_gs_known_vtxcnt[stream] = true;
/* Clear the primitive flags of non-emitted GS vertices. */
if (!nir_src_is_const(instr->src[0]) || nir_src_as_uint(instr->src[0]) < ctx->shader->info.gs.vertices_out) {
Temp vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa);
ngg_gs_clear_primflags(ctx, vtx_cnt, stream);
}
ngg_gs_write_shader_query(ctx, instr);
}
void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
@ -8521,10 +8374,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
}
case nir_intrinsic_emit_vertex_with_counter: {
if (ctx->stage.hw == HWStage::NGG)
ngg_visit_emit_vertex_with_counter(ctx, instr);
else
visit_emit_vertex_with_counter(ctx, instr);
assert(ctx->stage.hw == HWStage::GS);
visit_emit_vertex_with_counter(ctx, instr);
break;
}
case nir_intrinsic_end_primitive_with_counter: {
@ -8535,8 +8386,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
}
case nir_intrinsic_set_vertex_and_primitive_count: {
if (ctx->stage.hw == HWStage::NGG)
ngg_visit_set_vertex_and_primitive_count(ctx, instr);
assert(ctx->stage.hw == HWStage::GS);
/* unused in the legacy pipeline, the HW keeps track of this for us */
break;
}
@ -11193,20 +11043,6 @@ Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
return lanecount_to_mask(ctx, count);
}
Temp ngg_max_vertex_count(isel_context *ctx)
{
Builder bld(ctx->program, ctx->block);
return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(12u | (9u << 16u)));
}
Temp ngg_max_primitive_count(isel_context *ctx)
{
Builder bld(ctx->program, ctx->block);
return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(22u | (9u << 16u)));
}
void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt)
{
assert(vtx_cnt.id() && prm_cnt.id());
@ -11214,7 +11050,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt
Builder bld(ctx->program, ctx->block);
Temp prm_cnt_0;
if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS) && ctx->ngg_gs_const_prmcnt[0] <= 0) {
if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS)) {
/* Navi 1x workaround: make sure to always export at least 1 vertex and triangle */
prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand(0u));
prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0));
@ -11254,495 +11090,6 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt
}
}
void ngg_emit_wave0_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Temp prm_cnt = Temp())
{
Builder bld(ctx->program, ctx->block);
/* Get the id of the current wave within the threadgroup (workgroup) */
Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(24u | (4u << 16)));
/* Execute the following code only on the first wave (wave id 0),
* use the SCC def to tell if the wave id is zero or not.
*/
Temp waveid_as_cond = wave_id_in_tg.def(1).getTemp();
if_context ic;
begin_uniform_if_then(ctx, &ic, waveid_as_cond);
begin_uniform_if_else(ctx, &ic);
bld.reset(ctx->block);
/* VS/TES: we infer the vertex and primitive count from arguments
* GS: the caller needs to supply them
*/
assert(ctx->stage.has(SWStage::GS)
? (vtx_cnt.id() && prm_cnt.id())
: (!vtx_cnt.id() && !prm_cnt.id()));
/* Number of vertices output by VS/TES */
if (vtx_cnt.id() == 0)
vtx_cnt = ngg_max_vertex_count(ctx);
/* Number of primitives output by VS/TES */
if (prm_cnt.id() == 0)
prm_cnt = ngg_max_primitive_count(ctx);
ngg_emit_sendmsg_gs_alloc_req(ctx, vtx_cnt, prm_cnt);
end_uniform_if(ctx, &ic);
}
Temp ngg_pack_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[], const Temp is_null)
{
Builder bld(ctx->program, ctx->block);
Temp tmp;
Temp gs_invocation_id;
if (ctx->stage == vertex_ngg)
gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
for (unsigned i = 0; i < num_vertices; ++i) {
assert(vtxindex[i].id());
if (i)
tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
else
tmp = vtxindex[i];
/* The initial edge flag is always false in tess eval shaders. */
if (ctx->stage == vertex_ngg) {
Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8u + i), Operand(1u));
tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
}
}
if (is_null.id())
tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), is_null, Operand(31u), tmp);
return tmp;
}
void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[], const Temp is_null = Temp())
{
Builder bld(ctx->program, ctx->block);
Temp prim_exp_arg;
if (!ctx->stage.has(SWStage::GS) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
prim_exp_arg = get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]);
else
prim_exp_arg = ngg_pack_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex, is_null);
bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
false /* compressed */, true/* done */, false /* valid mask */);
}
std::pair<Temp, Temp> ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp src_mask)
{
/* Workgroup scan for NGG GS.
 * This performs a reduction along with an exclusive scan addition across the workgroup.
* Assumes that all lanes are enabled (exec = -1) where this is emitted.
*
* Input: (1) per-lane bool
* -- 1 if the lane has a live/valid vertex, 0 otherwise
* Output: (1) result of a reduction over the entire workgroup,
* -- the total number of vertices emitted by the workgroup
* (2) result of an exclusive scan over the entire workgroup
* -- used for vertex compaction, in order to determine
* which lane should export the current lane's vertex
*/
Builder bld(ctx->program, ctx->block);
assert(src_mask.regClass() == bld.lm);
/* Subgroup reduction and exclusive scan on the per-lane boolean. */
Temp sg_reduction = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), src_mask);
Temp sg_excl = emit_mbcnt(ctx, bld.tmp(v1), Operand(src_mask));
if (ctx->program->workgroup_size <= ctx->program->wave_size)
return std::make_pair(sg_reduction, sg_excl);
if_context ic;
/* Determine if the current lane is the first. */
Temp is_first_lane = bld.copy(bld.def(bld.lm), Operand(1u, ctx->program->wave_size == 64));
Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
begin_divergent_if_then(ctx, &ic, is_first_lane);
bld.reset(ctx->block);
/* The first lane of each wave stores the result of its subgroup reduction to LDS (NGG scratch). */
Temp wave_id_in_tg_lds_addr = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), wave_id_in_tg);
store_lds(ctx, 4u, as_vgpr(ctx, sg_reduction), 0x1u, wave_id_in_tg_lds_addr, ctx->ngg_gs_scratch_addr, 4u);
/* Wait for all waves to write to LDS. */
create_workgroup_barrier(bld);
/* Number of LDS dwords written by all waves (if there is only 1, that is already handled above) */
unsigned num_lds_dwords = DIV_ROUND_UP(MIN2(ctx->program->workgroup_size, 256), ctx->program->wave_size);
assert(num_lds_dwords >= 2 && num_lds_dwords <= 8);
/* The first lane of each wave loads every wave's results from LDS, to avoid bank conflicts */
Temp reduction_per_wave_vector = load_lds(ctx, 4u * num_lds_dwords, bld.tmp(RegClass(RegType::vgpr, num_lds_dwords)),
bld.copy(bld.def(v1), Operand(0u)), ctx->ngg_gs_scratch_addr, 16u);
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
bld.reset(ctx->block);
/* Create phis which get us the above reduction results, or undef. */
bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
reduction_per_wave_vector = bld.pseudo(aco_opcode::p_phi, bld.def(reduction_per_wave_vector.regClass()), reduction_per_wave_vector, Operand(reduction_per_wave_vector.regClass()));
bld.reset(ctx->block);
emit_split_vector(ctx, reduction_per_wave_vector, num_lds_dwords);
Temp reduction_per_wave[8];
for (unsigned i = 0; i < num_lds_dwords; ++i) {
Temp reduction_current_wave = emit_extract_vector(ctx, reduction_per_wave_vector, i, v1);
reduction_per_wave[i] = bld.readlane(bld.def(s1), reduction_current_wave, Operand(0u));
}
Temp wave_count = wave_count_in_threadgroup(ctx);
Temp reduction_result = reduction_per_wave[0];
Temp excl_base;
for (unsigned i = 0; i < num_lds_dwords; ++i) {
/* Workgroup reduction:
* Add the reduction results from all waves (up to and including wave_count).
*/
if (i != 0) {
Temp should_add = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), wave_count, Operand(i + 1u));
Temp addition = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), reduction_per_wave[i], Operand(0u), bld.scc(should_add));
reduction_result = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), reduction_result, addition);
}
/* Base of workgroup exclusive scan:
* Add the reduction results from waves up to and excluding wave_id_in_tg.
*/
if (i != (num_lds_dwords - 1)) {
Temp should_add = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), wave_id_in_tg, Operand(i + 1u));
Temp addition = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), reduction_per_wave[i], Operand(0u), bld.scc(should_add));
excl_base = !excl_base.id() ? addition : bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), excl_base, addition);
}
}
assert(excl_base.id());
/* WG exclusive scan result: base + subgroup exclusive result. */
Temp wg_excl = bld.vadd32(bld.def(v1), Operand(excl_base), Operand(sg_excl));
return std::make_pair(reduction_result, wg_excl);
}
void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream)
{
loop_context lc;
if_context ic;
Builder bld(ctx->program, ctx->block);
Temp zero = bld.copy(bld.def(v1b), Operand(uint8_t(0)));
Temp counter_init = bld.copy(bld.def(v1), as_vgpr(ctx, vtx_cnt));
begin_loop(ctx, &lc);
Temp incremented_counter = bld.tmp(counter_init.regClass());
bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
Temp counter = bld.pseudo(aco_opcode::p_phi, bld.def(counter_init.regClass()), Operand(counter_init), incremented_counter);
bld.reset(ctx->block);
Temp break_cond = bld.vopc(aco_opcode::v_cmp_le_u32, bld.def(bld.lm), Operand(ctx->shader->info.gs.vertices_out), counter);
/* Break when vertices_out <= counter */
begin_divergent_if_then(ctx, &ic, break_cond);
emit_loop_break(ctx);
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
bld.reset(ctx->block);
/* Store zero to the primitive flag of the current vertex for the current stream */
Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, counter);
unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream;
store_lds(ctx, 1, zero, 0xf, emit_vertex_addr, primflag_offset, 1);
/* Increment counter */
bld.vadd32(Definition(incremented_counter), counter, Operand(1u));
end_loop(ctx, &lc);
}
void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr)
{
/* Each subgroup uses a single GDS atomic to collect the total number of primitives.
* TODO: Consider using primitive compaction at the end instead.
*/
unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
if_context ic_shader_query;
Builder bld(ctx->program, ctx->block);
Temp shader_query = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), get_arg(ctx, ctx->args->ngg_gs_state), Operand(0u));
begin_uniform_if_then(ctx, &ic_shader_query, shader_query);
bld.reset(ctx->block);
Temp sg_prm_cnt;
/* Calculate the "real" number of emitted primitives from the emitted GS vertices and primitives.
* GS emits points, line strips or triangle strips.
* Real primitives are points, lines or triangles.
*/
if (nir_src_is_const(instr->src[0]) && nir_src_is_const(instr->src[1])) {
unsigned gs_vtx_cnt = nir_src_as_uint(instr->src[0]);
unsigned gs_prm_cnt = nir_src_as_uint(instr->src[1]);
Temp prm_cnt = bld.copy(bld.def(s1), Operand(gs_vtx_cnt - gs_prm_cnt * (total_vtx_per_prim - 1u)));
Temp thread_cnt = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
sg_prm_cnt = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), prm_cnt, thread_cnt);
} else {
Temp gs_vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa);
Temp prm_cnt = get_ssa_temp(ctx, instr->src[1].ssa);
if (total_vtx_per_prim > 1)
prm_cnt = bld.vop3(aco_opcode::v_mad_i32_i24, bld.def(v1), prm_cnt, Operand(-1u * (total_vtx_per_prim - 1)), gs_vtx_cnt);
else
prm_cnt = as_vgpr(ctx, prm_cnt);
/* Reduction calculates the primitive count for the entire subgroup. */
sg_prm_cnt = emit_reduction_instr(ctx, aco_opcode::p_reduce, ReduceOp::iadd32,
ctx->program->wave_size, bld.def(s1), prm_cnt);
}
Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
Temp is_first_lane = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
Operand(1u, ctx->program->wave_size == 64), first_lane);
if_context ic_last_lane;
begin_divergent_if_then(ctx, &ic_last_lane, is_first_lane);
bld.reset(ctx->block);
Temp gds_addr = bld.copy(bld.def(v1), Operand(0u));
Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0x100u)));
bld.ds(aco_opcode::ds_add_u32, gds_addr, as_vgpr(ctx, sg_prm_cnt), m, 0u, 0u, true);
begin_divergent_if_else(ctx, &ic_last_lane);
end_divergent_if(ctx, &ic_last_lane);
begin_uniform_if_else(ctx, &ic_shader_query);
end_uniform_if(ctx, &ic_shader_query);
}
Temp ngg_gs_load_prim_flag_0(isel_context *ctx, Temp tid_in_tg, Temp max_vtxcnt, Temp vertex_lds_addr)
{
if_context ic;
Builder bld(ctx->program, ctx->block);
Temp is_vertex_emit_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_vtxcnt, tid_in_tg);
begin_divergent_if_then(ctx, &ic, is_vertex_emit_thread);
bld.reset(ctx->block);
Operand m = load_lds_size_m0(bld);
Temp prim_flag_0 = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset);
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
prim_flag_0 = bld.pseudo(aco_opcode::p_phi, bld.def(prim_flag_0.regClass()), Operand(prim_flag_0), Operand(0u));
return prim_flag_0;
}
void ngg_gs_setup_vertex_compaction(isel_context *ctx, Temp vertex_live, Temp tid_in_tg, Temp exporter_tid_in_tg)
{
if_context ic;
Builder bld(ctx->program, ctx->block);
assert(vertex_live.regClass() == bld.lm);
begin_divergent_if_then(ctx, &ic, vertex_live);
bld.reset(ctx->block);
/* Setup the vertex compaction.
* Save the current thread's id for the thread which will export the current vertex.
* We reuse stream 1 of the primitive flag of the other thread's vertex for storing this.
*/
Temp export_thread_lds_addr = ngg_gs_vertex_lds_addr(ctx, exporter_tid_in_tg);
tid_in_tg = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tid_in_tg, Operand(0u));
store_lds(ctx, 1u, tid_in_tg, 1u, export_thread_lds_addr, ctx->ngg_gs_primflags_offset + 1u, 1u);
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
bld.reset(ctx->block);
/* Wait for all waves to setup the vertex compaction. */
create_workgroup_barrier(bld);
}
void ngg_gs_export_primitives(isel_context *ctx, Temp max_prmcnt, Temp tid_in_tg, Temp exporter_tid_in_tg,
Temp prim_flag_0)
{
if_context ic;
Builder bld(ctx->program, ctx->block);
unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
assert(total_vtx_per_prim <= 3);
Temp is_prim_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_prmcnt, tid_in_tg);
begin_divergent_if_then(ctx, &ic, is_prim_export_thread);
bld.reset(ctx->block);
Temp is_null_prim = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(-1u), prim_flag_0);
Temp indices[3];
indices[total_vtx_per_prim - 1] = exporter_tid_in_tg;
if (total_vtx_per_prim >= 2)
indices[total_vtx_per_prim - 2] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(1u));
if (total_vtx_per_prim == 3)
indices[total_vtx_per_prim - 3] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(2u));
if (total_vtx_per_prim == 3) {
/* API GS outputs triangle strips, but NGG HW needs triangles.
* We already have triangles due to how we set the primitive flags, but we need to
 * make sure the vertex order is such that the front/back faces are correct, and the provoking vertex is kept.
*/
bool flatshade_first = !ctx->args->options->key.vs.provoking_vtx_last;
/* If the triangle is odd, this will swap its two non-provoking vertices. */
Temp is_odd = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), Operand(prim_flag_0), Operand(1u), Operand(1u));
if (flatshade_first) {
indices[1] = bld.vadd32(bld.def(v1), indices[1], Operand(is_odd));
indices[2] = bld.vsub32(bld.def(v1), indices[2], Operand(is_odd));
} else {
indices[0] = bld.vadd32(bld.def(v1), indices[0], Operand(is_odd));
indices[1] = bld.vsub32(bld.def(v1), indices[1], Operand(is_odd));
}
}
ngg_emit_prim_export(ctx, total_vtx_per_prim, indices, is_null_prim);
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
}
void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg, Temp vertex_lds_addr)
{
if_context ic;
Builder bld(ctx->program, ctx->block);
/* See if the current thread has to export a vertex. */
Temp is_vtx_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), wg_vtx_cnt, tid_in_tg);
begin_divergent_if_then(ctx, &ic, is_vtx_export_thread);
bld.reset(ctx->block);
/* The index of the vertex that the current thread will export. */
Temp exported_vtx_idx;
if (ctx->ngg_gs_early_alloc) {
/* No vertex compaction necessary, the thread can export its own vertex. */
exported_vtx_idx = tid_in_tg;
} else {
/* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */
Operand m = load_lds_size_m0(bld);
exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1);
}
/* Get the LDS address of the vertex that the current thread must export. */
Temp exported_vtx_addr = ngg_gs_vertex_lds_addr(ctx, exported_vtx_idx);
/* Read the vertex attributes from LDS. */
unsigned out_idx = 0;
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
if (ctx->program->info->gs.output_streams[i] != 0)
continue;
/* Set the output mask to the GS output usage mask. */
unsigned rdmask =
ctx->outputs.mask[i] =
ctx->program->info->gs.output_usage_mask[i];
if (!rdmask)
continue;
for (unsigned j = 0; j < 4; j++) {
if (rdmask & (1 << j))
ctx->outputs.temps[i * 4u + j] =
load_lds(ctx, 4u, bld.tmp(v1), exported_vtx_addr, out_idx * 4u, 4u);
out_idx++;
}
}
/* Export the vertex parameters. */
create_vs_exports(ctx);
begin_divergent_if_else(ctx, &ic);
end_divergent_if(ctx, &ic);
}
void ngg_gs_prelude(isel_context *ctx)
{
if (!ctx->ngg_gs_early_alloc)
return;
/* We know the GS writes the maximum possible number of vertices, so
* it's likely that most threads need to export a primitive, too.
* Thus, we won't have to worry about primitive compaction here.
*/
Temp num_max_vertices = ngg_max_vertex_count(ctx);
ngg_emit_wave0_sendmsg_gs_alloc_req(ctx, num_max_vertices, num_max_vertices);
}
void ngg_gs_finale(isel_context *ctx)
{
/* Sanity check. Make sure the vertex/primitive counts are set and the LDS is correctly initialized. */
assert(ctx->ngg_gs_known_vtxcnt[0]);
if_context ic;
Builder bld(ctx->program, ctx->block);
/* Wait for all waves to reach the epilogue. */
create_workgroup_barrier(bld);
/* Thread ID in the entire threadgroup */
Temp tid_in_tg = thread_id_in_threadgroup(ctx);
/* Number of threads that may need to export a vertex or primitive. */
Temp max_vtxcnt = ngg_max_vertex_count(ctx);
/* LDS address of the vertex corresponding to the current thread. */
Temp vertex_lds_addr = ngg_gs_vertex_lds_addr(ctx, tid_in_tg);
/* Primitive flag from stream 0 of the vertex corresponding to the current thread. */
Temp prim_flag_0 = ngg_gs_load_prim_flag_0(ctx, tid_in_tg, max_vtxcnt, vertex_lds_addr);
bld.reset(ctx->block);
/* NIR already filters out incomplete primitives and vertices,
* so any vertex whose primitive flag is non-zero is considered live/valid.
*/
Temp vertex_live = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(prim_flag_0));
/* Total number of vertices emitted by the workgroup. */
Temp wg_vtx_cnt;
/* ID of the thread which will export the current thread's vertex. */
Temp exporter_tid_in_tg;
if (ctx->ngg_gs_early_alloc) {
/* There is no need for a scan or vertex compaction, we know that
* the GS writes all possible vertices so each thread can export its own vertex.
*/
wg_vtx_cnt = max_vtxcnt;
exporter_tid_in_tg = tid_in_tg;
} else {
/* Perform a workgroup reduction and exclusive scan. */
std::pair<Temp, Temp> wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live);
bld.reset(ctx->block);
/* Total number of vertices emitted by the workgroup. */
wg_vtx_cnt = wg_scan.first;
/* ID of the thread which will export the current thread's vertex. */
exporter_tid_in_tg = wg_scan.second;
/* Skip all exports when possible. */
Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u));
max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports));
ngg_emit_wave0_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt);
ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg);
}
ngg_gs_export_primitives(ctx, max_vtxcnt, tid_in_tg, exporter_tid_in_tg, prim_flag_0);
ngg_gs_export_vertices(ctx, wg_vtx_cnt, tid_in_tg, vertex_lds_addr);
}
} /* end namespace */
void select_program(Program *program,
@ -11753,7 +11100,7 @@ void select_program(Program *program,
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
if_context ic_merged_wave_info;
bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
@ -11776,9 +11123,6 @@ void select_program(Program *program,
}
}
if (!i && ngg_gs)
ngg_gs_prelude(&ctx);
/* In a merged VS+TCS HS, the VS implementation can be completely empty. */
nir_function_impl *func = nir_shader_get_entrypoint(nir);
bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
@ -11787,14 +11131,8 @@ void select_program(Program *program,
(nir->info.stage == MESA_SHADER_TESS_EVAL &&
ctx.stage == tess_eval_geometry_gs));
bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader);
bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
if (i && ngg_gs) {
/* NGG GS waves need to wait for each other after the GS half is done. */
Builder bld(ctx.program, ctx.block);
create_workgroup_barrier(bld);
}
bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
if (check_merged_wave_info) {
Temp cond = merged_wave_info_to_mask(&ctx, i);
@ -11843,9 +11181,6 @@ void select_program(Program *program,
end_divergent_if(&ctx, &ic_merged_wave_info);
}
if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
ngg_gs_finale(&ctx);
if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
/* Outputs of the previous stage are inputs to the next stage */
ctx.inputs = ctx.outputs;

View file

@ -93,15 +93,7 @@ struct isel_context {
Temp persp_centroid, linear_centroid;
/* GS inputs */
bool ngg_gs_early_alloc = false;
bool ngg_gs_known_vtxcnt[4] = {false, false, false, false};
Temp gs_wave_id;
unsigned ngg_gs_emit_addr = 0;
unsigned ngg_gs_emit_vtx_bytes = 0;
unsigned ngg_gs_scratch_addr = 0;
unsigned ngg_gs_primflags_offset = 0;
int ngg_gs_const_vtxcnt[4];
int ngg_gs_const_prmcnt[4];
/* VS output information */
bool export_clip_dists;

View file

@ -407,28 +407,7 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir)
setup_vs_output_info(ctx, nir, false,
ctx->options->key.vs_common_out.export_clip_dists, outinfo);
unsigned ngg_gs_scratch_bytes = ctx->args->shader_info->so.num_outputs ? (44u * 4u) : (8u * 4u);
unsigned ngg_emit_bytes = ctx->args->shader_info->ngg_info.ngg_emit_size * 4u;
unsigned esgs_ring_bytes = ctx->args->shader_info->ngg_info.esgs_ring_size;
ctx->ngg_gs_primflags_offset = ctx->args->shader_info->gs.gsvs_vertex_size;
ctx->ngg_gs_emit_vtx_bytes = ctx->ngg_gs_primflags_offset + 4u;
ctx->ngg_gs_emit_addr = esgs_ring_bytes;
ctx->ngg_gs_scratch_addr = ctx->ngg_gs_emit_addr + ngg_emit_bytes;
ctx->ngg_gs_scratch_addr = ALIGN(ctx->ngg_gs_scratch_addr, 16u);
unsigned total_lds_bytes = ctx->ngg_gs_scratch_addr + ngg_gs_scratch_bytes;
assert(total_lds_bytes >= ctx->ngg_gs_emit_addr);
assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr);
ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->dev.lds_encoding_granule);
/* Make sure we have enough room for emitted GS vertices */
if (nir->info.gs.vertices_out)
assert((ngg_emit_bytes % (ctx->ngg_gs_emit_vtx_bytes * nir->info.gs.vertices_out)) == 0);
/* See if the number of vertices and primitives are compile-time known */
nir_gs_count_vertices_and_primitives(nir, ctx->ngg_gs_const_vtxcnt, ctx->ngg_gs_const_prmcnt, 4u);
ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
}
if (ctx->stage.has(SWStage::VS))

View file

@ -862,11 +862,16 @@ bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has
info->is_ngg_passthrough = out_conf.passthrough;
key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
if (!key->vs_common_out.as_ngg)
if (!info->is_ngg)
return false;
/* TODO: lower NGG GS in NIR */
return false;
ac_nir_lower_ngg_gs(
nir, info->wave_size, max_workgroup_size,
info->ngg_info.esgs_ring_size,
info->gs.gsvs_vertex_size,
info->ngg_info.ngg_emit_size * 4u,
key->vs.provoking_vtx_last);
return true;
} else {
return false;
}