amd: add offset to is_subgroup_invocation_lt_amd

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31184>
This commit is contained in:
Georg Lehmann 2024-09-15 10:19:05 +02:00 committed by Marge Bot
parent 29207d83a7
commit bcfc5c09fa
3 changed files with 22 additions and 14 deletions

View file

@ -8178,7 +8178,7 @@ emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
}
Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
Temp lanecount_to_mask(isel_context* ctx, Temp count);
Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset);
void pops_await_overlapped_waves(isel_context* ctx);
Temp
@ -9316,7 +9316,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
}
case nir_intrinsic_is_subgroup_invocation_lt_amd: {
Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src));
unsigned offset = nir_intrinsic_base(instr);
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src, offset));
break;
}
case nir_intrinsic_gds_atomic_add_amd: {
@ -11633,11 +11634,17 @@ finish_program(isel_context* ctx)
}
Temp
lanecount_to_mask(isel_context* ctx, Temp count)
lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset)
{
assert(count.regClass() == s1);
Builder bld(ctx->program, ctx->block);
if (bit_offset) {
count = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), count,
Operand::c32(bit_offset));
}
Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
Temp cond;
@ -11659,14 +11666,12 @@ lanecount_to_mask(isel_context* ctx, Temp count)
Temp
merged_wave_info_to_mask(isel_context* ctx, unsigned i)
{
Builder bld(ctx->program, ctx->block);
/* lanecount_to_mask() only cares about s0.byte[i].[6:0],
* so we need neither s_bfe nor s_and here.
*/
Temp count = get_arg(ctx, ctx->args->merged_wave_info);
/* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */
Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info)
: bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u));
return lanecount_to_mask(ctx, count);
return lanecount_to_mask(ctx, count, i * 8u);
}
static void

View file

@ -3466,8 +3466,11 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
break;
}
case nir_intrinsic_is_subgroup_invocation_lt_amd: {
LLVMValueRef count = LLVMBuildAnd(ctx->ac.builder, get_src(ctx, instr->src[0]),
LLVMConstInt(ctx->ac.i32, 0xff, 0), "");
unsigned offset = nir_intrinsic_base(instr);
LLVMValueRef count = get_src(ctx, instr->src[0]);
if (offset)
count = LLVMBuildLShr(ctx->ac.builder, count, LLVMConstInt(ctx->ac.i32, offset, 0), "");
count = LLVMBuildAnd(ctx->ac.builder, count, LLVMConstInt(ctx->ac.i32, 0xff, 0), "");
result = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
break;
}

View file

@ -1603,8 +1603,8 @@ system_value("streamout_offset_amd", 1, indices=[BASE])
# AMD merged shader intrinsics
# Whether the current invocation index in the subgroup is less than the source. The source must be
# subgroup uniform and bits 0-7 must be less than or equal to the wave size.
intrinsic("is_subgroup_invocation_lt_amd", src_comp=[1], dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
# subgroup uniform and the value held in the 8 bits starting at bit BASE must be less than or equal to the wave size.
intrinsic("is_subgroup_invocation_lt_amd", src_comp=[1], dest_comp=1, bit_sizes=[1], indices=[BASE], flags=[CAN_ELIMINATE])
# AMD NGG intrinsics