aco,ac/llvm,radeonsi: lower f2f16 to f2f16_rtz in nir

No need to handle f2f16 specially for OpenGL, and we can vectorize
f2f16 when using ACO.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25990>
This commit is contained in:
Qiang Yu 2023-11-08 10:54:55 +08:00 committed by Marge Bot
parent 7e4aac46ad
commit dbbf566588
3 changed files with 31 additions and 35 deletions

View file

@@ -444,7 +444,8 @@ aco_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu)
    case nir_op_f2f16: {
       nir_shader* shader = nir_cf_node_get_function(&alu->instr.block->cf_node)->function->shader;
       unsigned execution_mode = shader->info.float_controls_execution_mode;
-      return nir_is_rounding_mode_rtz(execution_mode, 16);
+      return (shader->options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(execution_mode, 16)) ||
+             nir_is_rounding_mode_rtz(execution_mode, 16);
    }
    case nir_op_fadd:
    case nir_op_fsub:

View file

@@ -928,45 +928,31 @@ static bool visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
    case nir_op_u2f64:
       result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
       break;
-   case nir_op_f2f16_rtz:
-   case nir_op_f2f16:
+   case nir_op_f2f16_rtz: {
       src[0] = ac_to_float(&ctx->ac, src[0]);
 
-      /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it,
-       * all f32->f16 conversions have to round towards zero, because both scalar
-       * and vec2 down-conversions have to round equally.
+      if (LLVMTypeOf(src[0]) == ctx->ac.f64)
+         src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
+
+      /* Fast path conversion. This only works if NIR is vectorized
+       * to vec2 16.
        */
-      if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL || instr->op == nir_op_f2f16_rtz) {
-         src[0] = ac_to_float(&ctx->ac, src[0]);
-
-         if (LLVMTypeOf(src[0]) == ctx->ac.f64)
-            src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
-
-         /* Fast path conversion. This only works if NIR is vectorized
-          * to vec2 16.
-          */
-         if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
-            LLVMValueRef args[] = {
-               ac_llvm_extract_elem(&ctx->ac, src[0], 0),
-               ac_llvm_extract_elem(&ctx->ac, src[0], 1),
-            };
-            result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
-            break;
-         }
-         assert(ac_get_llvm_num_components(src[0]) == 1);
-         LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)};
-         result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
-         result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
-      } else {
-         if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
-            result =
-               LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
-         else
-            result =
-               LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+      if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
+         LLVMValueRef args[] = {
+            ac_llvm_extract_elem(&ctx->ac, src[0], 0),
+            ac_llvm_extract_elem(&ctx->ac, src[0], 1),
+         };
+         result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
+         break;
       }
+
+      assert(ac_get_llvm_num_components(src[0]) == 1);
+      LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)};
+      result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
+      result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
       break;
+   }
+   case nir_op_f2f16:
    case nir_op_f2f16_rtne:
    case nir_op_f2f32:
    case nir_op_f2f64:

View file

@@ -1381,6 +1381,15 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
          nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 |
          nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64 |
          nir_lower_iadd_sat64 | nir_lower_conv64,
+      /* For OpenGL, rounding mode is undefined. We want fast packing with v_cvt_pkrtz_f16,
+       * but if we use it, all f32->f16 conversions have to round towards zero,
+       * because both scalar and vec2 down-conversions have to round equally.
+       *
+       * For OpenCL, rounding mode is explicit. This will only lower f2f16 to f2f16_rtz
+       * when execution mode is rtz instead of rtne.
+       */
+      .force_f2f16_rtz = true,
    };
 
    *sscreen->nir_options = nir_options;
 }