mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 21:50:12 +01:00
gallivm: use f16c hw support for float->half and half->float conversion
This should be considerably faster on CPUs that support F16C (AMD Bulldozer and Jaguar cores, and Intel Ivy Bridge and later, except budget models). Passes piglit fbo-blending-formats GL_ARB_texture_float -auto on Ivy Bridge. Reviewed-by: Brian Paul <brianp@vmware.com>
This commit is contained in:
parent
302df7cc85
commit
067a0ae420
4 changed files with 53 additions and 4 deletions
|
|
@ -175,9 +175,24 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
|
|||
struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
|
||||
struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
|
||||
LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
|
||||
LLVMValueRef h;
|
||||
|
||||
if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
|
||||
(src_length == 4 || src_length == 8)) {
|
||||
const char *intrinsic = NULL;
|
||||
if (src_length == 4) {
|
||||
src = lp_build_pad_vector(gallivm, src, 8);
|
||||
intrinsic = "llvm.x86.vcvtph2ps.128";
|
||||
}
|
||||
else {
|
||||
intrinsic = "llvm.x86.vcvtph2ps.256";
|
||||
}
|
||||
return lp_build_intrinsic_unary(builder, intrinsic,
|
||||
lp_build_vec_type(gallivm, f32_type), src);
|
||||
}
|
||||
|
||||
/* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
|
||||
LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");
|
||||
h = LLVMBuildZExt(builder, src, int_vec_type, "");
|
||||
return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
|
||||
}
|
||||
|
||||
|
|
@ -204,9 +219,31 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
|
|||
struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
|
||||
LLVMValueRef result;
|
||||
|
||||
result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
|
||||
/* Convert int32 vector to int16 vector by trunc (might generate bad code) */
|
||||
result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
|
||||
if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
|
||||
(length == 4 || length == 8)) {
|
||||
struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
|
||||
unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
|
||||
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
|
||||
const char *intrinsic = NULL;
|
||||
if (length == 4) {
|
||||
intrinsic = "llvm.x86.vcvtps2ph.128";
|
||||
}
|
||||
else {
|
||||
intrinsic = "llvm.x86.vcvtps2ph.256";
|
||||
}
|
||||
result = lp_build_intrinsic_binary(builder, intrinsic,
|
||||
lp_build_vec_type(gallivm, i168_type),
|
||||
src, LLVMConstInt(i32t, mode, 0));
|
||||
if (length == 4) {
|
||||
result = lp_build_extract_range(gallivm, result, 0, 4);
|
||||
}
|
||||
}
|
||||
|
||||
else {
|
||||
result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
|
||||
/* Convert int32 vector to int16 vector by trunc (might generate bad code) */
|
||||
result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
|
||||
}
|
||||
|
||||
/*
|
||||
* Debugging code.
|
||||
|
|
|
|||
|
|
@ -468,6 +468,15 @@ lp_build_init(void)
|
|||
util_cpu_caps.has_avx = 0;
|
||||
}
|
||||
|
||||
if (!HAVE_AVX) {
|
||||
/*
|
||||
* note these instructions are VEX-only, so can only emit if we use
|
||||
* avx (don't want to base it on has_avx & has_f16c later as that would
|
||||
* omit it unnecessarily on amd cpus, see above).
|
||||
*/
|
||||
util_cpu_caps.has_f16c = 0;
|
||||
}
|
||||
|
||||
#ifdef PIPE_ARCH_PPC_64
|
||||
/* Set the NJ bit in VSCR to 0 so denormalized values are handled as
|
||||
* specified by IEEE standard (PowerISA 2.06 - Section 6.3). This guarantees
|
||||
|
|
@ -495,6 +504,7 @@ lp_build_init(void)
|
|||
util_cpu_caps.has_ssse3 = 0;
|
||||
util_cpu_caps.has_sse4_1 = 0;
|
||||
util_cpu_caps.has_avx = 0;
|
||||
util_cpu_caps.has_f16c = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -279,6 +279,7 @@ util_cpu_detect(void)
|
|||
util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
|
||||
util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
|
||||
util_cpu_caps.has_avx = (regs2[2] >> 28) & 1;
|
||||
util_cpu_caps.has_f16c = (regs2[2] >> 29) & 1;
|
||||
util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
|
||||
|
||||
cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
|
||||
|
|
|
|||
|
|
@ -63,6 +63,7 @@ struct util_cpu_caps {
|
|||
unsigned has_sse4_1:1;
|
||||
unsigned has_sse4_2:1;
|
||||
unsigned has_avx:1;
|
||||
unsigned has_f16c:1;
|
||||
unsigned has_3dnow:1;
|
||||
unsigned has_3dnow_ext:1;
|
||||
unsigned has_altivec:1;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue