2012-11-19 15:15:32 -08:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2012 Intel Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
|
|
|
* DEALINGS IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "ir.h"
|
|
|
|
|
#include "ir_builder.h"
|
|
|
|
|
#include "ir_optimization.h"
|
|
|
|
|
#include "ir_rvalue_visitor.h"
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
using namespace ir_builder;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* A visitor that lowers built-in floating-point pack/unpack expressions
|
|
|
|
|
* such as packSnorm2x16.
|
|
|
|
|
*/
|
|
|
|
|
class lower_packing_builtins_visitor : public ir_rvalue_visitor {
|
|
|
|
|
public:
|
|
|
|
|
/**
|
|
|
|
|
* \param op_mask is a bitmask of `enum lower_packing_builtins_op`
|
|
|
|
|
*/
|
|
|
|
|
explicit lower_packing_builtins_visitor(int op_mask)
|
|
|
|
|
: op_mask(op_mask),
|
|
|
|
|
progress(false)
|
|
|
|
|
{
|
|
|
|
|
/* Mutually exclusive options. */
|
|
|
|
|
assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
|
|
|
|
|
(op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
|
|
|
|
|
|
|
|
|
|
assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
|
|
|
|
|
(op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
|
|
|
|
|
|
|
|
|
|
factory.instructions = &factory_instructions;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * By the time the visitor is destroyed, no staged instructions may be left
 * behind in \c factory_instructions.
 */
virtual ~lower_packing_builtins_visitor()
{
   assert(factory_instructions.is_empty());
}
|
|
|
|
|
|
|
|
|
|
/**
 * \return true once handle_rvalue() has lowered at least one expression.
 */
bool get_progress()
{
   return progress;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Replace \a *rvalue with lowered IR when it is a pack/unpack expression
 * selected by \c op_mask; otherwise leave it untouched.
 */
void handle_rvalue(ir_rvalue **rvalue)
{
   /* Only (non-null) expression nodes are candidates for lowering. */
   ir_expression *const expr = *rvalue ? (*rvalue)->as_expression() : NULL;
   if (expr == NULL)
      return;

   const enum lower_packing_builtins_op lowering_op =
      choose_lowering_op(expr->operation);
   if (lowering_op == LOWER_PACK_UNPACK_NONE)
      return;

   setup_factory(ralloc_parent(expr));

   /* Hand the expression's sole operand over to the factory's context. */
   ir_rvalue *const src = expr->operands[0];
   ralloc_steal(factory.mem_ctx, src);

   /* Starts out as the current value so that the unreachable cases below
    * leave *rvalue unchanged.
    */
   ir_rvalue *lowered = *rvalue;

   switch (lowering_op) {
   case LOWER_PACK_SNORM_2x16:
      lowered = lower_pack_snorm_2x16(src);
      break;
   case LOWER_PACK_SNORM_4x8:
      lowered = lower_pack_snorm_4x8(src);
      break;
   case LOWER_PACK_UNORM_2x16:
      lowered = lower_pack_unorm_2x16(src);
      break;
   case LOWER_PACK_UNORM_4x8:
      lowered = lower_pack_unorm_4x8(src);
      break;
   case LOWER_PACK_HALF_2x16:
      lowered = lower_pack_half_2x16(src);
      break;
   case LOWER_PACK_HALF_2x16_TO_SPLIT:
      lowered = split_pack_half_2x16(src);
      break;
   case LOWER_UNPACK_SNORM_2x16:
      lowered = lower_unpack_snorm_2x16(src);
      break;
   case LOWER_UNPACK_SNORM_4x8:
      lowered = lower_unpack_snorm_4x8(src);
      break;
   case LOWER_UNPACK_UNORM_2x16:
      lowered = lower_unpack_unorm_2x16(src);
      break;
   case LOWER_UNPACK_UNORM_4x8:
      lowered = lower_unpack_unorm_4x8(src);
      break;
   case LOWER_UNPACK_HALF_2x16:
      lowered = lower_unpack_half_2x16(src);
      break;
   case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
      lowered = split_unpack_half_2x16(src);
      break;
   case LOWER_PACK_UNPACK_NONE:
   case LOWER_PACK_USE_BFI:
   case LOWER_PACK_USE_BFE:
      /* choose_lowering_op() never selects these. */
      assert(!"not reached");
      break;
   }

   *rvalue = lowered;

   teardown_factory();
   progress = true;
}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
/* Bitmask of `enum lower_packing_builtins_op` flags given to the constructor. */
const int op_mask;
|
|
|
|
|
/* Starts out false; handle_rvalue() sets it once an expression is lowered. */
bool progress;
|
|
|
|
|
/* IR builder used by the lowering helpers; it emits into factory_instructions. */
ir_factory factory;
|
|
|
|
|
/* Staging list for emitted instructions; expected to be empty between lowerings. */
exec_list factory_instructions;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Determine the needed lowering operation by filtering \a expr_op
|
|
|
|
|
* through \ref op_mask.
|
|
|
|
|
*/
|
|
|
|
|
enum lower_packing_builtins_op
|
|
|
|
|
choose_lowering_op(ir_expression_operation expr_op)
|
|
|
|
|
{
|
|
|
|
|
/* C++ regards int and enum as fundamentally different types.
|
|
|
|
|
* So, we can't simply return from each case; we must cast the return
|
|
|
|
|
* value.
|
|
|
|
|
*/
|
|
|
|
|
int result;
|
|
|
|
|
|
|
|
|
|
switch (expr_op) {
|
|
|
|
|
case ir_unop_pack_snorm_2x16:
|
|
|
|
|
result = op_mask & LOWER_PACK_SNORM_2x16;
|
|
|
|
|
break;
|
2013-01-21 15:31:00 -08:00
|
|
|
case ir_unop_pack_snorm_4x8:
|
|
|
|
|
result = op_mask & LOWER_PACK_SNORM_4x8;
|
|
|
|
|
break;
|
2012-11-19 15:15:32 -08:00
|
|
|
case ir_unop_pack_unorm_2x16:
|
|
|
|
|
result = op_mask & LOWER_PACK_UNORM_2x16;
|
|
|
|
|
break;
|
2013-01-21 15:31:00 -08:00
|
|
|
case ir_unop_pack_unorm_4x8:
|
|
|
|
|
result = op_mask & LOWER_PACK_UNORM_4x8;
|
|
|
|
|
break;
|
2012-11-19 15:15:32 -08:00
|
|
|
case ir_unop_pack_half_2x16:
|
|
|
|
|
result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
|
|
|
|
|
break;
|
|
|
|
|
case ir_unop_unpack_snorm_2x16:
|
|
|
|
|
result = op_mask & LOWER_UNPACK_SNORM_2x16;
|
|
|
|
|
break;
|
2013-01-21 15:31:00 -08:00
|
|
|
case ir_unop_unpack_snorm_4x8:
|
|
|
|
|
result = op_mask & LOWER_UNPACK_SNORM_4x8;
|
|
|
|
|
break;
|
2012-11-19 15:15:32 -08:00
|
|
|
case ir_unop_unpack_unorm_2x16:
|
|
|
|
|
result = op_mask & LOWER_UNPACK_UNORM_2x16;
|
|
|
|
|
break;
|
2013-01-21 15:31:00 -08:00
|
|
|
case ir_unop_unpack_unorm_4x8:
|
|
|
|
|
result = op_mask & LOWER_UNPACK_UNORM_4x8;
|
|
|
|
|
break;
|
2012-11-19 15:15:32 -08:00
|
|
|
case ir_unop_unpack_half_2x16:
|
|
|
|
|
result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
result = LOWER_PACK_UNPACK_NONE;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return static_cast<enum lower_packing_builtins_op>(result);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
setup_factory(void *mem_ctx)
|
|
|
|
|
{
|
|
|
|
|
assert(factory.mem_ctx == NULL);
|
|
|
|
|
assert(factory.instructions->is_empty());
|
|
|
|
|
|
|
|
|
|
factory.mem_ctx = mem_ctx;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
teardown_factory()
|
|
|
|
|
{
|
|
|
|
|
base_ir->insert_before(factory.instructions);
|
|
|
|
|
assert(factory.instructions->is_empty());
|
|
|
|
|
factory.mem_ctx = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
ir_constant*
|
|
|
|
|
constant(T x)
|
|
|
|
|
{
|
|
|
|
|
return factory.constant(x);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \brief Pack two uint16's into a single uint32.
|
|
|
|
|
*
|
|
|
|
|
* Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
|
|
|
|
|
* where the least significant bits specify the first element of the pair.
|
|
|
|
|
* Return the uint32.
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
{
   assert(uvec2_rval->type == glsl_type::uvec2_type);

   /* Copy the operand into a temporary: uvec2 u = UVEC2_RVAL; */
   ir_variable *const u = factory.make_temp(glsl_type::uvec2_type,
                                            "tmp_pack_uvec2_to_uint");
   factory.emit(assign(u, uvec2_rval));

   /* Both strategies start from the masked low half: u.x & 0xffff */
   ir_rvalue *const low_half = bit_and(swizzle_x(u), constant(0xffffu));

   if (!(op_mask & LOWER_PACK_USE_BFI)) {
      /* return (u.y << 16) | (u.x & 0xffff); */
      return bit_or(lshift(swizzle_y(u), constant(16u)), low_half);
   }

   /* With BFI enabled, place u.y into the upper 16 bits using a single
    * bitfield_insert on top of the low half.
    */
   return bitfield_insert(low_half, swizzle_y(u),
                          constant(16u), constant(16u));
}
|
|
|
|
|
|
2013-01-21 15:31:00 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Pack four uint8's into a single uint32.
|
|
|
|
|
*
|
|
|
|
|
* Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
|
|
|
|
|
* uint32 where the least significant bits specify the first element of the
|
|
|
|
|
* 4-tuple. Return the uint32.
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
{
   assert(uvec4_rval->type == glsl_type::uvec4_type);

   ir_variable *const u = factory.make_temp(glsl_type::uvec4_type,
                                            "tmp_pack_uvec4_to_uint");

   if (!(op_mask & LOWER_PACK_USE_BFI)) {
      /* uvec4 u = UVEC4_RVAL & 0xff */
      factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));

      /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
      ir_rvalue *const upper = bit_or(lshift(swizzle_w(u), constant(24u)),
                                      lshift(swizzle_z(u), constant(16u)));
      ir_rvalue *const lower = bit_or(lshift(swizzle_y(u), constant(8u)),
                                      swizzle_x(u));
      return bit_or(upper, lower);
   }

   /* uvec4 u = UVEC4_RVAL; */
   factory.emit(assign(u, uvec4_rval));

   /* Start from the masked x byte, then insert y, z and w on top of it, one
    * bitfield_insert per component (offsets 8, 16 and 24; 8 bits each).
    */
   ir_rvalue *packed = bit_and(swizzle_x(u), constant(0xffu));
   packed = bitfield_insert(packed, swizzle_y(u), constant(8u), constant(8u));
   packed = bitfield_insert(packed, swizzle_z(u), constant(16u), constant(8u));
   packed = bitfield_insert(packed, swizzle_w(u), constant(24u), constant(8u));
   return packed;
}
|
|
|
|
|
|
2012-11-19 15:15:32 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Unpack a uint32 into two uint16's.
|
|
|
|
|
*
|
|
|
|
|
* Interpret the given uint32 as a uint16 pair where the uint32's least
|
|
|
|
|
* significant bits specify the pair's first element. Return the uint16
|
|
|
|
|
* pair as a uvec2.
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
unpack_uint_to_uvec2(ir_rvalue *uint_rval)
{
   assert(uint_rval->type == glsl_type::uint_type);

   /* uint u = UINT_RVAL; */
   ir_variable *const packed =
      factory.make_temp(glsl_type::uint_type, "tmp_unpack_uint_to_uvec2_u");
   factory.emit(assign(packed, uint_rval));

   /* uvec2 u2; */
   ir_variable *const halves =
      factory.make_temp(glsl_type::uvec2_type, "tmp_unpack_uint_to_uvec2_u2");

   /* u2.x = u & 0xffffu; */
   ir_rvalue *const low = bit_and(packed, constant(0xffffu));
   factory.emit(assign(halves, low, WRITEMASK_X));

   /* u2.y = u >> 16u; */
   ir_rvalue *const high = rshift(packed, constant(16u));
   factory.emit(assign(halves, high, WRITEMASK_Y));

   return deref(halves).val;
}
|
|
|
|
|
|
2015-08-20 21:55:52 -04:00
|
|
|
/**
|
|
|
|
|
* \brief Unpack a uint32 into two int16's.
|
|
|
|
|
*
|
|
|
|
|
* Specifically each 16-bit value is sign-extended to the full width of an
|
|
|
|
|
* int32 on return.
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue *
unpack_uint_to_ivec2(ir_rvalue *uint_rval)
{
   assert(uint_rval->type == glsl_type::uint_type);

   if (op_mask & LOWER_PACK_USE_BFE) {
      /* int i = int(UINT_RVAL); */
      ir_variable *const bits =
         factory.make_temp(glsl_type::int_type, "tmp_unpack_uint_to_ivec2_i");
      factory.emit(assign(bits, u2i(uint_rval)));

      /* ivec2 i2; */
      ir_variable *const halves =
         factory.make_temp(glsl_type::ivec2_type,
                           "tmp_unpack_uint_to_ivec2_i2");

      /* i2.x = bitfield_extract(i, 0, 16); */
      ir_rvalue *const low = bitfield_extract(bits, constant(0), constant(16));
      factory.emit(assign(halves, low, WRITEMASK_X));

      /* i2.y = bitfield_extract(i, 16, 16); */
      ir_rvalue *const high =
         bitfield_extract(bits, constant(16), constant(16));
      factory.emit(assign(halves, high, WRITEMASK_Y));

      return deref(halves).val;
   }

   /* Without BFE: unpack as unsigned, reinterpret as signed, then shift
    * left and back right by 16 to sign-extend each component:
    *
    *    (ivec2(unpack_uint_to_uvec2(UINT_RVAL)) << 16u) >> 16u
    */
   ir_rvalue *const as_signed = u2i(unpack_uint_to_uvec2(uint_rval));
   ir_rvalue *const shifted_up = lshift(as_signed, constant(16u));
   return rshift(shifted_up, constant(16u));
}
|
|
|
|
|
|
2013-01-21 15:31:00 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Unpack a uint32 into four uint8's.
|
|
|
|
|
*
|
|
|
|
|
* Interpret the given uint32 as a uint8 4-tuple where the uint32's least
|
|
|
|
|
* significant bits specify the 4-tuple's first element. Return the uint8
|
|
|
|
|
* 4-tuple as a uvec4.
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
unpack_uint_to_uvec4(ir_rvalue *uint_rval)
{
   assert(uint_rval->type == glsl_type::uint_type);

   /* uint u = UINT_RVAL; */
   ir_variable *const packed =
      factory.make_temp(glsl_type::uint_type, "tmp_unpack_uint_to_uvec4_u");
   factory.emit(assign(packed, uint_rval));

   /* uvec4 u4; */
   ir_variable *const bytes =
      factory.make_temp(glsl_type::uvec4_type, "tmp_unpack_uint_to_uvec4_u4");

   /* u4.x = u & 0xffu; */
   factory.emit(assign(bytes, bit_and(packed, constant(0xffu)), WRITEMASK_X));

   /* The two middle bytes are handled alike; only the bit offset and the
    * destination component differ.
    */
   const bool use_bfe = (op_mask & LOWER_PACK_USE_BFE) != 0;
   const unsigned mid_offsets[2] = { 8u, 16u };
   const int mid_writemasks[2] = { WRITEMASK_Y, WRITEMASK_Z };

   for (int k = 0; k < 2; k++) {
      ir_rvalue *byte;

      if (use_bfe) {
         /* u4.{y,z} = bitfield_extract(u, {8,16}, 8); */
         byte = bitfield_extract(packed, constant(mid_offsets[k]),
                                 constant(8u));
      } else {
         /* u4.{y,z} = (u >> {8,16}u) & 0xffu; */
         byte = bit_and(rshift(packed, constant(mid_offsets[k])),
                        constant(0xffu));
      }

      factory.emit(assign(bytes, byte, mid_writemasks[k]));
   }

   /* u4.w = (u >> 24u) */
   factory.emit(assign(bytes, rshift(packed, constant(24u)), WRITEMASK_W));

   return deref(bytes).val;
}
|
|
|
|
|
|
2015-08-20 21:55:52 -04:00
|
|
|
/**
|
|
|
|
|
* \brief Unpack a uint32 into four int8's.
|
|
|
|
|
*
|
|
|
|
|
* Specifically each 8-bit value is sign-extended to the full width of an
|
|
|
|
|
* int32 on return.
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue *
unpack_uint_to_ivec4(ir_rvalue *uint_rval)
{
   assert(uint_rval->type == glsl_type::uint_type);

   if (op_mask & LOWER_PACK_USE_BFE) {
      /* int i = int(UINT_RVAL); */
      ir_variable *const bits =
         factory.make_temp(glsl_type::int_type, "tmp_unpack_uint_to_ivec4_i");
      factory.emit(assign(bits, u2i(uint_rval)));

      /* ivec4 i4; */
      ir_variable *const bytes =
         factory.make_temp(glsl_type::ivec4_type,
                           "tmp_unpack_uint_to_ivec4_i4");

      /* i4.{x,y,z,w} = bitfield_extract(i, {0,8,16,24}, 8); */
      const int writemasks[4] = {
         WRITEMASK_X, WRITEMASK_Y, WRITEMASK_Z, WRITEMASK_W
      };
      for (int c = 0; c < 4; c++) {
         ir_rvalue *const byte =
            bitfield_extract(bits, constant(8 * c), constant(8));
         factory.emit(assign(bytes, byte, writemasks[c]));
      }

      return deref(bytes).val;
   }

   /* Without BFE: unpack as unsigned, reinterpret as signed, then shift
    * left and back right by 24 to sign-extend each component:
    *
    *    (ivec4(unpack_uint_to_uvec4(UINT_RVAL)) << 24u) >> 24u
    */
   ir_rvalue *const as_signed = u2i(unpack_uint_to_uvec4(uint_rval));
   ir_rvalue *const shifted_up = lshift(as_signed, constant(24u));
   return rshift(shifted_up, constant(24u));
}
|
|
|
|
|
|
2012-11-19 15:15:32 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower a packSnorm2x16 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param vec2_rval is packSnorm2x16's input
|
|
|
|
|
* \return packSnorm2x16's output as a uint rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
{
   /* GLSL ES 3.00 spec, page 88 (94 of pdf):
    *
    *    highp uint packSnorm2x16(vec2 v)
    *    --------------------------------
    *    First, converts each component of the normalized floating-point value
    *    v into 16-bit integer values. Then, the results are packed into the
    *    returned 32-bit unsigned integer.
    *
    *    The conversion for component c of v to fixed point is done as
    *    follows:
    *
    *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
    *
    *    The first component of the vector will be written to the least
    *    significant bits of the output; the last component will be written to
    *    the most significant bits.
    *
    * The IR built below approximates this pseudo-GLSL:
    *
    *     return pack_uvec2_to_uint(
    *         uvec2(ivec2(
    *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
    *
    * The detour through ivec2 is required. Page 56 (62 of pdf) of the same
    * spec says: "It is undefined to convert a negative floating point value
    * to an uint", so a direct vec2 -> uvec2 conversion is not usable here.
    */
   assert(vec2_rval->type == glsl_type::vec2_type);

   ir_rvalue *const clamped =
      clamp(vec2_rval, constant(-1.0f), constant(1.0f));
   ir_rvalue *const scaled = mul(clamped, constant(32767.0f));
   ir_rvalue *const as_signed = f2i(round_even(scaled));
   ir_rvalue *const result = pack_uvec2_to_uint(i2u(as_signed));

   assert(result->type == glsl_type::uint_type);
   return result;
}
|
|
|
|
|
|
2013-01-21 15:31:00 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower a packSnorm4x8 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param vec4_rval is packSnorm4x8's input
|
|
|
|
|
* \return packSnorm4x8's output as a uint rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
{
   /* GLSL 4.30 spec, page 137 (143 of pdf):
    *
    *    highp uint packSnorm4x8(vec4 v)
    *    -------------------------------
    *    First, converts each component of the normalized floating-point value
    *    v into 8-bit integer values. Then, the results are packed into the
    *    returned 32-bit unsigned integer.
    *
    *    The conversion for component c of v to fixed point is done as
    *    follows:
    *
    *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
    *
    *    The first component of the vector will be written to the least
    *    significant bits of the output; the last component will be written to
    *    the most significant bits.
    *
    * The IR built below approximates this pseudo-GLSL:
    *
    *     return pack_uvec4_to_uint(
    *         uvec4(ivec4(
    *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
    *
    * The detour through ivec4 is required. Page 87 (93 of pdf) of the same
    * spec says: "It is undefined to convert a negative floating point value
    * to an uint", so a direct vec4 -> uvec4 conversion is not usable here.
    */
   assert(vec4_rval->type == glsl_type::vec4_type);

   ir_rvalue *const clamped =
      clamp(vec4_rval, constant(-1.0f), constant(1.0f));
   ir_rvalue *const scaled = mul(clamped, constant(127.0f));
   ir_rvalue *const as_signed = f2i(round_even(scaled));
   ir_rvalue *const result = pack_uvec4_to_uint(i2u(as_signed));

   assert(result->type == glsl_type::uint_type);
   return result;
}
|
|
|
|
|
|
2012-11-19 15:15:32 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower an unpackSnorm2x16 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param uint_rval is unpackSnorm2x16's input
|
|
|
|
|
* \return unpackSnorm2x16's output as a vec2 rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
{
   /* GLSL ES 3.00 spec, page 88 (94 of pdf):
    *
    *    highp vec2 unpackSnorm2x16 (highp uint p)
    *    -----------------------------------------
    *    First, unpacks a single 32-bit unsigned integer p into a pair of
    *    16-bit unsigned integers. Then, each component is converted to
    *    a normalized floating-point value to generate the returned
    *    two-component vector.
    *
    *    The conversion for unpacked fixed-point value f to floating point is
    *    done as follows:
    *
    *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
    *
    *    The first component of the returned vector will be extracted from the
    *    least significant bits of the input; the last component will be
    *    extracted from the most significant bits.
    *
    * The IR built below approximates this pseudo-GLSL:
    *
    *    return clamp(
    *       vec2(unpack_uint_to_ivec2(UINT_RVALUE)) / 32767.0f,
    *       -1.0f, 1.0f);
    *
    * The 16-bit halves have to be sign-extended to int32 before the
    * conversion to float, otherwise negative values unpack incorrectly;
    * unpack_uint_to_ivec2() takes care of that.
    *
    * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
    * packSnorm2x16 encodes -1.0 as the int16 0xffff. Placed as-is into an
    * int32, that becomes the *positive* integer 0x0000ffff: the int16's sign
    * bit ends up as the rather unimportant bit 16 and must be propagated
    * into bits 17-32.
    */
   assert(uint_rval->type == glsl_type::uint_type);

   ir_rvalue *const as_float = i2f(unpack_uint_to_ivec2(uint_rval));
   ir_rvalue *const scaled = div(as_float, constant(32767.0f));
   ir_rvalue *const result =
      clamp(scaled, constant(-1.0f), constant(1.0f));

   assert(result->type == glsl_type::vec2_type);
   return result;
}
|
|
|
|
|
|
2013-01-21 15:31:00 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower an unpackSnorm4x8 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param uint_rval is unpackSnorm4x8's input
|
|
|
|
|
* \return unpackSnorm4x8's output as a vec4 rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
{
   /* GLSL 4.30 spec, page 137 (143 of pdf):
    *
    *    highp vec4 unpackSnorm4x8 (highp uint p)
    *    ----------------------------------------
    *    First, unpacks a single 32-bit unsigned integer p into four
    *    8-bit unsigned integers. Then, each component is converted to
    *    a normalized floating-point value to generate the returned
    *    four-component vector.
    *
    *    The conversion for unpacked fixed-point value f to floating point is
    *    done as follows:
    *
    *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
    *
    *    The first component of the returned vector will be extracted from the
    *    least significant bits of the input; the last component will be
    *    extracted from the most significant bits.
    *
    * The IR built below approximates this pseudo-GLSL:
    *
    *    return clamp(
    *       vec4(unpack_uint_to_ivec4(UINT_RVALUE)) / 127.0f,
    *       -1.0f, 1.0f);
    *
    * The 8-bit values have to be sign-extended to int32 before the
    * conversion to float, otherwise negative values unpack incorrectly;
    * unpack_uint_to_ivec4() takes care of that.
    *
    * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
    * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. Placed as-is into an
    * int32, that becomes the *positive* integer 0x000000ff: the int8's sign
    * bit ends up as the rather unimportant bit 8 and must be propagated into
    * bits 9-32.
    */
   assert(uint_rval->type == glsl_type::uint_type);

   ir_rvalue *const as_float = i2f(unpack_uint_to_ivec4(uint_rval));
   ir_rvalue *const scaled = div(as_float, constant(127.0f));
   ir_rvalue *const result =
      clamp(scaled, constant(-1.0f), constant(1.0f));

   assert(result->type == glsl_type::vec4_type);
   return result;
}
|
|
|
|
|
|
2012-11-19 15:15:32 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower a packUnorm2x16 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param vec2_rval is packUnorm2x16's input
|
|
|
|
|
* \return packUnorm2x16's output as a uint rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
{
   /* GLSL ES 3.00 spec, page 88 (94 of pdf):
    *
    *    highp uint packUnorm2x16 (vec2 v)
    *    ---------------------------------
    *    First, converts each component of the normalized floating-point value
    *    v into 16-bit integer values. Then, the results are packed into the
    *    returned 32-bit unsigned integer.
    *
    *    The conversion for component c of v to fixed point is done as
    *    follows:
    *
    *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
    *
    *    The first component of the vector will be written to the least
    *    significant bits of the output; the last component will be written to
    *    the most significant bits.
    *
    * The IR built below approximates this pseudo-GLSL:
    *
    *     return pack_uvec2_to_uint(uvec2(
    *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
    *
    * Converting the vec2 straight to uvec2 is safe in this case, since the
    * clamp has already restricted it to a non-negative range.
    */
   assert(vec2_rval->type == glsl_type::vec2_type);

   ir_rvalue *const scaled = mul(saturate(vec2_rval), constant(65535.0f));
   ir_rvalue *const as_unsigned = f2u(round_even(scaled));
   ir_rvalue *const result = pack_uvec2_to_uint(as_unsigned);

   assert(result->type == glsl_type::uint_type);
   return result;
}
|
|
|
|
|
|
2013-01-21 15:31:00 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower a packUnorm4x8 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param vec4_rval is packUnorm4x8's input
|
|
|
|
|
* \return packUnorm4x8's output as a uint rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
{
   /* GLSL 4.30 spec, page 137 (143 of pdf):
    *
    *    highp uint packUnorm4x8 (vec4 v)
    *    --------------------------------
    *    First, converts each component of the normalized floating-point value
    *    v into 8-bit integer values. Then, the results are packed into the
    *    returned 32-bit unsigned integer.
    *
    *    The conversion for component c of v to fixed point is done as
    *    follows:
    *
    *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
    *
    *    The first component of the vector will be written to the least
    *    significant bits of the output; the last component will be written to
    *    the most significant bits.
    *
    * The IR built below approximates this pseudo-GLSL:
    *
    *     return pack_uvec4_to_uint(uvec4(
    *                round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f)));
    *
    * Converting the vec4 straight to uvec4 is safe in this case, since the
    * clamp has already restricted it to a non-negative range.
    */
   assert(vec4_rval->type == glsl_type::vec4_type);

   ir_rvalue *const scaled = mul(saturate(vec4_rval), constant(255.0f));
   ir_rvalue *const as_unsigned = f2u(round_even(scaled));
   ir_rvalue *const result = pack_uvec4_to_uint(as_unsigned);

   assert(result->type == glsl_type::uint_type);
   return result;
}
|
|
|
|
|
|
2012-11-19 15:15:32 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower an unpackUnorm2x16 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param uint_rval is unpackUnorm2x16's input
|
|
|
|
|
* \return unpackUnorm2x16's output as a vec2 rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
{
   /* Quoting page 89 (95 of pdf) of the GLSL ES 3.00 spec:
    *
    *    highp vec2 unpackUnorm2x16 (highp uint p)
    *    -----------------------------------------
    *    First, unpacks a single 32-bit unsigned integer p into a pair of
    *    16-bit unsigned integers. Then, each component is converted to
    *    a normalized floating-point value to generate the returned
    *    two-component vector.
    *
    *    The conversion for unpacked fixed-point value f to floating point is
    *    done as follows:
    *
    *       unpackUnorm2x16: f / 65535.0
    *
    *    The first component of the returned vector will be extracted from the
    *    least significant bits of the input; the last component will be
    *    extracted from the most significant bits.
    *
    * The IR built below approximates this pseudo-GLSL:
    *
    *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
    */
   assert(uint_rval->type == glsl_type::uint_type);

   /* Split into two 16-bit fields, convert to float, then normalize. */
   ir_rvalue *as_float = u2f(unpack_uint_to_uvec2(uint_rval));
   ir_rvalue *result = div(as_float, constant(65535.0f));

   assert(result->type == glsl_type::vec2_type);
   return result;
}
|
|
|
|
|
|
2013-01-21 15:31:00 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower an unpackUnorm4x8 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param uint_rval is unpackUnorm4x8's input
|
|
|
|
|
* \return unpackUnorm4x8's output as a vec4 rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
{
   /* Paraphrasing page 137 (143 of pdf) of the GLSL 4.30 spec:
    *
    *    highp vec4 unpackUnorm4x8 (highp uint p)
    *    ----------------------------------------
    *    First, unpacks a single 32-bit unsigned integer p into four
    *    8-bit unsigned integers. Then, each component is converted to
    *    a normalized floating-point value to generate the returned
    *    four-component vector.
    *
    *    The conversion for unpacked fixed-point value f to floating point is
    *    done as follows:
    *
    *       unpackUnorm4x8: f / 255.0
    *
    *    The first component of the returned vector will be extracted from the
    *    least significant bits of the input; the last component will be
    *    extracted from the most significant bits.
    *
    * The IR built below approximates this pseudo-GLSL:
    *
    *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
    */
   assert(uint_rval->type == glsl_type::uint_type);

   /* Split into four 8-bit fields, convert to float, then normalize. */
   ir_rvalue *as_float = u2f(unpack_uint_to_uvec4(uint_rval));
   ir_rvalue *result = div(as_float, constant(255.0f));

   assert(result->type == glsl_type::vec4_type);
   return result;
}
|
|
|
|
|
|
2012-11-19 15:15:32 -08:00
|
|
|
/**
|
|
|
|
|
* \brief Lower the component-wise calculation of packHalf2x16.
|
|
|
|
|
*
|
|
|
|
|
* \param f_rval is one component of packHalf2x16's input
|
|
|
|
|
* \param e_rval is the unshifted exponent bits of f_rval
|
|
|
|
|
* \param m_rval is the unshifted mantissa bits of f_rval
|
|
|
|
|
*
|
|
|
|
|
* \return a uint rvalue that encodes a float16 in its lower 16 bits
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
pack_half_1x16_nosign(ir_rvalue *f_rval,
                      ir_rvalue *e_rval,
                      ir_rvalue *m_rval)
{
   assert(e_rval->type == glsl_type::uint_type);
   assert(m_rval->type == glsl_type::uint_type);

   /* uint u16;  -- receives the float16 bits (sign excluded). */
   ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
                                        "tmp_pack_half_1x16_u16");

   /* float f = FLOAT_RVAL; */
   ir_variable *f = factory.make_temp(glsl_type::float_type,
                                      "tmp_pack_half_1x16_f");
   factory.emit(assign(f, f_rval));

   /* uint e = E_RVAL; */
   ir_variable *e = factory.make_temp(glsl_type::uint_type,
                                      "tmp_pack_half_1x16_e");
   factory.emit(assign(e, e_rval));

   /* uint m = M_RVAL; */
   ir_variable *m = factory.make_temp(glsl_type::uint_type,
                                      "tmp_pack_half_1x16_m");
   factory.emit(assign(m, m_rval));

   /* Preliminaries
    * -------------
    *
    * float16 bit layout:   sign 15, exponent 10:14, mantissa 0:9
    * float32 bit layout:   sign 31, exponent 23:30, mantissa 0:22
    *
    * Value of a float16 (0 <= m16 < 2^10):
    *
    *   e16 = 0,  m16 = 0   zero:      (-1)^s16 * 0                              (1)
    *   e16 = 0,  m16 != 0  subnormal: (-1)^s16 * 2^(-14) * (m16 / 2^10)         (2)
    *   0 < e16 < 31        normal:    (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
    *   e16 = 31, m16 = 0   infinite:  (-1)^s16 * inf                            (4)
    *   e16 = 31, m16 != 0  NaN                                                  (5)
    *
    * Value of a float32 (0 <= m32 < 2^23):
    *
    *   e32 = 0,   m32 = 0   zero:      (-1)^s * 0                               (10)
    *   e32 = 0,   m32 != 0  subnormal: (-1)^s * 2^(-126) * (m32 / 2^23)         (11)
    *   0 < e32 < 255        normal:    (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
    *   e32 = 255, m32 = 0   infinite:  (-1)^s * inf                             (13)
    *   e32 = 255, m32 != 0  NaN                                                 (14)
    *
    * The smallest and largest normal float16 values are
    *
    *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)                       (20)
    *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)                             (21)
    *
    * and the step at max_norm16 is
    *
    *   max_step16 = 2^5                                                         (22)
    *
    * Both boundary values in (20)-(21) are normal float32 values.
    *
    * Rounding Behavior
    * -----------------
    * A float32 that has no exact float16 representation is rounded to the
    * nearest float16; a value exactly between two float16 values goes to
    * the one with an even mantissa. Benefits of this choice:
    *
    *   - It has no sign bias.
    *
    *   - It reproduces the behavior of real hardware: opcode F32TO16 in
    *     Intel's GPU ISA.
    *
    *   - Because it matches the GPU (at least on Intel hardware),
    *     compile-time evaluation of constant packHalf2x16 expressions gives
    *     the same value as executing the expression on the GPU.
    *
    * Calculation
    * -----------
    * The task is to compute e16 and m16 from f32. The sign bit is ignored
    * by this function, so assume s32 = s16 = 0. The four cases below are
    * built individually and then chained into one if/else-if ladder.
    */

   /* Case 1) f32 is NaN; the float16 result is NaN as well.
    *
    *    if (e32 == 255 && m32 != 0) {
    */
   ir_rvalue *is_nan = logic_and(equal(e, constant(0xffu << 23u)),
                                 logic_not(equal(m, constant(0u))));
   ir_instruction *store_nan = assign(u16, constant(0x7fffu));

   /* Case 2) f32 lies in [0, min_norm16).
    *
    * The float16 result is zero, subnormal, or (after rounding up) the
    * smallest normal. Solving f32 = min_norm16 gives e32 = 113, m32 = 0,
    * so this case occurs if and only if e32 < 113.
    *
    * By equation (2) a subnormal float16 equals m16 * 2^(-24), hence
    *
    *    } else if (e32 < 113) {
    *       u16 = uint(round_to_even(abs(f32) * float(1u << 24u)));
    */
   ir_rvalue *below_min_norm = less(e, constant(113u << 23u));
   ir_instruction *store_small =
      assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
                                     constant(static_cast<float>(1 << 24))))));

   /* Case 3) f32 lies in [min_norm16, max_norm16 + max_step16).
    *
    * The float16 result is normal or infinite. Solving
    *
    *    f32 = max_norm16 + max_step16
    *        = 2^15 * (1 + 1023 / 2^10) + 2^5
    *        = 2^16
    *
    * gives e32 = 143, m32 = 0, so together with case 2 this case occurs if
    * and only if 113 <= e32 < 143.
    *
    *    } else if (e32 < 143) {
    *       u16 = ((e - (112u << 23u)) >> 13u)
    *           + round_to_even(float(m) / (1u << 13u));
    *
    * The exponent and mantissa are combined with an addition so that a
    * mantissa which rounds up to 1024 carries into the exponent.
    */
   ir_rvalue *below_overflow = less(e, constant(143u << 23u));
   ir_rvalue *rebiased_exp = rshift(sub(e, constant(112u << 23u)),
                                    constant(13u));
   ir_rvalue *rounded_mant =
      f2u(round_even(div(u2f(m), constant(static_cast<float>(1 << 13)))));
   ir_instruction *store_normal = assign(u16, add(rebiased_exp, rounded_mant));

   /* Case 4) f32 lies in [max_norm16 + max_step16, inf].
    *
    * Everything in [0, max_norm16 + max_step16) was handled above, so this
    * is the fall-through case and the float16 result is infinite.
    *
    *    } else {
    *       u16 = 31u << 10u;
    *    }
    */
   ir_instruction *store_inf = assign(u16, constant(31u << 10u));

   factory.emit(
      if_tree(is_nan, store_nan,
      if_tree(below_min_norm, store_small,
      if_tree(below_overflow, store_normal,
              store_inf))));

   return deref(u16).val;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \brief Lower a packHalf2x16 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param vec2_rval is packHalf2x16's input
|
|
|
|
|
* \return packHalf2x16's output as a uint rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_pack_half_2x16(ir_rvalue *vec2_rval)
{
   /* Quoting page 89 (95 of pdf) of the GLSL ES 3.00 spec:
    *
    *    highp uint packHalf2x16 (mediump vec2 v)
    *    ----------------------------------------
    *    Returns an unsigned integer obtained by converting the components of
    *    a two-component floating-point vector to the 16-bit floating-point
    *    representation found in the OpenGL ES Specification, and then packing
    *    these two 16-bit integers into a 32-bit unsigned integer.
    *
    *    The first vector component specifies the 16 least-significant bits
    *    of the result; the second component specifies the 16 most-significant
    *    bits.
    */
   assert(vec2_rval->type == glsl_type::vec2_type);

   /* vec2 f = VEC2_RVAL; */
   ir_variable *f = factory.make_temp(glsl_type::vec2_type,
                                      "tmp_pack_half_2x16_f");
   factory.emit(assign(f, vec2_rval));

   /* uvec2 f32 = bitcast_f2u(f); */
   ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
                                        "tmp_pack_half_2x16_f32");
   factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));

   /* uvec2 f16; */
   ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
                                        "tmp_pack_half_2x16_f16");

   /* Isolate f32's exponent bits, left in place (unshifted):
    *
    *    uvec2 e = f32 & 0x7f800000u;
    */
   ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
                                      "tmp_pack_half_2x16_e");
   factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));

   /* Isolate f32's mantissa bits, left in place (unshifted):
    *
    *    uvec2 m = f32 & 0x007fffffu;
    */
   ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
                                      "tmp_pack_half_2x16_m");
   factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));

   /* Compute the exponent and mantissa bits of each half separately:
    *
    *    f16.x = pack_half_1x16_nosign(f.x, e.x, m.x);
    *    f16.y = pack_half_1x16_nosign(f.y, e.y, m.y);
    */
   ir_rvalue *half_x = pack_half_1x16_nosign(swizzle_x(f),
                                             swizzle_x(e),
                                             swizzle_x(m));
   factory.emit(assign(f16, half_x, WRITEMASK_X));

   ir_rvalue *half_y = pack_half_1x16_nosign(swizzle_y(f),
                                             swizzle_y(e),
                                             swizzle_y(m));
   factory.emit(assign(f16, half_y, WRITEMASK_Y));

   /* Move each float32 sign bit (bit 31) down to the float16 sign
    * position (bit 15):
    *
    *    f16 |= (f32 & (1u << 31u)) >> 16u;
    */
   ir_rvalue *sign_bits = rshift(bit_and(f32, constant(1u << 31u)),
                                 constant(16u));
   factory.emit(assign(f16, bit_or(f16, sign_bits)));

   /* return (f16.y << 16u) | f16.x; */
   ir_rvalue *high_half = lshift(swizzle_y(f16), constant(16u));
   ir_rvalue *result = bit_or(high_half, swizzle_x(f16));

   assert(result->type == glsl_type::uint_type);
   return result;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \brief Split packHalf2x16's vec2 operand into two floats.
|
|
|
|
|
*
|
|
|
|
|
* \param vec2_rval is packHalf2x16's input
|
|
|
|
|
* \return a uint rvalue
|
|
|
|
|
*
|
|
|
|
|
* Some code generators, such as the i965 fragment shader, require that all
|
|
|
|
|
* vector expressions be lowered to a sequence of scalar expressions.
|
|
|
|
|
* However, packHalf2x16 cannot be scalarized by the same mechanism as
|
|
|
|
|
* a true vector operation because its input and output have a differing
|
|
|
|
|
* number of vector components.
|
|
|
|
|
*
|
|
|
|
|
* This method scalarizes packHalf2x16 by transforming it from an unary
|
|
|
|
|
* operation having vector input to a binary operation having scalar input.
|
|
|
|
|
* That is, it transforms
|
|
|
|
|
*
|
|
|
|
|
* packHalf2x16(VEC2_RVAL);
|
|
|
|
|
*
|
|
|
|
|
* into
|
|
|
|
|
*
|
|
|
|
|
* vec2 v = VEC2_RVAL;
|
|
|
|
|
* return packHalf2x16_split(v.x, v.y);
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
split_pack_half_2x16(ir_rvalue *vec2_rval)
{
   assert(vec2_rval->type == glsl_type::vec2_type);

   /* vec2 v = VEC2_RVAL;
    *
    * Copy the operand into a temporary so each component can be read
    * through its own swizzle.
    */
   ir_variable *operand_tmp = factory.make_temp(glsl_type::vec2_type,
                                                "tmp_split_pack_half_2x16_v");
   factory.emit(assign(operand_tmp, vec2_rval));

   /* return packHalf2x16_split(v.x, v.y); */
   ir_rvalue *x = swizzle_x(operand_tmp);
   ir_rvalue *y = swizzle_y(operand_tmp);
   return expr(ir_binop_pack_half_2x16_split, x, y);
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \brief Lower the component-wise calculation of unpackHalf2x16.
|
|
|
|
|
*
|
|
|
|
|
* Given a uint that encodes a float16 in its lower 16 bits, this function
|
|
|
|
|
* returns a uint that encodes a float32 with the same value. The sign bit
|
|
|
|
|
* of the float16 is ignored.
|
|
|
|
|
*
|
|
|
|
|
* \param e_rval is the unshifted exponent bits of a float16
|
|
|
|
|
* \param m_rval is the unshifted mantissa bits of a float16
|
|
|
|
|
* \return a uint rvalue that encodes a float32
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
{
   assert(e_rval->type == glsl_type::uint_type);
   assert(m_rval->type == glsl_type::uint_type);

   /* uint u32;  -- receives the float32 bits (sign excluded). */
   ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
                                        "tmp_unpack_half_1x16_u32");

   /* uint e = E_RVAL; */
   ir_variable *e = factory.make_temp(glsl_type::uint_type,
                                      "tmp_unpack_half_1x16_e");
   factory.emit(assign(e, e_rval));

   /* uint m = M_RVAL; */
   ir_variable *m = factory.make_temp(glsl_type::uint_type,
                                      "tmp_unpack_half_1x16_m");
   factory.emit(assign(m, m_rval));

   /* Preliminaries
    * -------------
    *
    * float16 bit layout:   sign 15, exponent 10:14, mantissa 0:9
    * float32 bit layout:   sign 31, exponent 23:30, mantissa 0:22
    *
    * Value of a float16 (0 <= m16 < 2^10):
    *
    *   e16 = 0,  m16 = 0   zero:      (-1)^s16 * 0                              (1)
    *   e16 = 0,  m16 != 0  subnormal: (-1)^s16 * 2^(-14) * (m16 / 2^10)         (2)
    *   0 < e16 < 31        normal:    (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
    *   e16 = 31, m16 = 0   infinite:  (-1)^s16 * inf                            (4)
    *   e16 = 31, m16 != 0  NaN                                                  (5)
    *
    * Value of a float32 (0 <= m32 < 2^23):
    *
    *   e32 = 0,   m32 = 0   zero:      (-1)^s * 0                               (10)
    *   e32 = 0,   m32 != 0  subnormal: (-1)^s * 2^(-126) * (m32 / 2^23)         (11)
    *   0 < e32 < 255        normal:    (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
    *   e32 = 255, m32 = 0   infinite:  (-1)^s * inf                             (13)
    *   e32 = 255, m32 != 0  NaN                                                 (14)
    *
    * Calculation
    * -----------
    * The task is to compute e32 and m32 from f16. The sign bit is ignored
    * by this function, so assume s32 = s16 = 0. The four cases below are
    * built individually and then chained into one if/else-if ladder.
    */

   /* Case 1) f16 is zero or subnormal.
    *
    * The simplest way to obtain f32 is to evaluate equation (2) in
    * floating point:
    *
    *    f32 = f16                                 (20)
    *        = 2^(-14) * (m16 / 2^10)              (21)
    *        = m16 / 2^24                          (22)
    *
    *    if (e16 == 0) {
    *       u32 = bitcast_f2u(float(m) / float(1 << 24));
    */
   ir_rvalue *is_zero_or_subnormal = equal(e, constant(0u));
   ir_instruction *store_small =
      assign(u32, expr(ir_unop_bitcast_f2u,
                       div(u2f(m), constant(static_cast<float>(1 << 24)))));

   /* Case 2) f16 is normal.
    *
    * The equation
    *
    *    f32 = f16                                 (30)
    *    2^(e32 - 127) * (1 + m32 / 2^23) =        (31)
    *       2^(e16 - 15) * (1 + m16 / 2^10)
    *
    * decomposes into
    *
    *    2^(e32 - 127) = 2^(e16 - 15)              (32)
    *    1 + m32 / 2^23 = 1 + m16 / 2^10           (33)
    *
    * which solve to
    *
    *    e32 = e16 + 112                           (34)
    *    m32 = m16 * 2^13                          (35)
    *
    *    } else if (e16 < 31) {
    *       u32 = ((e + (112 << 10)) | m) << 13;
    */
   ir_rvalue *is_normal = less(e, constant(31u << 10u));
   ir_rvalue *exp_and_mant = bit_or(add(e, constant(112u << 10u)), m);
   ir_instruction *store_normal =
      assign(u32, lshift(exp_and_mant, constant(13u)));

   /* Case 3) f16 is infinite.
    *
    *    } else if (m16 == 0) {
    *       u32 = 255 << 23;
    */
   ir_rvalue *is_infinite = equal(m, constant(0u));
   ir_instruction *store_inf = assign(u32, constant(255u << 23u));

   /* Case 4) f16 is NaN.
    *
    *    } else {
    *       u32 = 0x7fffffff;
    *    }
    */
   ir_instruction *store_nan = assign(u32, constant(0x7fffffffu));

   factory.emit(
      if_tree(is_zero_or_subnormal, store_small,
      if_tree(is_normal, store_normal,
      if_tree(is_infinite, store_inf,
              store_nan))));

   return deref(u32).val;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \brief Lower an unpackHalf2x16 expression.
|
|
|
|
|
*
|
|
|
|
|
* \param uint_rval is unpackHalf2x16's input
|
|
|
|
|
* \return unpackHalf2x16's output as a vec2 rvalue
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
lower_unpack_half_2x16(ir_rvalue *uint_rval)
{
   /* Quoting page 89 (95 of pdf) of the GLSL ES 3.00 spec:
    *
    *    mediump vec2 unpackHalf2x16 (highp uint v)
    *    ------------------------------------------
    *    Returns a two-component floating-point vector with components
    *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
    *    values, interpreting those values as 16-bit floating-point numbers
    *    according to the OpenGL ES Specification, and converting them to
    *    32-bit floating-point values.
    *
    *    The first component of the vector is obtained from the
    *    16 least-significant bits of v; the second component is obtained
    *    from the 16 most-significant bits of v.
    */
   assert(uint_rval->type == glsl_type::uint_type);

   /* uvec2 f16 = unpack_uint_to_uvec2(UINT_RVAL);
    *
    * One float16 bit pattern per component.
    */
   ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
                                        "tmp_unpack_half_2x16_f16");
   factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));

   /* uvec2 f32; */
   ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
                                        "tmp_unpack_half_2x16_f32");

   /* Isolate f16's exponent bits, left in place (unshifted):
    *
    *    uvec2 e = f16 & 0x7c00u;
    */
   ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
                                      "tmp_unpack_half_2x16_e");
   factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));

   /* Isolate f16's mantissa bits, left in place (unshifted):
    *
    *    uvec2 m = f16 & 0x03ffu;
    */
   ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
                                      "tmp_unpack_half_2x16_m");
   factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));

   /* Compute the exponent and mantissa bits of each float32 separately:
    *
    *    f32.x = unpack_half_1x16_nosign(e.x, m.x);
    *    f32.y = unpack_half_1x16_nosign(e.y, m.y);
    */
   ir_rvalue *single_x = unpack_half_1x16_nosign(swizzle_x(e), swizzle_x(m));
   factory.emit(assign(f32, single_x, WRITEMASK_X));

   ir_rvalue *single_y = unpack_half_1x16_nosign(swizzle_y(e), swizzle_y(m));
   factory.emit(assign(f32, single_y, WRITEMASK_Y));

   /* Move each float16 sign bit (bit 15) up to the float32 sign
    * position (bit 31):
    *
    *    f32 |= (f16 & 0x8000u) << 16u;
    */
   ir_rvalue *sign_bits = lshift(bit_and(f16, constant(0x8000u)),
                                 constant(16u));
   factory.emit(assign(f32, bit_or(f32, sign_bits)));

   /* return bitcast_u2f(f32); */
   ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
   assert(result->type == glsl_type::vec2_type);
   return result;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \brief Split unpackHalf2x16 into two operations.
|
|
|
|
|
*
|
|
|
|
|
* \param uint_rval is unpackHalf2x16's input
|
|
|
|
|
* \return a vec2 rvalue
|
|
|
|
|
*
|
|
|
|
|
* Some code generators, such as the i965 fragment shader, require that all
|
|
|
|
|
* vector expressions be lowered to a sequence of scalar expressions.
|
|
|
|
|
* However, unpackHalf2x16 cannot be scalarized by the same method as
|
|
|
|
|
* a true vector operation because the number of components of its input
|
|
|
|
|
* and output differ.
|
|
|
|
|
*
|
|
|
|
|
* This method scalarizes unpackHalf2x16 by transforming it from a single
|
|
|
|
|
* operation having vec2 output to a pair of operations each having float
|
|
|
|
|
* output. That is, it transforms
|
|
|
|
|
*
|
|
|
|
|
* unpackHalf2x16(UINT_RVAL)
|
|
|
|
|
*
|
|
|
|
|
* into
|
|
|
|
|
*
|
|
|
|
|
* uint u = UINT_RVAL;
|
|
|
|
|
* vec2 v;
|
|
|
|
|
*
|
|
|
|
|
* v.x = unpackHalf2x16_split_x(u);
|
|
|
|
|
* v.y = unpackHalf2x16_split_y(u);
|
|
|
|
|
*
|
|
|
|
|
* return v;
|
|
|
|
|
*/
|
|
|
|
|
ir_rvalue*
split_unpack_half_2x16(ir_rvalue *uint_rval)
{
   assert(uint_rval->type == glsl_type::uint_type);

   /* uint u = UINT_RVAL;
    *
    * The operand is copied into a temporary because it is read by both
    * split operations below.
    */
   ir_variable *packed = factory.make_temp(glsl_type::uint_type,
                                           "tmp_split_unpack_half_2x16_u");
   factory.emit(assign(packed, uint_rval));

   /* vec2 v; */
   ir_variable *unpacked = factory.make_temp(glsl_type::vec2_type,
                                             "tmp_split_unpack_half_2x16_v");

   /* v.x = unpack_half_2x16_split_x(u); */
   ir_rvalue *x_half = expr(ir_unop_unpack_half_2x16_split_x, packed);
   factory.emit(assign(unpacked, x_half, WRITEMASK_X));

   /* v.y = unpack_half_2x16_split_y(u); */
   ir_rvalue *y_half = expr(ir_unop_unpack_half_2x16_split_y, packed);
   factory.emit(assign(unpacked, y_half, WRITEMASK_Y));

   /* return v; */
   return deref(unpacked).val;
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
} // namespace anonymous
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \brief Lower the builtin packing functions.
|
|
|
|
|
*
|
|
|
|
|
* \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
lower_packing_builtins(exec_list *instructions, int op_mask)
|
|
|
|
|
{
|
|
|
|
|
lower_packing_builtins_visitor v(op_mask);
|
|
|
|
|
visit_list_elements(&v, instructions, true);
|
|
|
|
|
return v.get_progress();
|
|
|
|
|
}
|