2016-05-07 13:01:24 -04:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2016 Red Hat
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
* Authors:
|
|
|
|
|
* Rob Clark <robclark@freedesktop.org>
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#ifndef _NIR_SEARCH_HELPERS_
|
|
|
|
|
#define _NIR_SEARCH_HELPERS_
|
|
|
|
|
|
2023-08-08 12:00:35 -05:00
|
|
|
#include <math.h>
|
2017-11-13 13:00:53 -08:00
|
|
|
#include "util/bitscan.h"
|
2023-08-08 12:00:35 -05:00
|
|
|
#include "nir.h"
|
2018-01-23 09:48:43 +08:00
|
|
|
#include "nir_range_analysis.h"
|
2016-05-07 13:01:24 -04:00
|
|
|
|
|
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_pos_power_of_two(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
2019-09-23 15:40:46 -07:00
|
|
|
unsigned src, unsigned num_components,
|
2016-05-07 13:01:24 -04:00
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
2017-02-27 17:21:42 -08:00
|
|
|
/* only constant srcs: */
|
2018-10-20 12:07:41 -05:00
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
2016-05-07 13:01:24 -04:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
2019-09-25 11:59:49 -07:00
|
|
|
nir_alu_type type = nir_op_infos[instr->op].input_types[src];
|
|
|
|
|
switch (nir_alu_type_get_base_type(type)) {
|
2018-10-20 12:07:41 -05:00
|
|
|
case nir_type_int: {
|
|
|
|
|
int64_t val = nir_src_comp_as_int(instr->src[src].src, swizzle[i]);
|
|
|
|
|
if (val <= 0 || !util_is_power_of_two_or_zero64(val))
|
2016-05-07 13:01:24 -04:00
|
|
|
return false;
|
|
|
|
|
break;
|
2018-10-20 12:07:41 -05:00
|
|
|
}
|
|
|
|
|
case nir_type_uint: {
|
|
|
|
|
uint64_t val = nir_src_comp_as_uint(instr->src[src].src, swizzle[i]);
|
|
|
|
|
if (val == 0 || !util_is_power_of_two_or_zero64(val))
|
2016-05-07 13:01:24 -04:00
|
|
|
return false;
|
|
|
|
|
break;
|
2018-10-20 12:07:41 -05:00
|
|
|
}
|
2016-05-07 13:01:24 -04:00
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_neg_power_of_two(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
2019-09-23 15:40:46 -07:00
|
|
|
unsigned src, unsigned num_components,
|
2016-05-07 13:01:24 -04:00
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
2017-02-27 17:21:42 -08:00
|
|
|
/* only constant srcs: */
|
2018-10-20 12:07:41 -05:00
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
2016-05-07 13:01:24 -04:00
|
|
|
return false;
|
|
|
|
|
|
2021-07-21 17:13:40 +01:00
|
|
|
int64_t int_min = u_intN_min(instr->src[src].src.ssa->bit_size);
|
|
|
|
|
|
2016-05-07 13:01:24 -04:00
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
2019-09-25 11:59:49 -07:00
|
|
|
nir_alu_type type = nir_op_infos[instr->op].input_types[src];
|
|
|
|
|
switch (nir_alu_type_get_base_type(type)) {
|
2018-10-20 12:07:41 -05:00
|
|
|
case nir_type_int: {
|
|
|
|
|
int64_t val = nir_src_comp_as_int(instr->src[src].src, swizzle[i]);
|
2021-07-21 17:13:40 +01:00
|
|
|
/* "int_min" is a power-of-two, but negation can cause overflow. */
|
|
|
|
|
if (val == int_min || val >= 0 || !util_is_power_of_two_or_zero64(-val))
|
2016-05-07 13:01:24 -04:00
|
|
|
return false;
|
|
|
|
|
break;
|
2018-10-20 12:07:41 -05:00
|
|
|
}
|
2016-05-07 13:01:24 -04:00
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-09 16:51:25 +00:00
|
|
|
static inline bool
|
|
|
|
|
is_bitcount2(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
|
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
/* only constant srcs: */
|
|
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
|
|
|
|
uint64_t val = nir_src_comp_as_uint(instr->src[src].src, swizzle[i]);
|
|
|
|
|
if (util_bitcount64(val) != 2)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-08 12:00:35 -05:00
|
|
|
#define MULTIPLE(test) \
|
|
|
|
|
static inline bool \
|
|
|
|
|
is_unsigned_multiple_of_##test(UNUSED struct hash_table *ht, \
|
|
|
|
|
const nir_alu_instr *instr, \
|
|
|
|
|
unsigned src, unsigned num_components, \
|
|
|
|
|
const uint8_t *swizzle) \
|
|
|
|
|
{ \
|
|
|
|
|
/* only constant srcs: */ \
|
|
|
|
|
if (!nir_src_is_const(instr->src[src].src)) \
|
|
|
|
|
return false; \
|
|
|
|
|
\
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) { \
|
|
|
|
|
uint64_t val = nir_src_comp_as_uint(instr->src[src].src, swizzle[i]); \
|
|
|
|
|
if (val % test != 0) \
|
|
|
|
|
return false; \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
return true; \
|
|
|
|
|
}
|
2019-10-09 15:03:45 +01:00
|
|
|
|
|
|
|
|
/* Instantiate the power-of-two-multiple matchers used by algebraic rules:
 * is_unsigned_multiple_of_2 ... is_unsigned_multiple_of_64.
 */
MULTIPLE(2)
MULTIPLE(4)
MULTIPLE(8)
MULTIPLE(16)
MULTIPLE(32)
MULTIPLE(64)
|
|
|
|
|
|
2016-11-29 17:33:30 -08:00
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_zero_to_one(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
2016-11-29 17:33:30 -08:00
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
2018-10-20 12:07:41 -05:00
|
|
|
/* only constant srcs: */
|
|
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
2016-11-29 17:33:30 -08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
2021-10-15 17:23:51 +01:00
|
|
|
nir_alu_type type = nir_op_infos[instr->op].input_types[src];
|
|
|
|
|
switch (nir_alu_type_get_base_type(type)) {
|
2018-10-20 12:07:41 -05:00
|
|
|
case nir_type_float: {
|
|
|
|
|
double val = nir_src_comp_as_float(instr->src[src].src, swizzle[i]);
|
|
|
|
|
if (isnan(val) || val < 0.0f || val > 1.0f)
|
2016-11-29 17:33:30 -08:00
|
|
|
return false;
|
|
|
|
|
break;
|
2018-10-20 12:07:41 -05:00
|
|
|
}
|
2016-11-29 17:33:30 -08:00
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-03-05 14:54:35 -08:00
|
|
|
/**
|
|
|
|
|
* Exclusive compare with (0, 1).
|
|
|
|
|
*
|
|
|
|
|
* This differs from \c is_zero_to_one because that function tests 0 <= src <=
|
|
|
|
|
* 1 while this function tests 0 < src < 1.
|
|
|
|
|
*/
|
|
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_gt_0_and_lt_1(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
2019-09-23 15:40:46 -07:00
|
|
|
unsigned src, unsigned num_components,
|
2019-03-05 14:54:35 -08:00
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
/* only constant srcs: */
|
|
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
2021-10-15 17:23:51 +01:00
|
|
|
nir_alu_type type = nir_op_infos[instr->op].input_types[src];
|
|
|
|
|
switch (nir_alu_type_get_base_type(type)) {
|
2019-03-05 14:54:35 -08:00
|
|
|
case nir_type_float: {
|
|
|
|
|
double val = nir_src_comp_as_float(instr->src[src].src, swizzle[i]);
|
|
|
|
|
if (isnan(val) || val <= 0.0f || val >= 1.0f)
|
|
|
|
|
return false;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2020-03-31 15:37:00 -07:00
|
|
|
/**
|
|
|
|
|
* x & 1 != 0
|
|
|
|
|
*/
|
|
|
|
|
static inline bool
|
|
|
|
|
is_odd(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
|
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
/* only constant srcs: */
|
|
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
|
|
|
|
nir_alu_type type = nir_op_infos[instr->op].input_types[src];
|
|
|
|
|
switch (nir_alu_type_get_base_type(type)) {
|
|
|
|
|
case nir_type_int:
|
|
|
|
|
case nir_type_uint: {
|
|
|
|
|
if ((nir_src_comp_as_uint(instr->src[src].src, swizzle[i]) & 1) == 0)
|
|
|
|
|
return false;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-22 18:19:16 -07:00
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_not_const_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
2019-09-23 15:40:46 -07:00
|
|
|
unsigned src, unsigned num_components,
|
2018-05-22 18:19:16 -07:00
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
if (nir_src_as_const_value(instr->src[src].src) == NULL)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
2019-09-25 11:59:49 -07:00
|
|
|
nir_alu_type type = nir_op_infos[instr->op].input_types[src];
|
|
|
|
|
switch (nir_alu_type_get_base_type(type)) {
|
2018-05-22 18:19:16 -07:00
|
|
|
case nir_type_float:
|
|
|
|
|
if (nir_src_comp_as_float(instr->src[src].src, swizzle[i]) == 0.0)
|
|
|
|
|
return false;
|
|
|
|
|
break;
|
|
|
|
|
case nir_type_bool:
|
|
|
|
|
case nir_type_int:
|
|
|
|
|
case nir_type_uint:
|
|
|
|
|
if (nir_src_comp_as_uint(instr->src[src].src, swizzle[i]) == 0)
|
|
|
|
|
return false;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
nir/algebraic: optimize bits=umin(bits, 32-(offset&0x1f))
Optimizes patterns which are created by recent versions of vkd3d-proton,
when constant folding doesn't eliminate it entirely:
- ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- ibitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- bitfield_insert(base, insert, offset, umin(bits, 32-(offset&0x1f)))
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13225>
2021-10-06 15:06:51 +01:00
|
|
|
/** Is value unsigned less than the limit? */
|
2021-06-09 14:53:49 -07:00
|
|
|
static inline bool
|
nir/algebraic: optimize bits=umin(bits, 32-(offset&0x1f))
Optimizes patterns which are created by recent versions of vkd3d-proton,
when constant folding doesn't eliminate it entirely:
- ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- ibitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- bitfield_insert(base, insert, offset, umin(bits, 32-(offset&0x1f)))
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13225>
2021-10-06 15:06:51 +01:00
|
|
|
is_ult(const nir_alu_instr *instr, unsigned src, unsigned num_components, const uint8_t *swizzle,
|
|
|
|
|
uint64_t limit)
|
2021-06-09 14:53:49 -07:00
|
|
|
{
|
|
|
|
|
/* only constant srcs: */
|
|
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
nir/algebraic: optimize bits=umin(bits, 32-(offset&0x1f))
Optimizes patterns which are created by recent versions of vkd3d-proton,
when constant folding doesn't eliminate it entirely:
- ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- ibitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- bitfield_insert(base, insert, offset, umin(bits, 32-(offset&0x1f)))
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13225>
2021-10-06 15:06:51 +01:00
|
|
|
const uint64_t val =
|
2021-06-09 14:53:49 -07:00
|
|
|
nir_src_comp_as_uint(instr->src[src].src, swizzle[i]);
|
|
|
|
|
|
nir/algebraic: optimize bits=umin(bits, 32-(offset&0x1f))
Optimizes patterns which are created by recent versions of vkd3d-proton,
when constant folding doesn't eliminate it entirely:
- ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- ibitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- bitfield_insert(base, insert, offset, umin(bits, 32-(offset&0x1f)))
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13225>
2021-10-06 15:06:51 +01:00
|
|
|
if (val >= limit)
|
2021-06-09 14:53:49 -07:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
nir/algebraic: optimize bits=umin(bits, 32-(offset&0x1f))
Optimizes patterns which are created by recent versions of vkd3d-proton,
when constant folding doesn't eliminate it entirely:
- ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- ibitfield_extract(value, offset, umin(bits, 32-(offset&0x1f)))
- bitfield_insert(base, insert, offset, umin(bits, 32-(offset&0x1f)))
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13225>
2021-10-06 15:06:51 +01:00
|
|
|
/** Is value unsigned less than 32? */
static inline bool
is_ult_32(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
          unsigned src, unsigned num_components,
          const uint8_t *swizzle)
{
   /* Thin wrapper around is_ult() with a fixed limit of 32. */
   return is_ult(instr, src, num_components, swizzle, 32);
}
|
|
|
|
|
|
|
|
|
|
/** Is value unsigned less than 0xfffc07fc? */
static inline bool
is_ult_0xfffc07fc(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
                  unsigned src, unsigned num_components,
                  const uint8_t *swizzle)
{
   /* Thin wrapper around is_ult() with a fixed limit of 0xfffc07fc.
    * NOTE(review): the significance of this particular constant is not
    * visible here — presumably backend-specific; see callers.
    */
   return is_ult(instr, src, num_components, swizzle, 0xfffc07fcU);
}
|
|
|
|
|
|
2021-11-11 11:16:06 +00:00
|
|
|
/** Is the first 5 bits of value unsigned greater than or equal 2? */
|
|
|
|
|
static inline bool
|
|
|
|
|
is_first_5_bits_uge_2(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
|
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
/* only constant srcs: */
|
|
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
|
|
|
|
const unsigned val =
|
|
|
|
|
nir_src_comp_as_uint(instr->src[src].src, swizzle[i]);
|
|
|
|
|
|
|
|
|
|
if ((val & 0x1f) < 2)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2023-05-19 09:56:42 -07:00
|
|
|
/** Is this a constant that could be either int16_t or uint16_t? */
|
|
|
|
|
static inline bool
|
|
|
|
|
is_16_bits(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
|
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
/* only constant srcs: */
|
|
|
|
|
if (!nir_src_is_const(instr->src[src].src))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
|
|
|
|
const int64_t val =
|
|
|
|
|
nir_src_comp_as_int(instr->src[src].src, swizzle[i]);
|
|
|
|
|
|
|
|
|
|
if (val > 0xffff || val < -0x8000)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2017-01-12 13:10:55 +11:00
|
|
|
/** Is the source anything other than a constant? */
static inline bool
is_not_const(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
             unsigned src, UNUSED unsigned num_components,
             UNUSED const uint8_t *swizzle)
{
   /* Whole-source check: individual components are not inspected, so
    * num_components and swizzle are unused.
    */
   return !nir_src_is_const(instr->src[src].src);
}
|
|
|
|
|
|
2018-03-27 22:57:07 -07:00
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_not_fmul(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
|
2018-03-27 22:57:07 -07:00
|
|
|
UNUSED unsigned num_components, UNUSED const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
nir_alu_instr *src_alu =
|
|
|
|
|
nir_src_as_alu_instr(instr->src[src].src);
|
|
|
|
|
|
|
|
|
|
if (src_alu == NULL)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (src_alu->op == nir_op_fneg)
|
2019-09-23 15:40:46 -07:00
|
|
|
return is_not_fmul(ht, src_alu, 0, 0, NULL);
|
2018-03-27 22:57:07 -07:00
|
|
|
|
2021-09-14 18:02:01 +01:00
|
|
|
return src_alu->op != nir_op_fmul && src_alu->op != nir_op_fmulz;
|
2018-03-27 22:57:07 -07:00
|
|
|
}
|
|
|
|
|
|
2020-06-19 11:30:27 +01:00
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_fmul(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
|
2020-06-19 11:30:27 +01:00
|
|
|
UNUSED unsigned num_components, UNUSED const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
nir_alu_instr *src_alu =
|
|
|
|
|
nir_src_as_alu_instr(instr->src[src].src);
|
|
|
|
|
|
|
|
|
|
if (src_alu == NULL)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (src_alu->op == nir_op_fneg)
|
|
|
|
|
return is_fmul(ht, src_alu, 0, 0, NULL);
|
|
|
|
|
|
2021-09-14 18:02:01 +01:00
|
|
|
return src_alu->op == nir_op_fmul || src_alu->op == nir_op_fmulz;
|
2020-06-19 11:30:27 +01:00
|
|
|
}
|
|
|
|
|
|
2019-06-05 17:23:11 -07:00
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_fsign(const nir_alu_instr *instr, unsigned src,
|
2019-06-05 17:23:11 -07:00
|
|
|
UNUSED unsigned num_components, UNUSED const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
nir_alu_instr *src_alu =
|
|
|
|
|
nir_src_as_alu_instr(instr->src[src].src);
|
|
|
|
|
|
|
|
|
|
if (src_alu == NULL)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (src_alu->op == nir_op_fneg)
|
|
|
|
|
src_alu = nir_src_as_alu_instr(src_alu->src[0].src);
|
|
|
|
|
|
2019-07-15 15:18:47 -07:00
|
|
|
return src_alu != NULL && src_alu->op == nir_op_fsign;
|
2019-06-05 17:23:11 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Combined matcher: true when the source is neither a constant nor a
 * (possibly negated) fsign result.
 */
static inline bool
is_not_const_and_not_fsign(struct hash_table *ht, const nir_alu_instr *instr,
                           unsigned src, unsigned num_components,
                           const uint8_t *swizzle)
{
   return is_not_const(ht, instr, src, num_components, swizzle) &&
          !is_fsign(instr, src, num_components, swizzle);
}
|
|
|
|
|
|
2017-01-12 21:51:56 +11:00
|
|
|
/** Does the instruction's def have exactly one use? */
static inline bool
is_used_once(const nir_alu_instr *instr)
{
   /* A singular use list contains exactly one entry. */
   return list_is_singular(&instr->def.uses);
}
|
|
|
|
|
|
2018-12-03 16:30:44 -08:00
|
|
|
/** Is the instruction's def used as an if-condition (per nir_def_used_by_if)? */
static inline bool
is_used_by_if(const nir_alu_instr *instr)
{
   return nir_def_used_by_if(&instr->def);
}
|
|
|
|
|
|
2017-01-08 23:52:59 +11:00
|
|
|
/** Logical complement of is_used_by_if(). */
static inline bool
is_not_used_by_if(const nir_alu_instr *instr)
{
   return !is_used_by_if(instr);
}
|
|
|
|
|
|
2018-10-23 14:30:41 -07:00
|
|
|
static inline bool
|
2021-11-08 12:40:16 -08:00
|
|
|
is_used_by_non_fsat(const nir_alu_instr *instr)
|
2018-10-23 14:30:41 -07:00
|
|
|
{
|
2023-08-14 11:43:35 -05:00
|
|
|
nir_foreach_use(src, &instr->def) {
|
2018-10-23 14:30:41 -07:00
|
|
|
const nir_instr *const user_instr = src->parent_instr;
|
|
|
|
|
|
|
|
|
|
if (user_instr->type != nir_instr_type_alu)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
const nir_alu_instr *const user_alu = nir_instr_as_alu(user_instr);
|
|
|
|
|
|
|
|
|
|
assert(instr != user_alu);
|
|
|
|
|
if (user_alu->op != nir_op_fsat)
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-17 14:46:55 +01:00
|
|
|
static inline bool
|
2021-11-08 12:40:16 -08:00
|
|
|
is_only_used_as_float(const nir_alu_instr *instr)
|
2020-06-17 14:46:55 +01:00
|
|
|
{
|
2023-08-14 11:43:35 -05:00
|
|
|
nir_foreach_use(src, &instr->def) {
|
2020-06-17 14:46:55 +01:00
|
|
|
const nir_instr *const user_instr = src->parent_instr;
|
|
|
|
|
if (user_instr->type != nir_instr_type_alu)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
const nir_alu_instr *const user_alu = nir_instr_as_alu(user_instr);
|
|
|
|
|
assert(instr != user_alu);
|
|
|
|
|
|
2023-08-08 12:00:35 -05:00
|
|
|
unsigned index = (nir_alu_src *)container_of(src, nir_alu_src, src) - user_alu->src;
|
2021-10-15 17:23:51 +01:00
|
|
|
nir_alu_type type = nir_op_infos[user_alu->op].input_types[index];
|
|
|
|
|
if (nir_alu_type_get_base_type(type) != nir_type_float)
|
2020-06-17 14:46:55 +01:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
nir: Be smarter fusing ffma
If there is a single use of fmul, and that single use is fadd, it makes
sense to fuse ffma, as we already do. However, if there are multiple
uses, fusing may impede code gen. Consider the source fragment:
a = fmul(x, y)
b = fadd(a, z)
c = fmin(a, t)
d = fmax(b, c)
The fmul has two uses. The current ffma fusing is greedy and will
produce the following "optimized" code.
a = fmul(x, y)
b = ffma(x, y, z)
c = fmin(a, t)
d = fmax(b, c)
Actually, this code is worse! Instead of 1 fmul + 1 fadd, we now have 1
fmul + 1 ffma. In effect, two multiplies (and a fused add) instead of
one multiply and an add. Depending on the ISA, that could impede
scheduling or increase code size. It can also increase register
pressure, extending the live range.
It's tempting to gate on is_used_once, but that would hurt in cases
where we really do fuse everything, e.g.:
a = fmul(x, y)
b = fadd(a, z)
c = fadd(a, t)
For ISAs that fuse ffma, we expect that 2 ffma is faster than 1 fmul + 2
fadd. So what we really want is to fuse ffma iff the fmul will get
deleted. That occurs iff all uses of the fmul are fadd and will
themselves get fused to ffma, leaving fmul to get dead code eliminated.
That's easy to implement with a new NIR search helper, checking that all
uses are fadd.
shader-db results on Mali-G57 [open shader-db + subset of closed]:
total instructions in shared programs: 179491 -> 178991 (-0.28%)
instructions in affected programs: 36862 -> 36362 (-1.36%)
helped: 190
HURT: 27
total cycles in shared programs: 10573.20 -> 10571.75 (-0.01%)
cycles in affected programs: 72.02 -> 70.56 (-2.02%)
helped: 28
HURT: 1
total fma in shared programs: 1590.47 -> 1582.61 (-0.49%)
fma in affected programs: 319.95 -> 312.09 (-2.46%)
helped: 194
HURT: 1
total cvt in shared programs: 812.98 -> 813.03 (<.01%)
cvt in affected programs: 118.53 -> 118.58 (0.04%)
helped: 65
HURT: 81
total quadwords in shared programs: 98968 -> 98840 (-0.13%)
quadwords in affected programs: 2960 -> 2832 (-4.32%)
helped: 20
HURT: 4
total threads in shared programs: 4693 -> 4697 (0.09%)
threads in affected programs: 4 -> 8 (100.00%)
helped: 4
HURT: 0
v2: Update trace checksums for virgl due to numerical differences.
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18814>
2022-10-15 13:39:26 -04:00
|
|
|
static inline bool
|
|
|
|
|
is_only_used_by_fadd(const nir_alu_instr *instr)
|
|
|
|
|
{
|
2023-08-14 11:43:35 -05:00
|
|
|
nir_foreach_use(src, &instr->def) {
|
nir: Be smarter fusing ffma
If there is a single use of fmul, and that single use is fadd, it makes
sense to fuse ffma, as we already do. However, if there are multiple
uses, fusing may impede code gen. Consider the source fragment:
a = fmul(x, y)
b = fadd(a, z)
c = fmin(a, t)
d = fmax(b, c)
The fmul has two uses. The current ffma fusing is greedy and will
produce the following "optimized" code.
a = fmul(x, y)
b = ffma(x, y, z)
c = fmin(a, t)
d = fmax(b, c)
Actually, this code is worse! Instead of 1 fmul + 1 fadd, we now have 1
fmul + 1 ffma. In effect, two multiplies (and a fused add) instead of
one multiply and an add. Depending on the ISA, that could impede
scheduling or increase code size. It can also increase register
pressure, extending the live range.
It's tempting to gate on is_used_once, but that would hurt in cases
where we really do fuse everything, e.g.:
a = fmul(x, y)
b = fadd(a, z)
c = fadd(a, t)
For ISAs that fuse ffma, we expect that 2 ffma is faster than 1 fmul + 2
fadd. So what we really want is to fuse ffma iff the fmul will get
deleted. That occurs iff all uses of the fmul are fadd and will
themselves get fused to ffma, leaving fmul to get dead code eliminated.
That's easy to implement with a new NIR search helper, checking that all
uses are fadd.
shader-db results on Mali-G57 [open shader-db + subset of closed]:
total instructions in shared programs: 179491 -> 178991 (-0.28%)
instructions in affected programs: 36862 -> 36362 (-1.36%)
helped: 190
HURT: 27
total cycles in shared programs: 10573.20 -> 10571.75 (-0.01%)
cycles in affected programs: 72.02 -> 70.56 (-2.02%)
helped: 28
HURT: 1
total fma in shared programs: 1590.47 -> 1582.61 (-0.49%)
fma in affected programs: 319.95 -> 312.09 (-2.46%)
helped: 194
HURT: 1
total cvt in shared programs: 812.98 -> 813.03 (<.01%)
cvt in affected programs: 118.53 -> 118.58 (0.04%)
helped: 65
HURT: 81
total quadwords in shared programs: 98968 -> 98840 (-0.13%)
quadwords in affected programs: 2960 -> 2832 (-4.32%)
helped: 20
HURT: 4
total threads in shared programs: 4693 -> 4697 (0.09%)
threads in affected programs: 4 -> 8 (100.00%)
helped: 4
HURT: 0
v2: Update trace checksums for virgl due to numerical differences.
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18814>
2022-10-15 13:39:26 -04:00
|
|
|
const nir_instr *const user_instr = src->parent_instr;
|
|
|
|
|
if (user_instr->type != nir_instr_type_alu)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
const nir_alu_instr *const user_alu = nir_instr_as_alu(user_instr);
|
|
|
|
|
assert(instr != user_alu);
|
|
|
|
|
|
nir/opt_algebraic: Fuse c - a * b to FMA
Algebraically it is clear that
-(a * b) + c = (-a) * b + c = fma(-a, b, c)
But this is not clear from the NIR
('fadd', ('fneg', ('fmul', a, b)), c)
Add rules to handle this case specially. Note we don't necessarily want
to solve this by pushing fneg into fmul, because the rule opt_algebraic
(not the late part where FMA fusing happens) specifically pulls fneg out
of fmul to push fneg up multiplication chains.
Noticed in the big glmark2 "terrain" shader, which has a cycle count
reduced by 22% on Mali-G57 thanks to having this pattern a ton and being
FMA bound.
BEFORE: 1249 inst, 16.015625 cycles, 16.015625 fma, ... 632 quadwords
AFTER: 997 inst, 12.437500 cycles, .... 504 quadwords
Results on the same shader on AGX are also quite dramatic:
BEFORE: 1294 inst, 8600 bytes, 50 halfregs, ...
AFTER: 1154 inst, 8040 bytes, 50 halfregs, ...
Similar rules apply for fabs.
v2: Use a loop over the bit sizes (suggested by Emma).
shader-db on Valhall (open + small subset of closed), results on Bifrost
are similar:
total instructions in shared programs: 167975 -> 164970 (-1.79%)
instructions in affected programs: 92642 -> 89637 (-3.24%)
helped: 492
HURT: 25
helped stats (abs) min: 1.0 max: 252.0 x̄: 6.25 x̃: 3
helped stats (rel) min: 0.30% max: 20.18% x̄: 3.21% x̃: 2.91%
HURT stats (abs) min: 1.0 max: 5.0 x̄: 2.80 x̃: 3
HURT stats (rel) min: 0.46% max: 9.09% x̄: 3.89% x̃: 3.37%
95% mean confidence interval for instructions value: -6.95 -4.68
95% mean confidence interval for instructions %-change: -3.08% -2.65%
Instructions are helped.
total cycles in shared programs: 10556.89 -> 10538.98 (-0.17%)
cycles in affected programs: 265.56 -> 247.66 (-6.74%)
helped: 88
HURT: 2
helped stats (abs) min: 0.015625 max: 3.578125 x̄: 0.20 x̃: 0
helped stats (rel) min: 0.65% max: 22.34% x̄: 5.65% x̃: 4.25%
HURT stats (abs) min: 0.0625 max: 0.0625 x̄: 0.06 x̃: 0
HURT stats (rel) min: 8.33% max: 12.50% x̄: 10.42% x̃: 10.42%
95% mean confidence interval for cycles value: -0.28 -0.12
95% mean confidence interval for cycles %-change: -6.30% -4.30%
Cycles are helped.
total fma in shared programs: 1582.42 -> 1535.06 (-2.99%)
fma in affected programs: 871.58 -> 824.22 (-5.43%)
helped: 502
HURT: 9
helped stats (abs) min: 0.015625 max: 3.578125 x̄: 0.09 x̃: 0
helped stats (rel) min: 0.60% max: 25.00% x̄: 5.46% x̃: 4.82%
HURT stats (abs) min: 0.015625 max: 0.0625 x̄: 0.03 x̃: 0
HURT stats (rel) min: 4.35% max: 12.50% x̄: 6.22% x̃: 4.35%
95% mean confidence interval for fma value: -0.11 -0.08
95% mean confidence interval for fma %-change: -5.58% -4.93%
Fma are helped.
total cvt in shared programs: 665.55 -> 665.95 (0.06%)
cvt in affected programs: 61.72 -> 62.12 (0.66%)
helped: 33
HURT: 43
helped stats (abs) min: 0.015625 max: 0.359375 x̄: 0.04 x̃: 0
helped stats (rel) min: 1.01% max: 25.00% x̄: 6.68% x̃: 4.35%
HURT stats (abs) min: 0.015625 max: 0.109375 x̄: 0.04 x̃: 0
HURT stats (rel) min: 0.78% max: 38.46% x̄: 10.85% x̃: 6.90%
95% mean confidence interval for cvt value: -0.01 0.02
95% mean confidence interval for cvt %-change: 0.23% 6.24%
Inconclusive result (value mean confidence interval includes 0).
total quadwords in shared programs: 93376 -> 91736 (-1.76%)
quadwords in affected programs: 25376 -> 23736 (-6.46%)
helped: 169
HURT: 1
helped stats (abs) min: 8.0 max: 128.0 x̄: 9.75 x̃: 8
helped stats (rel) min: 1.52% max: 33.33% x̄: 8.35% x̃: 8.00%
HURT stats (abs) min: 8.0 max: 8.0 x̄: 8.00 x̃: 8
HURT stats (rel) min: 25.00% max: 25.00% x̄: 25.00% x̃: 25.00%
95% mean confidence interval for quadwords value: -11.18 -8.11
95% mean confidence interval for quadwords %-change: -8.95% -7.36%
Quadwords are helped.
total threads in shared programs: 4697 -> 4701 (0.09%)
threads in affected programs: 4 -> 8 (100.00%)
helped: 4
HURT: 0
helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1
helped stats (rel) min: 100.00% max: 100.00% x̄: 100.00% x̃: 100.00%
95% mean confidence interval for threads value: 1.00 1.00
95% mean confidence interval for threads %-change: 100.00% 100.00%
Threads are helped.
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com> [v1]
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19312>
2022-10-25 22:29:31 -04:00
|
|
|
if (user_alu->op == nir_op_fneg || user_alu->op == nir_op_fabs) {
|
|
|
|
|
if (!is_only_used_by_fadd(user_alu))
|
|
|
|
|
return false;
|
|
|
|
|
} else if (user_alu->op != nir_op_fadd) {
|
nir: Be smarter fusing ffma
If there is a single use of fmul, and that single use is fadd, it makes
sense to fuse ffma, as we already do. However, if there are multiple
uses, fusing may impede code gen. Consider the source fragment:
a = fmul(x, y)
b = fadd(a, z)
c = fmin(a, t)
d = fmax(b, c)
The fmul has two uses. The current ffma fusing is greedy and will
produce the following "optimized" code.
a = fmul(x, y)
b = ffma(x, y, z)
c = fmin(a, t)
d = fmax(b, c)
Actually, this code is worse! Instead of 1 fmul + 1 fadd, we now have 1
fmul + 1 ffma. In effect, two multiplies (and a fused add) instead of
one multiply and an add. Depending on the ISA, that could impede
scheduling or increase code size. It can also increase register
pressure, extending the live range.
It's tempting to gate on is_used_once, but that would hurt in cases
where we really do fuse everything, e.g.:
a = fmul(x, y)
b = fadd(a, z)
c = fadd(a, t)
For ISAs that fuse ffma, we expect that 2 ffma is faster than 1 fmul + 2
fadd. So what we really want is to fuse ffma iff the fmul will get
deleted. That occurs iff all uses of the fmul are fadd and will
themselves get fused to ffma, leaving fmul to get dead code eliminated.
That's easy to implement with a new NIR search helper, checking that all
uses are fadd.
shader-db results on Mali-G57 [open shader-db + subset of closed]:
total instructions in shared programs: 179491 -> 178991 (-0.28%)
instructions in affected programs: 36862 -> 36362 (-1.36%)
helped: 190
HURT: 27
total cycles in shared programs: 10573.20 -> 10571.75 (-0.01%)
cycles in affected programs: 72.02 -> 70.56 (-2.02%)
helped: 28
HURT: 1
total fma in shared programs: 1590.47 -> 1582.61 (-0.49%)
fma in affected programs: 319.95 -> 312.09 (-2.46%)
helped: 194
HURT: 1
total cvt in shared programs: 812.98 -> 813.03 (<.01%)
cvt in affected programs: 118.53 -> 118.58 (0.04%)
helped: 65
HURT: 81
total quadwords in shared programs: 98968 -> 98840 (-0.13%)
quadwords in affected programs: 2960 -> 2832 (-4.32%)
helped: 20
HURT: 4
total threads in shared programs: 4693 -> 4697 (0.09%)
threads in affected programs: 4 -> 8 (100.00%)
helped: 4
HURT: 0
v2: Update trace checksums for virgl due to numerical differences.
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18814>
2022-10-15 13:39:26 -04:00
|
|
|
return false;
|
nir/opt_algebraic: Fuse c - a * b to FMA
Algebraically it is clear that
-(a * b) + c = (-a) * b + c = fma(-a, b, c)
But this is not clear from the NIR
('fadd', ('fneg', ('fmul', a, b)), c)
Add rules to handle this case specially. Note we don't necessarily want
to solve this by pushing fneg into fmul, because the rule opt_algebraic
(not the late part where FMA fusing happens) specifically pulls fneg out
of fmul to push fneg up multiplication chains.
Noticed in the big glmark2 "terrain" shader, which has a cycle count
reduced by 22% on Mali-G57 thanks to having this pattern a ton and being
FMA bound.
BEFORE: 1249 inst, 16.015625 cycles, 16.015625 fma, ... 632 quadwords
AFTER: 997 inst, 12.437500 cycles, .... 504 quadwords
Results on the same shader on AGX are also quite dramatic:
BEFORE: 1294 inst, 8600 bytes, 50 halfregs, ...
AFTER: 1154 inst, 8040 bytes, 50 halfregs, ...
Similar rules apply for fabs.
v2: Use a loop over the bit sizes (suggested by Emma).
shader-db on Valhall (open + small subset of closed), results on Bifrost
are similar:
total instructions in shared programs: 167975 -> 164970 (-1.79%)
instructions in affected programs: 92642 -> 89637 (-3.24%)
helped: 492
HURT: 25
helped stats (abs) min: 1.0 max: 252.0 x̄: 6.25 x̃: 3
helped stats (rel) min: 0.30% max: 20.18% x̄: 3.21% x̃: 2.91%
HURT stats (abs) min: 1.0 max: 5.0 x̄: 2.80 x̃: 3
HURT stats (rel) min: 0.46% max: 9.09% x̄: 3.89% x̃: 3.37%
95% mean confidence interval for instructions value: -6.95 -4.68
95% mean confidence interval for instructions %-change: -3.08% -2.65%
Instructions are helped.
total cycles in shared programs: 10556.89 -> 10538.98 (-0.17%)
cycles in affected programs: 265.56 -> 247.66 (-6.74%)
helped: 88
HURT: 2
helped stats (abs) min: 0.015625 max: 3.578125 x̄: 0.20 x̃: 0
helped stats (rel) min: 0.65% max: 22.34% x̄: 5.65% x̃: 4.25%
HURT stats (abs) min: 0.0625 max: 0.0625 x̄: 0.06 x̃: 0
HURT stats (rel) min: 8.33% max: 12.50% x̄: 10.42% x̃: 10.42%
95% mean confidence interval for cycles value: -0.28 -0.12
95% mean confidence interval for cycles %-change: -6.30% -4.30%
Cycles are helped.
total fma in shared programs: 1582.42 -> 1535.06 (-2.99%)
fma in affected programs: 871.58 -> 824.22 (-5.43%)
helped: 502
HURT: 9
helped stats (abs) min: 0.015625 max: 3.578125 x̄: 0.09 x̃: 0
helped stats (rel) min: 0.60% max: 25.00% x̄: 5.46% x̃: 4.82%
HURT stats (abs) min: 0.015625 max: 0.0625 x̄: 0.03 x̃: 0
HURT stats (rel) min: 4.35% max: 12.50% x̄: 6.22% x̃: 4.35%
95% mean confidence interval for fma value: -0.11 -0.08
95% mean confidence interval for fma %-change: -5.58% -4.93%
Fma are helped.
total cvt in shared programs: 665.55 -> 665.95 (0.06%)
cvt in affected programs: 61.72 -> 62.12 (0.66%)
helped: 33
HURT: 43
helped stats (abs) min: 0.015625 max: 0.359375 x̄: 0.04 x̃: 0
helped stats (rel) min: 1.01% max: 25.00% x̄: 6.68% x̃: 4.35%
HURT stats (abs) min: 0.015625 max: 0.109375 x̄: 0.04 x̃: 0
HURT stats (rel) min: 0.78% max: 38.46% x̄: 10.85% x̃: 6.90%
95% mean confidence interval for cvt value: -0.01 0.02
95% mean confidence interval for cvt %-change: 0.23% 6.24%
Inconclusive result (value mean confidence interval includes 0).
total quadwords in shared programs: 93376 -> 91736 (-1.76%)
quadwords in affected programs: 25376 -> 23736 (-6.46%)
helped: 169
HURT: 1
helped stats (abs) min: 8.0 max: 128.0 x̄: 9.75 x̃: 8
helped stats (rel) min: 1.52% max: 33.33% x̄: 8.35% x̃: 8.00%
HURT stats (abs) min: 8.0 max: 8.0 x̄: 8.00 x̃: 8
HURT stats (rel) min: 25.00% max: 25.00% x̄: 25.00% x̃: 25.00%
95% mean confidence interval for quadwords value: -11.18 -8.11
95% mean confidence interval for quadwords %-change: -8.95% -7.36%
Quadwords are helped.
total threads in shared programs: 4697 -> 4701 (0.09%)
threads in affected programs: 4 -> 8 (100.00%)
helped: 4
HURT: 0
helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1
helped stats (rel) min: 100.00% max: 100.00% x̄: 100.00% x̃: 100.00%
95% mean confidence interval for threads value: 1.00 1.00
95% mean confidence interval for threads %-change: 100.00% 100.00%
Threads are helped.
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com> [v1]
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19312>
2022-10-25 22:29:31 -04:00
|
|
|
}
|
nir: Be smarter fusing ffma
If there is a single use of fmul, and that single use is fadd, it makes
sense to fuse ffma, as we already do. However, if there are multiple
uses, fusing may impede code gen. Consider the source fragment:
a = fmul(x, y)
b = fadd(a, z)
c = fmin(a, t)
d = fmax(b, c)
The fmul has two uses. The current ffma fusing is greedy and will
produce the following "optimized" code.
a = fmul(x, y)
b = ffma(x, y, z)
c = fmin(a, t)
d = fmax(b, c)
Actually, this code is worse! Instead of 1 fmul + 1 fadd, we now have 1
fmul + 1 ffma. In effect, two multiplies (and a fused add) instead of
one multiply and an add. Depending on the ISA, that could impede
scheduling or increase code size. It can also increase register
pressure, extending the live range.
It's tempting to gate on is_used_once, but that would hurt in cases
where we really do fuse everything, e.g.:
a = fmul(x, y)
b = fadd(a, z)
c = fadd(a, t)
For ISAs that fuse ffma, we expect that 2 ffma is faster than 1 fmul + 2
fadd. So what we really want is to fuse ffma iff the fmul will get
deleted. That occurs iff all uses of the fmul are fadd and will
themselves get fused to ffma, leaving fmul to get dead code eliminated.
That's easy to implement with a new NIR search helper, checking that all
uses are fadd.
shader-db results on Mali-G57 [open shader-db + subset of closed]:
total instructions in shared programs: 179491 -> 178991 (-0.28%)
instructions in affected programs: 36862 -> 36362 (-1.36%)
helped: 190
HURT: 27
total cycles in shared programs: 10573.20 -> 10571.75 (-0.01%)
cycles in affected programs: 72.02 -> 70.56 (-2.02%)
helped: 28
HURT: 1
total fma in shared programs: 1590.47 -> 1582.61 (-0.49%)
fma in affected programs: 319.95 -> 312.09 (-2.46%)
helped: 194
HURT: 1
total cvt in shared programs: 812.98 -> 813.03 (<.01%)
cvt in affected programs: 118.53 -> 118.58 (0.04%)
helped: 65
HURT: 81
total quadwords in shared programs: 98968 -> 98840 (-0.13%)
quadwords in affected programs: 2960 -> 2832 (-4.32%)
helped: 20
HURT: 4
total threads in shared programs: 4693 -> 4697 (0.09%)
threads in affected programs: 4 -> 8 (100.00%)
helped: 4
HURT: 0
v2: Update trace checksums for virgl due to numerical differences.
Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18814>
2022-10-15 13:39:26 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-04 11:55:43 -06:00
|
|
|
static inline bool
|
2021-11-08 12:40:16 -08:00
|
|
|
only_lower_8_bits_used(const nir_alu_instr *instr)
|
2021-02-04 11:55:43 -06:00
|
|
|
{
|
2023-08-14 11:43:35 -05:00
|
|
|
return (nir_def_bits_used(&instr->def) & ~0xffull) == 0;
|
2021-02-04 11:55:43 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline bool
|
2021-11-08 12:40:16 -08:00
|
|
|
only_lower_16_bits_used(const nir_alu_instr *instr)
|
2021-02-04 11:55:43 -06:00
|
|
|
{
|
2023-08-14 11:43:35 -05:00
|
|
|
return (nir_def_bits_used(&instr->def) & ~0xffffull) == 0;
|
2021-02-04 11:55:43 -06:00
|
|
|
}
|
|
|
|
|
|
2019-05-13 00:09:38 +02:00
|
|
|
/**
|
|
|
|
|
* Returns true if a NIR ALU src represents a constant integer
|
|
|
|
|
* of either 32 or 64 bits, and the higher word (bit-size / 2)
|
|
|
|
|
* of all its components is zero.
|
|
|
|
|
*/
|
|
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_upper_half_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
|
|
|
|
const uint8_t *swizzle)
|
2019-05-13 00:09:38 +02:00
|
|
|
{
|
|
|
|
|
if (nir_src_as_const_value(instr->src[src].src) == NULL)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
|
|
|
|
unsigned half_bit_size = nir_src_bit_size(instr->src[src].src) / 2;
|
2022-11-15 12:10:40 +00:00
|
|
|
uint64_t high_bits = u_bit_consecutive64(half_bit_size, half_bit_size);
|
2019-05-13 00:09:38 +02:00
|
|
|
if ((nir_src_comp_as_uint(instr->src[src].src,
|
2023-08-08 12:00:35 -05:00
|
|
|
swizzle[i]) &
|
|
|
|
|
high_bits) != 0) {
|
2019-05-13 00:09:38 +02:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns true if a NIR ALU src represents a constant integer
|
|
|
|
|
* of either 32 or 64 bits, and the lower word (bit-size / 2)
|
|
|
|
|
* of all its components is zero.
|
|
|
|
|
*/
|
|
|
|
|
static inline bool
|
2020-05-12 12:48:17 -07:00
|
|
|
is_lower_half_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
|
|
|
|
const uint8_t *swizzle)
|
2019-05-13 00:09:38 +02:00
|
|
|
{
|
|
|
|
|
if (nir_src_as_const_value(instr->src[src].src) == NULL)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
2022-11-15 12:10:40 +00:00
|
|
|
uint64_t low_bits = u_bit_consecutive64(0, nir_src_bit_size(instr->src[src].src) / 2);
|
|
|
|
|
if ((nir_src_comp_as_uint(instr->src[src].src, swizzle[i]) & low_bits) != 0)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline bool
|
|
|
|
|
is_upper_half_negative_one(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
|
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
if (nir_src_as_const_value(instr->src[src].src) == NULL)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
|
|
|
|
unsigned half_bit_size = nir_src_bit_size(instr->src[src].src) / 2;
|
|
|
|
|
uint64_t high_bits = u_bit_consecutive64(half_bit_size, half_bit_size);
|
|
|
|
|
if ((nir_src_comp_as_uint(instr->src[src].src,
|
2023-08-08 12:00:35 -05:00
|
|
|
swizzle[i]) &
|
|
|
|
|
high_bits) != high_bits) {
|
2022-11-15 12:10:40 +00:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline bool
|
|
|
|
|
is_lower_half_negative_one(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, unsigned num_components,
|
|
|
|
|
const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
if (nir_src_as_const_value(instr->src[src].src) == NULL)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
|
|
|
|
uint64_t low_bits = u_bit_consecutive64(0, nir_src_bit_size(instr->src[src].src) / 2);
|
|
|
|
|
if ((nir_src_comp_as_uint(instr->src[src].src, swizzle[i]) & low_bits) != low_bits)
|
2019-05-13 00:09:38 +02:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-06-12 16:48:21 -07:00
|
|
|
/** Returns the instruction's no_signed_wrap flag. */
static inline bool
no_signed_wrap(const nir_alu_instr *instr)
{
   return instr->no_signed_wrap;
}
|
|
|
|
|
|
|
|
|
|
/** Returns the instruction's no_unsigned_wrap flag. */
static inline bool
no_unsigned_wrap(const nir_alu_instr *instr)
{
   return instr->no_unsigned_wrap;
}
|
|
|
|
|
|
2018-01-23 09:48:43 +08:00
|
|
|
/**
 * True if range analysis reports the source as integral-valued.
 *
 * `ht` caches previously computed range-analysis results across queries.
 */
static inline bool
is_integral(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
            UNUSED unsigned num_components, UNUSED const uint8_t *swizzle)
{
   const struct ssa_result_range r = nir_analyze_range(ht, instr, src);

   return r.is_integral;
}
|
|
|
|
|
|
2020-08-10 18:34:37 -07:00
|
|
|
/**
 * Is the value finite?
 *
 * True when range analysis reports the source's is_finite flag.
 */
static inline bool
is_finite(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
          unsigned src, UNUSED unsigned num_components,
          UNUSED const uint8_t *swizzle)
{
   const struct ssa_result_range v = nir_analyze_range(ht, instr, src);

   return v.is_finite;
}
|
|
|
|
|
|
2021-09-14 18:02:01 +01:00
|
|
|
static inline bool
|
|
|
|
|
is_finite_not_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, UNUSED unsigned num_components,
|
|
|
|
|
UNUSED const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
|
|
|
|
|
|
|
|
|
|
return v.is_finite &&
|
|
|
|
|
(v.range == lt_zero || v.range == gt_zero || v.range == ne_zero);
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-08 12:00:35 -05:00
|
|
|
/*
 * Expands to a pair of search helpers for the range relation `r`:
 *
 *   is_<r>()          - range analysis proves the relation holds
 *   is_a_number_<r>() - additionally requires is_a_number (i.e. the value
 *                       is known not to be NaN)
 */
#define RELATION(r)                                                        \
   static inline bool                                                      \
   is_##r(struct hash_table *ht, const nir_alu_instr *instr,               \
          unsigned src, UNUSED unsigned num_components,                    \
          UNUSED const uint8_t *swizzle)                                   \
   {                                                                       \
      const struct ssa_result_range v = nir_analyze_range(ht, instr, src); \
      return v.range == r;                                                 \
   }                                                                       \
                                                                           \
   static inline bool                                                      \
   is_a_number_##r(struct hash_table *ht, const nir_alu_instr *instr,      \
                   unsigned src, UNUSED unsigned num_components,           \
                   UNUSED const uint8_t *swizzle)                          \
   {                                                                       \
      const struct ssa_result_range v = nir_analyze_range(ht, instr, src); \
      return v.is_a_number && v.range == r;                                \
   }
|
2018-01-23 09:48:43 +08:00
|
|
|
|
|
|
|
|
/* Instantiate is_{lt,le,gt,ge,ne}_zero and is_a_number_{lt,le,gt,ge,ne}_zero. */
RELATION(lt_zero)
RELATION(le_zero)
RELATION(gt_zero)
RELATION(ge_zero)
RELATION(ne_zero)
|
|
|
|
|
|
|
|
|
|
/**
 * True if range analysis proves the source is >= 0.
 *
 * Unlike is_a_number_not_negative, this does not also require is_a_number.
 */
static inline bool
is_not_negative(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
                UNUSED unsigned num_components, UNUSED const uint8_t *swizzle)
{
   const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
   return v.range == ge_zero || v.range == gt_zero || v.range == eq_zero;
}
|
|
|
|
|
|
2021-02-26 07:12:42 -08:00
|
|
|
static inline bool
|
|
|
|
|
is_a_number_not_negative(struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, UNUSED unsigned num_components,
|
|
|
|
|
UNUSED const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
|
|
|
|
|
return v.is_a_number &&
|
|
|
|
|
(v.range == ge_zero || v.range == gt_zero || v.range == eq_zero);
|
|
|
|
|
}
|
|
|
|
|
|
2018-01-23 09:48:43 +08:00
|
|
|
/**
 * True if range analysis proves the source is <= 0.
 *
 * Unlike is_a_number_not_positive, this does not also require is_a_number.
 */
static inline bool
is_not_positive(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
                UNUSED unsigned num_components, UNUSED const uint8_t *swizzle)
{
   const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
   return v.range == le_zero || v.range == lt_zero || v.range == eq_zero;
}
|
|
|
|
|
|
2021-02-26 07:12:42 -08:00
|
|
|
static inline bool
|
|
|
|
|
is_a_number_not_positive(struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, UNUSED unsigned num_components,
|
|
|
|
|
UNUSED const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
|
|
|
|
|
return v.is_a_number &&
|
|
|
|
|
(v.range == le_zero || v.range == lt_zero || v.range == eq_zero);
|
|
|
|
|
}
|
|
|
|
|
|
2018-01-23 09:48:43 +08:00
|
|
|
/**
 * True if range analysis proves the source is nonzero.
 *
 * Unlike is_a_number_not_zero, this does not also require is_a_number.
 */
static inline bool
is_not_zero(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
            UNUSED unsigned num_components, UNUSED const uint8_t *swizzle)
{
   const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
   return v.range == lt_zero || v.range == gt_zero || v.range == ne_zero;
}
|
|
|
|
|
|
2021-02-26 07:12:42 -08:00
|
|
|
static inline bool
|
|
|
|
|
is_a_number_not_zero(struct hash_table *ht, const nir_alu_instr *instr,
|
|
|
|
|
unsigned src, UNUSED unsigned num_components,
|
|
|
|
|
UNUSED const uint8_t *swizzle)
|
|
|
|
|
{
|
|
|
|
|
const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
|
|
|
|
|
return v.is_a_number &&
|
|
|
|
|
(v.range == lt_zero || v.range == gt_zero || v.range == ne_zero);
|
|
|
|
|
}
|
|
|
|
|
|
2020-07-02 18:18:09 -07:00
|
|
|
/** True if range analysis reports the source's is_a_number flag (not NaN). */
static inline bool
is_a_number(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
            UNUSED unsigned num_components, UNUSED const uint8_t *swizzle)
{
   const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
   return v.is_a_number;
}
|
|
|
|
|
|
2016-05-07 13:01:24 -04:00
|
|
|
#endif /* _NIR_SEARCH_HELPERS_ */
|