2018-05-08 11:24:40 -07:00
|
|
|
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2018 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
|
|
|
|
|
|
|
|
|
|
#include "nir.h"
|
|
|
|
|
#include "nir_builder.h"
|
|
|
|
|
|
|
|
|
|
/** nir_lower_alu.c
 *
 * NIR's home for miscellaneous ALU operation lowering implementations.
 *
 * Most NIR ALU lowering occurs in nir_opt_algebraic.py, since it's generally
 * easy to write them there.  However, if terms appear multiple times in the
 * lowered code, it can get very verbose and cause a lot of work for CSE, so
 * it may end up being easier to write out in C code.
 *
 * The shader must be in SSA for this pass.
 */
|
|
|
|
|
|
|
|
|
|
static bool
|
2024-08-08 22:28:13 -04:00
|
|
|
lower_alu_instr(nir_builder *b, nir_alu_instr *instr, UNUSED void *cb_data)
|
2018-05-08 11:24:40 -07:00
|
|
|
{
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *lowered = NULL;
|
2018-05-08 11:24:40 -07:00
|
|
|
|
|
|
|
|
b->cursor = nir_before_instr(&instr->instr);
|
2025-12-18 17:36:51 +01:00
|
|
|
b->fp_math_ctrl = instr->fp_math_ctrl;
|
2018-05-08 11:24:40 -07:00
|
|
|
|
|
|
|
|
switch (instr->op) {
|
2018-05-08 12:47:48 -07:00
|
|
|
case nir_op_bitfield_reverse:
|
|
|
|
|
if (b->shader->options->lower_bitfield_reverse) {
|
2025-04-10 15:37:59 -04:00
|
|
|
assert(instr->def.bit_size == 32);
|
|
|
|
|
|
2018-05-08 12:47:48 -07:00
|
|
|
/* For more details, see:
|
|
|
|
|
*
|
|
|
|
|
* http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
|
|
|
|
|
*/
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *c1 = nir_imm_int(b, 1);
|
|
|
|
|
nir_def *c2 = nir_imm_int(b, 2);
|
|
|
|
|
nir_def *c4 = nir_imm_int(b, 4);
|
|
|
|
|
nir_def *c8 = nir_imm_int(b, 8);
|
|
|
|
|
nir_def *c16 = nir_imm_int(b, 16);
|
|
|
|
|
nir_def *c33333333 = nir_imm_int(b, 0x33333333);
|
|
|
|
|
nir_def *c55555555 = nir_imm_int(b, 0x55555555);
|
|
|
|
|
nir_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f);
|
|
|
|
|
nir_def *c00ff00ff = nir_imm_int(b, 0x00ff00ff);
|
2018-05-08 12:47:48 -07:00
|
|
|
|
|
|
|
|
lowered = nir_ssa_for_alu_src(b, instr, 0);
|
|
|
|
|
|
|
|
|
|
/* Swap odd and even bits. */
|
|
|
|
|
lowered = nir_ior(b,
|
|
|
|
|
nir_iand(b, nir_ushr(b, lowered, c1), c55555555),
|
|
|
|
|
nir_ishl(b, nir_iand(b, lowered, c55555555), c1));
|
|
|
|
|
|
|
|
|
|
/* Swap consecutive pairs. */
|
|
|
|
|
lowered = nir_ior(b,
|
|
|
|
|
nir_iand(b, nir_ushr(b, lowered, c2), c33333333),
|
|
|
|
|
nir_ishl(b, nir_iand(b, lowered, c33333333), c2));
|
|
|
|
|
|
|
|
|
|
/* Swap nibbles. */
|
|
|
|
|
lowered = nir_ior(b,
|
|
|
|
|
nir_iand(b, nir_ushr(b, lowered, c4), c0f0f0f0f),
|
|
|
|
|
nir_ishl(b, nir_iand(b, lowered, c0f0f0f0f), c4));
|
|
|
|
|
|
|
|
|
|
/* Swap bytes. */
|
|
|
|
|
lowered = nir_ior(b,
|
|
|
|
|
nir_iand(b, nir_ushr(b, lowered, c8), c00ff00ff),
|
|
|
|
|
nir_ishl(b, nir_iand(b, lowered, c00ff00ff), c8));
|
|
|
|
|
|
|
|
|
|
lowered = nir_ior(b,
|
|
|
|
|
nir_ushr(b, lowered, c16),
|
|
|
|
|
nir_ishl(b, lowered, c16));
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2018-05-08 13:04:37 -07:00
|
|
|
case nir_op_bit_count:
|
|
|
|
|
if (b->shader->options->lower_bit_count) {
|
|
|
|
|
/* For more details, see:
|
|
|
|
|
*
|
|
|
|
|
* http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
|
|
|
|
*/
|
2024-06-15 18:49:06 +02:00
|
|
|
|
|
|
|
|
lowered = nir_ssa_for_alu_src(b, instr, 0);
|
|
|
|
|
unsigned bit_size = lowered->bit_size;
|
|
|
|
|
|
2018-05-08 13:04:37 -07:00
|
|
|
lowered = nir_isub(b, lowered,
|
2025-04-15 12:37:24 -04:00
|
|
|
nir_iand_imm(b, nir_ushr_imm(b, lowered, 1), 0x55555555));
|
|
|
|
|
|
|
|
|
|
lowered = nir_iadd(b, nir_iand_imm(b, lowered, 0x33333333),
|
|
|
|
|
nir_iand_imm(b, nir_ushr_imm(b, lowered, 2), 0x33333333));
|
|
|
|
|
|
|
|
|
|
lowered = nir_iadd(b, lowered, nir_ushr_imm(b, lowered, 4));
|
|
|
|
|
|
|
|
|
|
lowered = nir_iand_imm(b, lowered, 0x0f0f0f0f);
|
|
|
|
|
lowered = nir_imul_imm(b, lowered, 0x01010101);
|
|
|
|
|
lowered = nir_u2u32(b, nir_ushr_imm(b, lowered, bit_size - 8));
|
2018-05-08 13:04:37 -07:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2018-05-08 11:24:40 -07:00
|
|
|
case nir_op_imul_high:
|
|
|
|
|
case nir_op_umul_high:
|
|
|
|
|
if (b->shader->options->lower_mul_high) {
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *src0 = nir_ssa_for_alu_src(b, instr, 0);
|
|
|
|
|
nir_def *src1 = nir_ssa_for_alu_src(b, instr, 1);
|
2020-06-22 14:59:39 -07:00
|
|
|
if (src0->bit_size < 32) {
|
|
|
|
|
/* Just do the math in 32-bit space and shift the result */
|
|
|
|
|
nir_alu_type base_type = nir_op_infos[instr->op].output_type;
|
|
|
|
|
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *src0_32 = nir_type_convert(b, src0, base_type, base_type | 32, nir_rounding_mode_undef);
|
|
|
|
|
nir_def *src1_32 = nir_type_convert(b, src1, base_type, base_type | 32, nir_rounding_mode_undef);
|
|
|
|
|
nir_def *dest_32 = nir_imul(b, src0_32, src1_32);
|
|
|
|
|
nir_def *dest_shifted = nir_ishr_imm(b, dest_32, src0->bit_size);
|
2022-11-01 18:12:19 -07:00
|
|
|
lowered = nir_type_convert(b, dest_shifted, base_type, base_type | src0->bit_size, nir_rounding_mode_undef);
|
2020-06-22 14:59:39 -07:00
|
|
|
} else {
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *cshift = nir_imm_int(b, src0->bit_size / 2);
|
|
|
|
|
nir_def *cmask = nir_imm_intN_t(b, (1ull << (src0->bit_size / 2)) - 1, src0->bit_size);
|
nir/lower_alu: use Knuth's Algorithm M for [iu]mul_high
This significantly simplifies the handling of signed numbers as the same
code path can handle signed and unsigned numbers by simply using ishr
instead of ushr for some of the shifts. For both cases, the number of
additions and shifts are also reduced.
Note that LLVM uses the same algorithm.
fossil-db stats for Turnip:
Totals from 4849 (2.94% of 164705) affected shaders:
MaxWaves: 52318 -> 52332 (+0.03%); split: +0.04%, -0.02%
Instrs: 5262458 -> 5218922 (-0.83%); split: -0.87%, +0.05%
CodeSize: 10831900 -> 10655170 (-1.63%); split: -1.64%, +0.01%
NOPs: 829481 -> 836010 (+0.79%); split: -0.95%, +1.74%
MOVs: 176187 -> 173788 (-1.36%); split: -3.27%, +1.91%
COVs: 104096 -> 86543 (-16.86%); split: -16.87%, +0.01%
Full: 90434 -> 90158 (-0.31%); split: -0.33%, +0.03%
(ss): 131091 -> 130866 (-0.17%); split: -0.87%, +0.70%
(sy): 55550 -> 55769 (+0.39%); split: -0.92%, +1.32%
(ss)-stall: 406003 -> 407194 (+0.29%); split: -1.10%, +1.39%
(sy)-stall: 1668213 -> 1678082 (+0.59%); split: -1.31%, +1.90%
Preamble Instrs: 1105270 -> 1067290 (-3.44%); split: -3.50%, +0.06%
Constlen: 423776 -> 423560 (-0.05%)
Last helper: 1038202 -> 1035540 (-0.26%); split: -0.42%, +0.16%
Last baryf: 38908 -> 38632 (-0.71%)
Subgroup size: 336640 -> 336832 (+0.06%)
Cat0: 916209 -> 922848 (+0.72%); split: -0.87%, +1.59%
Cat1: 282813 -> 262845 (-7.06%); split: -7.49%, +0.43%
Cat2: 2198715 -> 2183012 (-0.71%); split: -0.72%, +0.01%
Cat3: 1390914 -> 1376421 (-1.04%)
Cat7: 123127 -> 123116 (-0.01%); split: -0.24%, +0.23%
Signed-off-by: Job Noorman <jnoorman@igalia.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37793>
2025-10-10 07:30:18 +02:00
|
|
|
|
|
|
|
|
/* Taken from Figure 8-2 in Hacker's Delight, which is derived
|
|
|
|
|
* from Knuth's Algorithm M.
|
2018-05-08 11:24:40 -07:00
|
|
|
*/
|
nir/lower_alu: use Knuth's Algorithm M for [iu]mul_high
This significantly simplifies the handling of signed numbers as the same
code path can handle signed and unsigned numbers by simply using ishr
instead of ushr for some of the shifts. For both cases, the number of
additions and shifts are also reduced.
Note that LLVM uses the same algorithm.
fossil-db stats for Turnip:
Totals from 4849 (2.94% of 164705) affected shaders:
MaxWaves: 52318 -> 52332 (+0.03%); split: +0.04%, -0.02%
Instrs: 5262458 -> 5218922 (-0.83%); split: -0.87%, +0.05%
CodeSize: 10831900 -> 10655170 (-1.63%); split: -1.64%, +0.01%
NOPs: 829481 -> 836010 (+0.79%); split: -0.95%, +1.74%
MOVs: 176187 -> 173788 (-1.36%); split: -3.27%, +1.91%
COVs: 104096 -> 86543 (-16.86%); split: -16.87%, +0.01%
Full: 90434 -> 90158 (-0.31%); split: -0.33%, +0.03%
(ss): 131091 -> 130866 (-0.17%); split: -0.87%, +0.70%
(sy): 55550 -> 55769 (+0.39%); split: -0.92%, +1.32%
(ss)-stall: 406003 -> 407194 (+0.29%); split: -1.10%, +1.39%
(sy)-stall: 1668213 -> 1678082 (+0.59%); split: -1.31%, +1.90%
Preamble Instrs: 1105270 -> 1067290 (-3.44%); split: -3.50%, +0.06%
Constlen: 423776 -> 423560 (-0.05%)
Last helper: 1038202 -> 1035540 (-0.26%); split: -0.42%, +0.16%
Last baryf: 38908 -> 38632 (-0.71%)
Subgroup size: 336640 -> 336832 (+0.06%)
Cat0: 916209 -> 922848 (+0.72%); split: -0.87%, +1.59%
Cat1: 282813 -> 262845 (-7.06%); split: -7.49%, +0.43%
Cat2: 2198715 -> 2183012 (-0.71%); split: -0.72%, +0.01%
Cat3: 1390914 -> 1376421 (-1.04%)
Cat7: 123127 -> 123116 (-0.01%); split: -0.24%, +0.23%
Signed-off-by: Job Noorman <jnoorman@igalia.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37793>
2025-10-10 07:30:18 +02:00
|
|
|
bool is_signed = instr->op == nir_op_imul_high;
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *src0l = nir_iand(b, src0, cmask);
|
|
|
|
|
nir_def *src1l = nir_iand(b, src1, cmask);
|
nir/lower_alu: use Knuth's Algorithm M for [iu]mul_high
This significantly simplifies the handling of signed numbers as the same
code path can handle signed and unsigned numbers by simply using ishr
instead of ushr for some of the shifts. For both cases, the number of
additions and shifts are also reduced.
Note that LLVM uses the same algorithm.
fossil-db stats for Turnip:
Totals from 4849 (2.94% of 164705) affected shaders:
MaxWaves: 52318 -> 52332 (+0.03%); split: +0.04%, -0.02%
Instrs: 5262458 -> 5218922 (-0.83%); split: -0.87%, +0.05%
CodeSize: 10831900 -> 10655170 (-1.63%); split: -1.64%, +0.01%
NOPs: 829481 -> 836010 (+0.79%); split: -0.95%, +1.74%
MOVs: 176187 -> 173788 (-1.36%); split: -3.27%, +1.91%
COVs: 104096 -> 86543 (-16.86%); split: -16.87%, +0.01%
Full: 90434 -> 90158 (-0.31%); split: -0.33%, +0.03%
(ss): 131091 -> 130866 (-0.17%); split: -0.87%, +0.70%
(sy): 55550 -> 55769 (+0.39%); split: -0.92%, +1.32%
(ss)-stall: 406003 -> 407194 (+0.29%); split: -1.10%, +1.39%
(sy)-stall: 1668213 -> 1678082 (+0.59%); split: -1.31%, +1.90%
Preamble Instrs: 1105270 -> 1067290 (-3.44%); split: -3.50%, +0.06%
Constlen: 423776 -> 423560 (-0.05%)
Last helper: 1038202 -> 1035540 (-0.26%); split: -0.42%, +0.16%
Last baryf: 38908 -> 38632 (-0.71%)
Subgroup size: 336640 -> 336832 (+0.06%)
Cat0: 916209 -> 922848 (+0.72%); split: -0.87%, +1.59%
Cat1: 282813 -> 262845 (-7.06%); split: -7.49%, +0.43%
Cat2: 2198715 -> 2183012 (-0.71%); split: -0.72%, +0.01%
Cat3: 1390914 -> 1376421 (-1.04%)
Cat7: 123127 -> 123116 (-0.01%); split: -0.24%, +0.23%
Signed-off-by: Job Noorman <jnoorman@igalia.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37793>
2025-10-10 07:30:18 +02:00
|
|
|
nir_def *src0h = nir_shr(b, is_signed, src0, cshift);
|
|
|
|
|
nir_def *src1h = nir_shr(b, is_signed, src1, cshift);
|
2020-06-22 14:59:39 -07:00
|
|
|
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *lo = nir_imul(b, src0l, src1l);
|
|
|
|
|
nir_def *m1 = nir_imul(b, src0l, src1h);
|
|
|
|
|
nir_def *m2 = nir_imul(b, src0h, src1l);
|
|
|
|
|
nir_def *hi = nir_imul(b, src0h, src1h);
|
2020-06-22 14:59:39 -07:00
|
|
|
|
nir/lower_alu: use Knuth's Algorithm M for [iu]mul_high
This significantly simplifies the handling of signed numbers as the same
code path can handle signed and unsigned numbers by simply using ishr
instead of ushr for some of the shifts. For both cases, the number of
additions and shifts are also reduced.
Note that LLVM uses the same algorithm.
fossil-db stats for Turnip:
Totals from 4849 (2.94% of 164705) affected shaders:
MaxWaves: 52318 -> 52332 (+0.03%); split: +0.04%, -0.02%
Instrs: 5262458 -> 5218922 (-0.83%); split: -0.87%, +0.05%
CodeSize: 10831900 -> 10655170 (-1.63%); split: -1.64%, +0.01%
NOPs: 829481 -> 836010 (+0.79%); split: -0.95%, +1.74%
MOVs: 176187 -> 173788 (-1.36%); split: -3.27%, +1.91%
COVs: 104096 -> 86543 (-16.86%); split: -16.87%, +0.01%
Full: 90434 -> 90158 (-0.31%); split: -0.33%, +0.03%
(ss): 131091 -> 130866 (-0.17%); split: -0.87%, +0.70%
(sy): 55550 -> 55769 (+0.39%); split: -0.92%, +1.32%
(ss)-stall: 406003 -> 407194 (+0.29%); split: -1.10%, +1.39%
(sy)-stall: 1668213 -> 1678082 (+0.59%); split: -1.31%, +1.90%
Preamble Instrs: 1105270 -> 1067290 (-3.44%); split: -3.50%, +0.06%
Constlen: 423776 -> 423560 (-0.05%)
Last helper: 1038202 -> 1035540 (-0.26%); split: -0.42%, +0.16%
Last baryf: 38908 -> 38632 (-0.71%)
Subgroup size: 336640 -> 336832 (+0.06%)
Cat0: 916209 -> 922848 (+0.72%); split: -0.87%, +1.59%
Cat1: 282813 -> 262845 (-7.06%); split: -7.49%, +0.43%
Cat2: 2198715 -> 2183012 (-0.71%); split: -0.72%, +0.01%
Cat3: 1390914 -> 1376421 (-1.04%)
Cat7: 123127 -> 123116 (-0.01%); split: -0.24%, +0.23%
Signed-off-by: Job Noorman <jnoorman@igalia.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37793>
2025-10-10 07:30:18 +02:00
|
|
|
nir_def *t = nir_iadd(b, m2, nir_ushr(b, lo, cshift));
|
|
|
|
|
nir_def *w1 = nir_iand(b, t, cmask);
|
|
|
|
|
nir_def *w2 = nir_shr(b, is_signed, t, cshift);
|
|
|
|
|
w1 = nir_iadd(b, m1, w1);
|
|
|
|
|
hi = nir_iadd(b, hi,
|
|
|
|
|
nir_iadd(b, nir_shr(b, is_signed, w1, cshift), w2));
|
2020-06-22 14:59:39 -07:00
|
|
|
|
|
|
|
|
lowered = hi;
|
2018-05-08 11:24:40 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2024-07-08 14:21:22 -04:00
|
|
|
case nir_op_fmin:
|
|
|
|
|
case nir_op_fmax: {
|
|
|
|
|
if (!b->shader->options->lower_fminmax_signed_zero ||
|
|
|
|
|
!nir_alu_instr_is_signed_zero_preserve(instr))
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
nir_def *s0 = nir_ssa_for_alu_src(b, instr, 0);
|
|
|
|
|
nir_def *s1 = nir_ssa_for_alu_src(b, instr, 1);
|
|
|
|
|
|
|
|
|
|
bool max = instr->op == nir_op_fmax;
|
|
|
|
|
|
|
|
|
|
/* Lower the fmin/fmax to a no_signed_zero fmin/fmax. This ensures that
|
|
|
|
|
* nir_lower_alu is idempotent, and allows the backend to implement
|
|
|
|
|
* soundly the no_signed_zero subset of fmin/fmax.
|
|
|
|
|
*/
|
2025-12-18 17:36:51 +01:00
|
|
|
b->fp_math_ctrl &= ~nir_fp_preserve_signed_zero;
|
2024-07-08 14:21:22 -04:00
|
|
|
nir_def *fminmax = max ? nir_fmax(b, s0, s1) : nir_fmin(b, s0, s1);
|
2025-12-18 17:36:51 +01:00
|
|
|
b->fp_math_ctrl = instr->fp_math_ctrl;
|
2024-07-08 14:21:22 -04:00
|
|
|
|
2025-07-07 15:10:31 -04:00
|
|
|
/* If we have a constant source, we can usually optimize */
|
|
|
|
|
if (s0->num_components == 1 && s0->bit_size == 32) {
|
|
|
|
|
for (unsigned i = 0; i < 2 && lowered == NULL; ++i) {
|
|
|
|
|
if (!nir_src_is_const(instr->src[i].src))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
uint32_t x = nir_alu_src_as_uint(instr->src[i]);
|
|
|
|
|
bool pos_zero = x == fui(+0.0);
|
|
|
|
|
bool neg_zero = x == fui(-0.0);
|
|
|
|
|
nir_def *zero = i == 0 ? s0 : s1;
|
|
|
|
|
nir_def *other = i == 0 ? s1 : s0;
|
|
|
|
|
|
|
|
|
|
if (!pos_zero && !neg_zero) {
|
|
|
|
|
/* The lowering is only required when both sources are zero, so
|
|
|
|
|
* if we have a nonzero constant source, skip the lowering.
|
|
|
|
|
*/
|
|
|
|
|
lowered = fminmax;
|
|
|
|
|
} else if (pos_zero && max) {
|
nir/opt_algebraic: make bcsel(fcmp(b, a), b, a) -> fmin/fmax patterns exact
These patterns need is_only_used_as_float because fmin/fmax might change NaN
patterns, while bcsel is bit exact. For the same reason, the replacement
must not add undefined results, so make the replacement NaN/inf preserving.
It's impossible to make them signed zero correct (-0.0 == +0.0),
so it's also important that the user alu doesn't care.
Otherwise, the only thing that matters is is whether a is NaN.
Foz-DB Navi48:
Totals from 453 (0.55% of 82405) affected shaders:
MaxWaves: 8242 -> 8270 (+0.34%)
Instrs: 2382059 -> 2380094 (-0.08%); split: -0.09%, +0.00%
CodeSize: 13197208 -> 13179488 (-0.13%); split: -0.14%, +0.00%
VGPRs: 44688 -> 44604 (-0.19%)
Latency: 22839894 -> 22838985 (-0.00%); split: -0.01%, +0.00%
InvThroughput: 4873352 -> 4872924 (-0.01%)
VClause: 50862 -> 50883 (+0.04%); split: -0.02%, +0.06%
SClause: 54000 -> 53993 (-0.01%)
Copies: 250215 -> 250233 (+0.01%); split: -0.00%, +0.01%
PreVGPRs: 39694 -> 39620 (-0.19%)
VALU: 1116881 -> 1116073 (-0.07%); split: -0.07%, +0.00%
SALU: 492799 -> 492139 (-0.13%); split: -0.14%, +0.00%
VOPD: 85457 -> 85461 (+0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39641>
2026-02-01 17:22:48 +01:00
|
|
|
b->fp_math_ctrl &= ~nir_fp_preserve_signed_zero;
|
2025-07-07 15:10:31 -04:00
|
|
|
/* max(x, +0.0) = +0.0 < x ? x : +0.0 */
|
|
|
|
|
lowered = nir_bcsel(b, nir_flt(b, zero, other), other, zero);
|
|
|
|
|
} else if (neg_zero && !max) {
|
nir/opt_algebraic: make bcsel(fcmp(b, a), b, a) -> fmin/fmax patterns exact
These patterns need is_only_used_as_float because fmin/fmax might change NaN
patterns, while bcsel is bit exact. For the same reason, the replacement
must not add undefined results, so make the replacement NaN/inf preserving.
It's impossible to make them signed zero correct (-0.0 == +0.0),
so it's also important that the user alu doesn't care.
Otherwise, the only thing that matters is is whether a is NaN.
Foz-DB Navi48:
Totals from 453 (0.55% of 82405) affected shaders:
MaxWaves: 8242 -> 8270 (+0.34%)
Instrs: 2382059 -> 2380094 (-0.08%); split: -0.09%, +0.00%
CodeSize: 13197208 -> 13179488 (-0.13%); split: -0.14%, +0.00%
VGPRs: 44688 -> 44604 (-0.19%)
Latency: 22839894 -> 22838985 (-0.00%); split: -0.01%, +0.00%
InvThroughput: 4873352 -> 4872924 (-0.01%)
VClause: 50862 -> 50883 (+0.04%); split: -0.02%, +0.06%
SClause: 54000 -> 53993 (-0.01%)
Copies: 250215 -> 250233 (+0.01%); split: -0.00%, +0.01%
PreVGPRs: 39694 -> 39620 (-0.19%)
VALU: 1116881 -> 1116073 (-0.07%); split: -0.07%, +0.00%
SALU: 492799 -> 492139 (-0.13%); split: -0.14%, +0.00%
VOPD: 85457 -> 85461 (+0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39641>
2026-02-01 17:22:48 +01:00
|
|
|
b->fp_math_ctrl &= ~nir_fp_preserve_signed_zero;
|
2025-07-07 15:10:31 -04:00
|
|
|
/* min(x, -0.0) = x < -0.0 ? x : -0.0 */
|
|
|
|
|
lowered = nir_bcsel(b, nir_flt(b, other, zero), other, zero);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
nir/opt_algebraic: make bcsel(fcmp(b, a), b, a) -> fmin/fmax patterns exact
These patterns need is_only_used_as_float because fmin/fmax might change NaN
patterns, while bcsel is bit exact. For the same reason, the replacement
must not add undefined results, so make the replacement NaN/inf preserving.
It's impossible to make them signed zero correct (-0.0 == +0.0),
so it's also important that the user alu doesn't care.
Otherwise, the only thing that matters is is whether a is NaN.
Foz-DB Navi48:
Totals from 453 (0.55% of 82405) affected shaders:
MaxWaves: 8242 -> 8270 (+0.34%)
Instrs: 2382059 -> 2380094 (-0.08%); split: -0.09%, +0.00%
CodeSize: 13197208 -> 13179488 (-0.13%); split: -0.14%, +0.00%
VGPRs: 44688 -> 44604 (-0.19%)
Latency: 22839894 -> 22838985 (-0.00%); split: -0.01%, +0.00%
InvThroughput: 4873352 -> 4872924 (-0.01%)
VClause: 50862 -> 50883 (+0.04%); split: -0.02%, +0.06%
SClause: 54000 -> 53993 (-0.01%)
Copies: 250215 -> 250233 (+0.01%); split: -0.00%, +0.01%
PreVGPRs: 39694 -> 39620 (-0.19%)
VALU: 1116881 -> 1116073 (-0.07%); split: -0.07%, +0.00%
SALU: 492799 -> 492139 (-0.13%); split: -0.14%, +0.00%
VOPD: 85457 -> 85461 (+0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39641>
2026-02-01 17:22:48 +01:00
|
|
|
b->fp_math_ctrl = instr->fp_math_ctrl;
|
|
|
|
|
|
2025-07-07 15:10:31 -04:00
|
|
|
/* Fallback on the emulation */
|
|
|
|
|
if (!lowered) {
|
|
|
|
|
nir_def *iminmax = max ? nir_imax(b, s0, s1) : nir_imin(b, s0, s1);
|
|
|
|
|
lowered = nir_bcsel(b, nir_feq(b, s0, s1), iminmax, fminmax);
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-08 14:21:22 -04:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-08 11:24:40 -07:00
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (lowered) {
|
2024-06-20 12:07:26 -04:00
|
|
|
nir_def_replace(&instr->def, lowered);
|
2018-05-08 11:24:40 -07:00
|
|
|
return true;
|
|
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
nir_lower_alu(nir_shader *shader)
|
|
|
|
|
{
|
2018-05-08 12:47:48 -07:00
|
|
|
if (!shader->options->lower_bitfield_reverse &&
|
2023-11-04 21:36:47 +03:00
|
|
|
!shader->options->lower_bit_count &&
|
2024-07-08 14:21:22 -04:00
|
|
|
!shader->options->lower_mul_high &&
|
|
|
|
|
!shader->options->lower_fminmax_signed_zero)
|
2018-05-08 11:24:40 -07:00
|
|
|
return false;
|
|
|
|
|
|
2024-08-08 22:28:13 -04:00
|
|
|
return nir_shader_alu_pass(shader, lower_alu_instr,
|
|
|
|
|
nir_metadata_control_flow, NULL);
|
2018-05-08 11:24:40 -07:00
|
|
|
}
|