/* mesa/src/compiler/nir/nir_opt_constant_folding.c */

/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <math.h>
#include "nir.h"
#include "nir_builder.h"
#include "nir_constant_expressions.h"
#include "nir_deref.h"
/*
* Implements SSA-based constant folding.
*/
/* Bookkeeping shared across every instruction visited by one run of the
 * pass; it is inspected after the walk to decide whether the shader's
 * constant data can be released.
 */
struct constant_fold_state {
/* Set as soon as any load_constant intrinsic is encountered. */
bool has_load_constant;
/* Set when a load_constant has a non-constant offset source and therefore
 * could not be folded.
 */
bool has_indirect_load_const;
};
/*
 * Tries to evaluate an ALU instruction at compile time.
 *
 * Folding only happens when every source is produced by a load_const
 * instruction.  In that case the opcode is evaluated on the swizzled source
 * values, an immediate holding the result is emitted right before the ALU
 * instruction, all uses are redirected to it, and the ALU instruction is
 * removed and freed.
 *
 * Returns true when the instruction was folded, false when at least one
 * source is not a constant (nothing is modified in that case).
 */
static bool
try_fold_alu(nir_builder *b, nir_alu_instr *alu)
{
   const unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
   nir_const_value operands[NIR_MAX_VEC_COMPONENTS][NIR_MAX_VEC_COMPONENTS];
   nir_const_value *operand_ptrs[NIR_MAX_VEC_COMPONENTS];

   /* When an output/input has an unsized type the bit-size must be guessed.
    * The validator guarantees that all such bit-sizes agree, so the first
    * unsized output/input encountered decides.  When everything is sized the
    * generated constant-expression code already knows the sizes and ignores
    * the value passed in, but it still has to be a valid bit-size.
    */
   unsigned bit_size =
      nir_alu_type_get_type_size(nir_op_infos[alu->op].output_type)
         ? 0
         : alu->def.bit_size;

   for (unsigned s = 0; s < num_srcs; s++) {
      nir_def *src_def = alu->src[s].src.ssa;

      if (bit_size == 0 &&
          nir_alu_type_get_type_size(nir_op_infos[alu->op].input_types[s]) == 0)
         bit_size = src_def->bit_size;

      /* A single non-constant source makes the whole instruction unfoldable. */
      if (src_def->parent_instr->type != nir_instr_type_load_const)
         return false;

      nir_load_const_instr *lc = nir_instr_as_load_const(src_def->parent_instr);

      /* Gather the source components in the order the swizzle selects them. */
      const unsigned num_comps = nir_ssa_alu_instr_src_components(alu, s);
      for (unsigned c = 0; c < num_comps; c++)
         operands[s][c] = lc->value[alu->src[s].swizzle[c]];

      operand_ptrs[s] = operands[s];
   }

   /* Every type involved is sized: any valid bit-size will do. */
   if (bit_size == 0)
      bit_size = 32;

   nir_const_value result[NIR_MAX_VEC_COMPONENTS];
   memset(result, 0, sizeof(result));

   nir_eval_const_opcode(alu->op, result, alu->def.num_components,
                         bit_size, operand_ptrs,
                         b->shader->info.float_controls_execution_mode);

   /* Materialize the result and retire the original instruction. */
   b->cursor = nir_before_instr(&alu->instr);
   nir_def *folded = nir_build_imm(b, alu->def.num_components,
                                   alu->def.bit_size, result);
   nir_def_rewrite_uses(&alu->def, folded);
   nir_instr_remove(&alu->instr);
   nir_instr_free(&alu->instr);

   return true;
}
/*
 * Resolves a deref of a nir_var_mem_constant variable to the constant values
 * it designates inside the variable's constant initializer.
 *
 * Returns a pointer into the initializer's value storage, or NULL when the
 * deref cannot be resolved statically: the mode is not nir_var_mem_constant,
 * the deref path does not start at a variable, the variable has no constant
 * initializer, an array index is not constant or is out of range, a struct
 * index is out of range, or the path contains a deref type other than
 * array/struct.
 *
 * The deref path built here is always released before returning.
 */
static nir_const_value *
const_value_for_deref(nir_deref_instr *deref)
{
   if (!nir_deref_mode_is(deref, nir_var_mem_constant))
      return NULL;

   nir_deref_path path;
   nir_deref_path_init(&path, deref, NULL);
   if (path.path[0]->deref_type != nir_deref_type_var)
      goto fail;

   nir_variable *var = path.path[0]->var;
   assert(var->data.mode == nir_var_mem_constant);
   if (var->constant_initializer == NULL)
      goto fail;

   if (var->constant_initializer->is_null_constant) {
      /* Doesn't matter what casts are in the way, it's all zeros */
      nir_deref_path_finish(&path);
      return var->constant_initializer->values;
   }

   nir_constant *c = var->constant_initializer;
   nir_const_value *v = NULL; /* Vector value for array-deref-of-vec */

   /* Walk the path below the variable, descending into the initializer. */
   for (unsigned i = 1; path.path[i] != NULL; i++) {
      nir_deref_instr *p = path.path[i];
      switch (p->deref_type) {
      case nir_deref_type_var:
         unreachable("Deref paths can only start with a var deref");

      case nir_deref_type_array: {
         /* Once a vector component has been selected nothing may follow. */
         assert(v == NULL);
         if (!nir_src_is_const(p->arr.index))
            goto fail;

         uint64_t idx = nir_src_as_uint(p->arr.index);
         if (c->num_elements > 0) {
            /* Aggregate constant: step into the selected array element. */
            assert(glsl_type_is_array(path.path[i - 1]->type));
            if (idx >= c->num_elements)
               goto fail;
            c = c->elements[idx];
         } else {
            /* Leaf constant: the index selects one component of a vector. */
            assert(glsl_type_is_vector(path.path[i - 1]->type));
            assert(glsl_type_is_scalar(p->type));
            if (idx >= NIR_MAX_VEC_COMPONENTS)
               goto fail;
            v = &c->values[idx];
         }
         break;
      }

      case nir_deref_type_struct:
         assert(glsl_type_is_struct(path.path[i - 1]->type));
         assert(v == NULL && c->num_elements > 0);
         if (p->strct.index >= c->num_elements)
            goto fail;
         c = c->elements[p->strct.index];
         break;

      default:
         /* Any other deref type is not resolved here. */
         goto fail;
      }
   }

   /* We have to have ended at a vector */
   assert(c->num_elements == 0);
   nir_deref_path_finish(&path);
   return v ? v : c->values;

fail:
   nir_deref_path_finish(&path);
   return NULL;
}
/*
 * Tries to fold an intrinsic whose relevant source is a compile-time constant.
 *
 * Handled intrinsics:
 *  - demote_if / discard_if / terminate_if with a constant condition: replaced
 *    by the unconditional intrinsic when the condition is true, otherwise
 *    simply removed.
 *  - load_deref of statically resolvable constant memory: replaced by an
 *    immediate.
 *  - load_constant with a constant offset: replaced by an immediate read from
 *    the shader's constant data (or by an undef when the offset is not inside
 *    the intrinsic's range).  Occurrences are also recorded in `state`.
 *  - subgroup vote/read/shuffle/quad operations on constant data: replaced by
 *    the data itself.
 *  - vote_feq / vote_ieq on constant data: replaced by constant true.
 *
 * Returns true when the instruction stream was changed.
 */
static bool
try_fold_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                   struct constant_fold_state *state)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_demote_if:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_terminate_if: {
      if (!nir_src_is_const(intrin->src[0]))
         return false;

      if (nir_src_as_bool(intrin->src[0])) {
         /* Condition is always true: emit the unconditional counterpart. */
         nir_intrinsic_op unconditional;
         if (intrin->intrinsic == nir_intrinsic_discard_if)
            unconditional = nir_intrinsic_discard;
         else if (intrin->intrinsic == nir_intrinsic_demote_if)
            unconditional = nir_intrinsic_demote;
         else
            unconditional = nir_intrinsic_terminate;

         b->cursor = nir_before_instr(&intrin->instr);
         nir_intrinsic_instr *replacement =
            nir_intrinsic_instr_create(b->shader, unconditional);
         nir_builder_instr_insert(b, &replacement->instr);
      }

      /* Either way the conditional form is no longer needed. */
      nir_instr_remove(&intrin->instr);
      return true;
   }

   case nir_intrinsic_load_deref: {
      nir_const_value *values =
         const_value_for_deref(nir_src_as_deref(intrin->src[0]));
      if (values == NULL)
         return false;

      b->cursor = nir_before_instr(&intrin->instr);
      nir_def *imm = nir_build_imm(b, intrin->def.num_components,
                                   intrin->def.bit_size, values);
      nir_def_rewrite_uses(&intrin->def, imm);
      nir_instr_remove(&intrin->instr);
      return true;
   }

   case nir_intrinsic_load_constant: {
      state->has_load_constant = true;

      if (!nir_src_is_const(intrin->src[0])) {
         /* Remember that at least one load still needs the constant data. */
         state->has_indirect_load_const = true;
         return false;
      }

      unsigned cursor_off = nir_src_as_uint(intrin->src[0]);
      const unsigned base = nir_intrinsic_base(intrin);
      const unsigned range = nir_intrinsic_range(intrin);
      assert(base + range <= b->shader->constant_data_size);

      b->cursor = nir_before_instr(&intrin->instr);

      nir_def *replacement;
      if (cursor_off < range) {
         nir_const_value imm[NIR_MAX_VEC_COMPONENTS];
         memset(imm, 0, sizeof(imm));

         const uint8_t *window =
            (uint8_t *)b->shader->constant_data + base;
         const unsigned comp_bytes = intrin->def.bit_size / 8;

         for (unsigned c = 0; c < intrin->num_components; c++) {
            /* Never read past the end of the range; components that fall
             * (partly) outside keep their zero fill.
             */
            const unsigned n = MIN2(comp_bytes, range - cursor_off);
            memcpy(&imm[c].u64, window + cursor_off, n);
            cursor_off += n;
         }

         replacement = nir_build_imm(b, intrin->def.num_components,
                                     intrin->def.bit_size, imm);
      } else {
         /* The offset is not inside the declared range at all. */
         replacement = nir_undef(b, intrin->def.num_components,
                                 intrin->def.bit_size);
      }

      nir_def_rewrite_uses(&intrin->def, replacement);
      nir_instr_remove(&intrin->instr);
      return true;
   }

   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
   case nir_intrinsic_read_invocation:
   case nir_intrinsic_read_first_invocation:
   case nir_intrinsic_shuffle:
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
   case nir_intrinsic_quad_swizzle_amd:
   case nir_intrinsic_masked_swizzle_amd:
      /* The data payload is the first source for all of these.  A second
       * source carrying a shuffle index may exist, but it is irrelevant
       * when the data is constant.
       */
      if (!nir_src_is_const(intrin->src[0]))
         return false;

      nir_def_rewrite_uses(&intrin->def, intrin->src[0].ssa);
      nir_instr_remove(&intrin->instr);
      return true;

   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq: {
      if (!nir_src_is_const(intrin->src[0]))
         return false;

      b->cursor = nir_before_instr(&intrin->instr);
      nir_def *always_true = nir_imm_true(b);
      nir_def_rewrite_uses(&intrin->def, always_true);
      nir_instr_remove(&intrin->instr);
      return true;
   }

   default:
      return false;
   }
}
/*
 * History: "nir/constant_folding: Optimize txb with bias of constant zero to
 * tex" (2021-11-09).  v2: Fail gracefully when bias_idx < 0; see the comment
 * in the code for the rationale.  See also issue #5722.
 * Part-of: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14025
 */
/*
 * Turns a txb whose bias source is a constant zero into a plain tex by
 * dropping the bias source and changing the opcode.
 *
 * The caller must pass an instruction whose op is nir_texop_txb (asserted).
 * The builder argument is not used by this function.
 *
 * Returns true when the instruction was rewritten, false when there is no
 * bias source or the bias is not a constant equal to 0.0.
 */
static bool
try_fold_txb_to_tex(nir_builder *b, nir_tex_instr *tex)
{
   assert(tex->op == nir_texop_txb);

   const int bias_idx = nir_tex_instr_src_index(tex, nir_tex_src_bias);

   /* nir_to_tgsi_lower_tex mangles many kinds of texture instructions,
    * including txb, into invalid states.  It removes the special
    * parameters and appends the values to the texture coordinate.
    */
   if (bias_idx < 0)
      return false;

   if (nir_src_is_const(tex->src[bias_idx].src) &&
       nir_src_as_float(tex->src[bias_idx].src) == 0.0) {
      nir_tex_instr_remove_src(tex, bias_idx);
      tex->op = nir_texop_tex;
      return true;
   }

   return false;
}
/*
 * Folds a constant texture/sampler offset source into a static index.
 *
 * Looks up the source of kind `src_type` on `tex`.  When it exists and is
 * constant, its value is added to `*index` and the source is removed from
 * the instruction.
 *
 * Returns true when the source was folded away, false otherwise (in which
 * case neither `tex` nor `*index` is touched).
 */
static bool
try_fold_tex_offset(nir_tex_instr *tex, unsigned *index,
                    nir_tex_src_type src_type)
{
   const int slot = nir_tex_instr_src_index(tex, src_type);

   if (slot >= 0 && nir_src_is_const(tex->src[slot].src)) {
      const unsigned delta = nir_src_as_uint(tex->src[slot].src);

      *index += delta;
      nir_tex_instr_remove_src(tex, slot);
      return true;
   }

   return false;
}
/*
 * Applies every texture-instruction fold this pass knows about: constant
 * texture and sampler offsets are merged into the static indices, and a txb
 * with a constant-zero bias is demoted to tex.
 *
 * All folds are always attempted; the result is true when any of them
 * changed the instruction.
 */
static bool
try_fold_tex(nir_builder *b, nir_tex_instr *tex)
{
   const bool texture_folded =
      try_fold_tex_offset(tex, &tex->texture_index,
                          nir_tex_src_texture_offset);
   const bool sampler_folded =
      try_fold_tex_offset(tex, &tex->sampler_index,
                          nir_tex_src_sampler_offset);

   /* txb with a bias of constant zero is just tex. */
   bool bias_folded = false;
   if (tex->op == nir_texop_txb)
      bias_folded = try_fold_txb_to_tex(b, tex);

   return texture_folded || sampler_folded || bias_folded;
}
/*
 * Per-instruction callback for the pass: dispatches on the instruction type
 * to the matching folding routine.
 *
 * `_state` is the opaque callback payload; it is forwarded to the intrinsic
 * folder as the struct constant_fold_state for this run.
 *
 * Returns true when the instruction was folded or rewritten.
 */
static bool
try_fold_instr(nir_builder *b, nir_instr *instr, void *_state)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      return try_fold_alu(b, nir_instr_as_alu(instr));
   case nir_instr_type_intrinsic:
      return try_fold_intrinsic(b, nir_instr_as_intrinsic(instr), _state);
   case nir_instr_type_tex:
      return try_fold_tex(b, nir_instr_as_tex(instr));
   default:
      /* Don't know how to constant fold */
      return false;
   }
}
/*
 * Entry point of the constant-folding pass.
 *
 * Runs try_fold_instr over every instruction of `shader` and, when
 * load_constant intrinsics were seen and none of them had a non-constant
 * offset, releases the shader's constant data.
 *
 * Returns true when any instruction was folded.
 */
bool
nir_opt_constant_folding(nir_shader *shader)
{
   struct constant_fold_state state = {
      .has_load_constant = false,
      .has_indirect_load_const = false,
   };

   const bool progress =
      nir_shader_instructions_pass(shader, try_fold_instr,
                                   nir_metadata_block_index |
                                      nir_metadata_dominance,
                                   &state);

   /* Constant data is deliberately kept when no constant loads were seen at
    * all: the data might still be in use, with the loads having been lowered
    * to load_ubo.
    */
   const bool only_direct_loads =
      state.has_load_constant && !state.has_indirect_load_const;

   if (only_direct_loads && shader->constant_data_size) {
      ralloc_free(shader->constant_data);
      shader->constant_data = NULL;
      shader->constant_data_size = 0;
   }

   return progress;
}