mesa/src/compiler/nir/nir_lower_shader_calls.c

/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "nir.h"
#include "nir_builder.h"
#include "nir_phi_builder.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
static bool
move_system_values_to_top(nir_shader *shader)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   bool progress = false;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         /* These intrinsics not only can't be re-materialized but aren't
          * preserved when moving to the continuation shader. We have to move
          * them to the top to ensure they get spilled as needed.
          */
         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_shader_record_ptr:
         case nir_intrinsic_load_btd_local_arg_addr_intel:
            nir_instr_remove(instr);
            nir_instr_insert(nir_before_cf_list(&impl->body), instr);
            progress = true;
            break;

         default:
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
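
/* Returns true for the three shader-call intrinsics lowered by this pass:
 * trace_ray, report_ray_intersection, and execute_callable.
 */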
static bool
instr_is_shader_call(nir_instr *instr)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_trace_ray ||
          intrin->intrinsic == nir_intrinsic_report_ray_intersection ||
          intrin->intrinsic == nir_intrinsic_execute_callable;
}
/* Previously named bitset, it had to be renamed as FreeBSD defines a struct
 * named bitset in sys/_bitset.h required by pthread_np.h which is included
 * from src/util/u_thread.h that is indirectly included by this file.
 */
struct sized_bitset {
   BITSET_WORD *set;
   unsigned size;
};
static struct sized_bitset
bitset_create(void *mem_ctx, unsigned size)
{
   return (struct sized_bitset) {
      .set = rzalloc_array(mem_ctx, BITSET_WORD, BITSET_WORDS(size)),
      .size = size,
   };
}
static bool
src_is_in_bitset(nir_src *src, void *_set)
{
   struct sized_bitset *set = _set;
   assert(src->is_ssa);

   /* Any SSA values which were added after we generated liveness information
    * are things generated by this pass and, while most of it is arithmetic
    * which we could re-materialize, we don't need to because it's only used
    * for a single load/store and so shouldn't cross any shader calls.
    */
   if (src->ssa->index >= set->size)
      return false;

   return BITSET_TEST(set->set, src->ssa->index);
}
static void
add_ssa_def_to_bitset(nir_ssa_def *def, struct sized_bitset *set)
{
   if (def->index >= set->size)
      return;

   BITSET_SET(set->set, def->index);
}
static bool
can_remat_instr(nir_instr *instr, struct sized_bitset *remat)
{
   /* Set of all values which are trivially re-materializable and we shouldn't
    * ever spill them. This includes:
    *
    *  - Undef values
    *  - Constants
    *  - Uniforms (UBO or push constant)
    *  - ALU combinations of any of the above
    *  - Derefs which are either complete or casts of any of the above
    *
    * Because this pass rewrites things in-order and phis are always turned
    * into register writes, we can use "is it SSA?" to answer the question
    * "can my source be re-materialized?". Register writes happen via
    * non-rematerializable intrinsics.
    */
   switch (instr->type) {
   case nir_instr_type_alu:
      return nir_foreach_src(instr, src_is_in_bitset, remat);

   case nir_instr_type_deref:
      return nir_foreach_src(instr, src_is_in_bitset, remat);

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_vulkan_resource_index:
      case nir_intrinsic_vulkan_resource_reindex:
      case nir_intrinsic_load_vulkan_descriptor:
      case nir_intrinsic_load_push_constant:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_global_const_block_intel:
      case nir_intrinsic_load_desc_set_address_intel:
         /* These intrinsics don't need to be spilled as long as they don't
          * depend on any spilled values.
          */
         return nir_foreach_src(instr, src_is_in_bitset, remat);

      case nir_intrinsic_load_scratch_base_ptr:
      case nir_intrinsic_load_ray_launch_id:
      case nir_intrinsic_load_topology_id_intel:
      case nir_intrinsic_load_btd_global_arg_addr_intel:
      case nir_intrinsic_load_btd_resume_sbt_addr_intel:
      case nir_intrinsic_load_ray_base_mem_addr_intel:
      case nir_intrinsic_load_ray_hw_stack_size_intel:
      case nir_intrinsic_load_ray_sw_stack_size_intel:
      case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
      case nir_intrinsic_load_ray_hit_sbt_addr_intel:
      case nir_intrinsic_load_ray_hit_sbt_stride_intel:
      case nir_intrinsic_load_ray_miss_sbt_addr_intel:
      case nir_intrinsic_load_ray_miss_sbt_stride_intel:
      case nir_intrinsic_load_callable_sbt_addr_intel:
      case nir_intrinsic_load_callable_sbt_stride_intel:
      case nir_intrinsic_load_reloc_const_intel:
      case nir_intrinsic_load_ray_query_global_intel:
      case nir_intrinsic_load_ray_launch_size:
         /* Notably missing from the above list is btd_local_arg_addr_intel.
          * This is because the resume shader will have a different local
          * argument pointer because it has a different BSR. Any access of
          * the original shader's local arguments needs to be preserved so
          * that pointer has to be saved on the stack.
          *
          * TODO: There may be some system values we want to avoid
          *       re-materializing as well but we have to be very careful
          *       to ensure that it's a system value which cannot change
          *       across a shader call.
          */
         return true;

      case nir_intrinsic_resource_intel:
         return nir_foreach_src(instr, src_is_in_bitset, remat);

      default:
         return false;
      }
   }

   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
      return true;

   default:
      return false;
   }
}
static bool
can_remat_ssa_def(nir_ssa_def *def, struct sized_bitset *remat)
{
   return can_remat_instr(def->parent_instr, remat);
}
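
/* State for the add_src_instr() worklist walk below: buf holds the
 * instructions gathered so far and remat the values already known to be
 * rematerializable.
 */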
struct add_instr_data {
   struct util_dynarray *buf;
   struct sized_bitset *remat;
};
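
/* nir_foreach_src callback: appends the instruction defining each source to
 * the worklist unless that value is already rematerializable or already
 * queued.
 */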
static bool
add_src_instr(nir_src *src, void *state)
{
   struct add_instr_data *data = state;
   if (BITSET_TEST(data->remat->set, src->ssa->index))
      return true;

   util_dynarray_foreach(data->buf, nir_instr *, instr_ptr) {
      if (*instr_ptr == src->ssa->parent_instr)
         return true;
   }

   util_dynarray_append(data->buf, nir_instr *, src->ssa->parent_instr);
   return true;
}
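
/* qsort comparator ordering instructions by ascending nir_instr::index,
 * i.e. by position in the shader, so a gathered chain is cloned in program
 * order.
 */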
static int
compare_instr_indexes(const void *_inst1, const void *_inst2)
{
   const nir_instr * const *inst1 = _inst1;
   const nir_instr * const *inst2 = _inst2;

   return (*inst1)->index - (*inst2)->index;
}
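
/* Checks whether def can be rematerialized by walking the chain of SSA
 * values feeding it, rather than looking at its defining instruction in
 * isolation.
 *
 * A value built on top of something that has already been spilled & filled
 * can be rebuilt from the filled value instead of being spilled itself.
 * For example, in
 *
 *    vec4 32 ssa_388  = (float32)txf ...
 *    vec1 32 ssa_1953 = load_const (0xbd23d70a = -0.040000)
 *    vec1 32 ssa_1954 = fadd ssa_388.x, ssa_1953
 *    vec1 32 ssa_1955 = fneg ssa_1954
 *
 * once ssa_388 has been spilled and filled, ssa_1954 and ssa_1955 are
 * rematerializable from the filled value and need no spill/fill of their
 * own. We gather every instruction involved in building def, stopping at
 * values that are already filled or rematerializable, then validate each
 * step in index order. On success, buf holds the chain of instructions to
 * clone; on failure, buf is cleared.
 */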
static bool
can_remat_chain_ssa_def(nir_ssa_def *def, struct sized_bitset *remat,
                        struct util_dynarray *buf)
{
   assert(util_dynarray_num_elements(buf, nir_instr *) == 0);

   void *mem_ctx = ralloc_context(NULL);

   /* Add all the instructions involved in building this ssa_def */
   util_dynarray_append(buf, nir_instr *, def->parent_instr);

   unsigned idx = 0;
   struct add_instr_data data = {
      .buf = buf,
      .remat = remat,
   };
   while (idx < util_dynarray_num_elements(buf, nir_instr *)) {
      nir_instr *instr = *util_dynarray_element(buf, nir_instr *, idx++);
      if (!nir_foreach_src(instr, add_src_instr, &data))
         goto fail;
   }

   /* Sort instructions by index */
   qsort(util_dynarray_begin(buf),
         util_dynarray_num_elements(buf, nir_instr *),
         sizeof(nir_instr *),
         compare_instr_indexes);

   /* Create a temporary bitset with all values already
    * rematerialized/rematerializable. We'll add to this bit set as we go
    * through values that might not be in that set but that we can
    * rematerialize.
    */
   struct sized_bitset potential_remat = bitset_create(mem_ctx, remat->size);
   memcpy(potential_remat.set, remat->set,
          BITSET_WORDS(remat->size) * sizeof(BITSET_WORD));

   util_dynarray_foreach(buf, nir_instr *, instr_ptr) {
      nir_ssa_def *instr_ssa_def = nir_instr_ssa_def(*instr_ptr);

      /* If it's already in the potential-remat set, there's nothing to do. */
      if (BITSET_TEST(potential_remat.set, instr_ssa_def->index))
         continue;

      if (!can_remat_instr(*instr_ptr, &potential_remat))
         goto fail;

      /* All the sources are rematerializable and the instruction is also
       * rematerializable, mark it as rematerializable too.
       */
      BITSET_SET(potential_remat.set, instr_ssa_def->index);
   }

   ralloc_free(mem_ctx);

   return true;

fail:
   util_dynarray_clear(buf);
   ralloc_free(mem_ctx);

   return false;
}
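
/* Clones def's defining instruction at the builder's cursor, remapping its
 * sources through remap_table, and returns the clone's destination.
 */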
static nir_ssa_def *
remat_ssa_def(nir_builder *b, nir_ssa_def *def, struct hash_table *remap_table)
{
   nir_instr *clone = nir_instr_clone_deep(b->shader, def->parent_instr,
                                           remap_table);
   nir_builder_instr_insert(b, clone);
   return nir_instr_ssa_def(clone);
}
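
/* Re-emits, at the builder's cursor, each instruction of a chain gathered by
 * can_remat_chain_ssa_def(). Every clone is recorded in
 * fill_defs[index][call_idx], marked in remat, and added to remap_table so
 * later chains can build on top of it. Returns the clone of the last
 * instruction, i.e. the value that was originally requested.
 */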
static nir_ssa_def *
remat_chain_ssa_def(nir_builder *b, struct util_dynarray *buf,
                    struct sized_bitset *remat, nir_ssa_def ***fill_defs,
                    unsigned call_idx, struct hash_table *remap_table)
{
   nir_ssa_def *last_def = NULL;

   util_dynarray_foreach(buf, nir_instr *, instr_ptr) {
      nir_ssa_def *instr_ssa_def = nir_instr_ssa_def(*instr_ptr);
      unsigned ssa_index = instr_ssa_def->index;

      if (fill_defs[ssa_index] != NULL &&
          fill_defs[ssa_index][call_idx] != NULL)
         continue;

      /* Clone the instruction we want to rematerialize */
      nir_ssa_def *clone_ssa_def = remat_ssa_def(b, instr_ssa_def, remap_table);

      if (fill_defs[ssa_index] == NULL) {
         fill_defs[ssa_index] =
            rzalloc_array(fill_defs, nir_ssa_def *, remat->size);
      }

      /* Add the new ssa_def to the list fill_defs and flag it as
       * rematerialized
       */
      fill_defs[ssa_index][call_idx] = last_def = clone_ssa_def;
      BITSET_SET(remat->set, ssa_index);

      _mesa_hash_table_insert(remap_table, instr_ssa_def, last_def);
   }

   return last_def;
}
struct pbv_array {
   struct nir_phi_builder_value **arr;
   unsigned len;
};
static struct nir_phi_builder_value *
get_phi_builder_value_for_def(nir_ssa_def *def,
                              struct pbv_array *pbv_arr)
{
   if (def->index >= pbv_arr->len)
      return NULL;

   return pbv_arr->arr[def->index];
}
static nir_ssa_def *
get_phi_builder_def_for_src(nir_src *src, struct pbv_array *pbv_arr,
                            nir_block *block)
{
   assert(src->is_ssa);

   struct nir_phi_builder_value *pbv =
      get_phi_builder_value_for_def(src->ssa, pbv_arr);
   if (pbv == NULL)
      return NULL;

   return nir_phi_builder_value_get_block_def(pbv, block);
}
static bool
rewrite_instr_src_from_phi_builder(nir_src *src, void *_pbv_arr)
{
   nir_block *block;
   if (src->parent_instr->type == nir_instr_type_phi) {
      nir_phi_src *phi_src = exec_node_data(nir_phi_src, src, src);
      block = phi_src->pred;
   } else {
      block = src->parent_instr->block;
   }

   nir_ssa_def *new_def = get_phi_builder_def_for_src(src, _pbv_arr, block);
   if (new_def != NULL)
      nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(new_def));
   return true;
}
static nir_ssa_def *
spill_fill(nir_builder *before, nir_builder *after, nir_ssa_def *def,
           unsigned value_id, unsigned call_idx,
           unsigned offset, unsigned stack_alignment)
{
   const unsigned comp_size = def->bit_size / 8;

   nir_store_stack(before, def,
                   .base = offset,
                   .call_idx = call_idx,
                   .align_mul = MIN2(comp_size, stack_alignment),
                   .value_id = value_id,
                   .write_mask = BITFIELD_MASK(def->num_components));
   return nir_load_stack(after, def->num_components, def->bit_size,
                         .base = offset,
                         .call_idx = call_idx,
                         .value_id = value_id,
                         .align_mul = MIN2(comp_size, stack_alignment));
}
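
/* nir_foreach_src callback: marks every source as live in the given call's
 * live set.
 */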
static bool
add_src_to_call_live_bitset(nir_src *src, void *state)
{
   BITSET_WORD *call_live = state;

   assert(src->is_ssa);
   BITSET_SET(call_live, src->ssa->index);
   return true;
}
static void
spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls,
                                      const nir_lower_shader_calls_options *options)
{
   /* TODO: If a SSA def is filled more than once, we probably want to just
    *       spill it at the LCM of the fill sites so we avoid unnecessary
    *       extra spills.
    *
    * TODO: If a SSA def is defined outside a loop but live through some call
    *       inside the loop, we probably want to spill outside the loop. We
    *       may also want to fill outside the loop if it's not used in the
    *       loop.
    *
    * TODO: Right now, we only re-materialize things if their immediate
    *       sources are things which we filled. We probably want to expand
    *       that to re-materialize things whose sources are things we can
    *       re-materialize from things we filled. We may want some DAG depth
    *       heuristic on this.
    */

   /* This happens per-shader rather than per-impl because we mess with
    * nir_shader::scratch_size.
    */
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   nir_metadata_require(impl, nir_metadata_live_ssa_defs |
                              nir_metadata_dominance |
                              nir_metadata_block_index |
                              nir_metadata_instr_index);
   void *mem_ctx = ralloc_context(shader);

   const unsigned num_ssa_defs = impl->ssa_alloc;
   const unsigned live_words = BITSET_WORDS(num_ssa_defs);
   struct sized_bitset trivial_remat = bitset_create(mem_ctx, num_ssa_defs);

   /* Array of all live SSA defs which are spill candidates */
   nir_ssa_def **spill_defs =
      rzalloc_array(mem_ctx, nir_ssa_def *, num_ssa_defs);

   /* For each spill candidate, an array of every time it's defined by a fill,
    * indexed by call instruction index.
    */
   nir_ssa_def ***fill_defs =
      rzalloc_array(mem_ctx, nir_ssa_def **, num_ssa_defs);

   /* For each call instruction, the liveness set at the call */
   const BITSET_WORD **call_live =
      rzalloc_array(mem_ctx, const BITSET_WORD *, num_calls);

   /* For each call instruction, the block index of the block it lives in */
   uint32_t *call_block_indices = rzalloc_array(mem_ctx, uint32_t, num_calls);
   /* Remap table when rebuilding instructions out of fill operations */
   struct hash_table *trivial_remap_table =
      _mesa_pointer_hash_table_create(mem_ctx);

   /* Walk the call instructions and fetch the liveness set and block index
    * for each one. We need to do this before we start modifying the shader
    * so that liveness doesn't complain that it's been invalidated. Don't
    * worry, we'll be very careful with our live sets. :-)
    */
   unsigned call_idx = 0;
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (!instr_is_shader_call(instr))
            continue;

         call_block_indices[call_idx] = block->index;

         /* The objective here is to preserve values around shader call
          * instructions. Therefore, we use the live set after the
          * instruction as the set of things we want to preserve. Because
          * none of our shader call intrinsics return anything, we don't have
          * to worry about spilling over a return value.
          *
          * TODO: This isn't quite true for report_intersection.
          */
         call_live[call_idx] =
            nir_get_live_ssa_defs(nir_after_instr(instr), mem_ctx);

         call_idx++;
      }
   }

   /* If a should_remat_callback is given, call it on each of the live values
    * for each call site. If it returns true we need to rematerialize that
    * instruction (instead of spill/fill). Therefore we need to add the
    * sources as live values so that we can rematerialize on top of those
    * spilled/filled sources.
    */
   if (options->should_remat_callback) {
      BITSET_WORD **updated_call_live =
         rzalloc_array(mem_ctx, BITSET_WORD *, num_calls);

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            nir_ssa_def *def = nir_instr_ssa_def(instr);
            if (def == NULL)
               continue;

            for (unsigned c = 0; c < num_calls; c++) {
               if (!BITSET_TEST(call_live[c], def->index))
                  continue;

               if (!options->should_remat_callback(def->parent_instr,
                                                   options->should_remat_data))
                  continue;

               if (updated_call_live[c] == NULL) {
                  const unsigned bitset_words = BITSET_WORDS(impl->ssa_alloc);
                  updated_call_live[c] = ralloc_array(mem_ctx, BITSET_WORD, bitset_words);
                  memcpy(updated_call_live[c], call_live[c], bitset_words * sizeof(BITSET_WORD));
               }

               nir_foreach_src(instr, add_src_to_call_live_bitset, updated_call_live[c]);
            }
         }
      }

      for (unsigned c = 0; c < num_calls; c++) {
         if (updated_call_live[c] != NULL)
            call_live[c] = updated_call_live[c];
      }
   }
   nir_builder before, after;
   before = nir_builder_create(impl);
   after = nir_builder_create(impl);

   call_idx = 0;
   unsigned max_scratch_size = shader->scratch_size;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (def != NULL) {
            if (can_remat_ssa_def(def, &trivial_remat)) {
               add_ssa_def_to_bitset(def, &trivial_remat);
               _mesa_hash_table_insert(trivial_remap_table, def, def);
            } else {
               spill_defs[def->index] = def;
            }
         }

         if (!instr_is_shader_call(instr))
            continue;

         const BITSET_WORD *live = call_live[call_idx];
         struct hash_table *remap_table =
            _mesa_hash_table_clone(trivial_remap_table, mem_ctx);

         /* Make a copy of trivial_remat that we'll update as we crawl through
          * the live SSA defs and unspill them.
          */
         struct sized_bitset remat = bitset_create(mem_ctx, num_ssa_defs);
         memcpy(remat.set, trivial_remat.set, live_words * sizeof(BITSET_WORD));

         /* Because the two builders are always separated by the call
          * instruction, it won't break anything to have two of them.
          */
         before.cursor = nir_before_instr(instr);
         after.cursor = nir_after_instr(instr);
         /* Array used to hold all the values needed to rematerialize a live
          * value.
          */
         struct util_dynarray remat_chain;
         util_dynarray_init(&remat_chain, mem_ctx);

         unsigned offset = shader->scratch_size;
         for (unsigned w = 0; w < live_words; w++) {
            BITSET_WORD spill_mask = live[w] & ~trivial_remat.set[w];

            while (spill_mask) {
               int i = u_bit_scan(&spill_mask);
               assert(i >= 0);

               unsigned index = w * BITSET_WORDBITS + i;
               assert(index < num_ssa_defs);
               def = spill_defs[index];

               nir_ssa_def *original_def = def, *new_def;
               if (can_remat_ssa_def(def, &remat)) {
                  /* If this SSA def is re-materializable or based on other
                   * things we've already spilled, re-materialize it rather
                   * than spilling and filling. Anything which is trivially
                   * re-materializable won't even get here because we take
                   * those into account in spill_mask above.
                   */
                  new_def = remat_ssa_def(&after, def, remap_table);
               } else if (can_remat_chain_ssa_def(def, &remat, &remat_chain)) {
                  new_def = remat_chain_ssa_def(&after, &remat_chain, &remat,
                                                fill_defs, call_idx,
                                                remap_table);
                  util_dynarray_clear(&remat_chain);
               } else {
                  bool is_bool = def->bit_size == 1;
                  if (is_bool)
                     def = nir_b2b32(&before, def);

                  const unsigned comp_size = def->bit_size / 8;
                  offset = ALIGN(offset, comp_size);
                  new_def = spill_fill(&before, &after, def,
                                       index, call_idx,
                                       offset, options->stack_alignment);

                  if (is_bool)
                     new_def = nir_b2b1(&after, new_def);

                  offset += def->num_components * comp_size;
               }

               /* Mark this SSA def as available in the remat set so that, if
                * some other SSA def we need is computed based on it, we can
                * just re-compute instead of fetching from memory.
                */
               BITSET_SET(remat.set, index);

               /* For now, we just make a note of this new SSA def. We'll
                * fix things up with the phi builder as a second pass.
                */
               if (fill_defs[index] == NULL) {
                  fill_defs[index] =
                     rzalloc_array(fill_defs, nir_ssa_def *, num_calls);
               }
               fill_defs[index][call_idx] = new_def;
               _mesa_hash_table_insert(remap_table, original_def, new_def);
            }
         }

         nir_builder *b = &before;

         offset = ALIGN(offset, options->stack_alignment);
         max_scratch_size = MAX2(max_scratch_size, offset);

         /* First thing on the called shader's stack is the resume address
          * followed by a pointer to the payload.
          */
         nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr);

         /* Lower to generic intrinsics with information about the stack &
          * resume shader.
          */
         switch (call->intrinsic) {
         case nir_intrinsic_trace_ray: {
            nir_rt_trace_ray(b, call->src[0].ssa, call->src[1].ssa,
                             call->src[2].ssa, call->src[3].ssa,
                             call->src[4].ssa, call->src[5].ssa,
                             call->src[6].ssa, call->src[7].ssa,
                             call->src[8].ssa, call->src[9].ssa,
                             call->src[10].ssa,
                             .call_idx = call_idx, .stack_size = offset);
            break;
         }

         case nir_intrinsic_report_ray_intersection:
            unreachable("Any-hit shaders must be inlined");

         case nir_intrinsic_execute_callable: {
            nir_rt_execute_callable(b, call->src[0].ssa, call->src[1].ssa,
                                    .call_idx = call_idx, .stack_size = offset);
            break;
         }

         default:
            unreachable("Invalid shader call instruction");
         }

         nir_rt_resume(b, .call_idx = call_idx, .stack_size = offset);

         nir_instr_remove(&call->instr);

         call_idx++;
      }
   }
   assert(call_idx == num_calls);

   shader->scratch_size = max_scratch_size;
   struct nir_phi_builder *pb = nir_phi_builder_create(impl);

   struct pbv_array pbv_arr = {
      .arr = rzalloc_array(mem_ctx, struct nir_phi_builder_value *,
                           num_ssa_defs),
      .len = num_ssa_defs,
   };

   const unsigned block_words = BITSET_WORDS(impl->num_blocks);
   BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words);
   /* Go through and set up phi builder values for each spillable value which
    * we ever needed to spill at any point.
    */
   for (unsigned index = 0; index < num_ssa_defs; index++) {
      if (fill_defs[index] == NULL)
         continue;

      nir_ssa_def *def = spill_defs[index];

      memset(def_blocks, 0, block_words * sizeof(BITSET_WORD));
      BITSET_SET(def_blocks, def->parent_instr->block->index);
      for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) {
         if (fill_defs[index][call_idx] != NULL)
            BITSET_SET(def_blocks, call_block_indices[call_idx]);
      }

      pbv_arr.arr[index] = nir_phi_builder_add_value(pb, def->num_components,
                                                     def->bit_size, def_blocks);
   }
/* Walk the shader one more time and rewrite SSA defs as needed using the
* phi builder.
*/
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
nir_ssa_def *def = nir_instr_ssa_def(instr);
if (def != NULL) {
struct nir_phi_builder_value *pbv =
get_phi_builder_value_for_def(def, &pbv_arr);
if (pbv != NULL)
nir_phi_builder_value_set_block_def(pbv, block, def);
}
if (instr->type == nir_instr_type_phi)
continue;
nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &pbv_arr);
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
if (resume->intrinsic != nir_intrinsic_rt_resume)
continue;
call_idx = nir_intrinsic_call_idx(resume);
/* Technically, this is the wrong place to add the fill defs to the
* phi builder values because we haven't seen any of the load_scratch
* instructions for this call yet. However, we know based on how we
* emitted them that no value ever gets used until after the load
* instruction has been emitted, so this should be safe. If we ever
* fail validation due to this, it likely means a bug in our spilling
* code and not the phi re-construction code here.
*/
for (unsigned index = 0; index < num_ssa_defs; index++) {
if (fill_defs[index] && fill_defs[index][call_idx]) {
nir_phi_builder_value_set_block_def(pbv_arr.arr[index], block,
fill_defs[index][call_idx]);
}
}
}
nir_if *following_if = nir_block_get_following_if(block);
if (following_if) {
nir_ssa_def *new_def =
get_phi_builder_def_for_src(&following_if->condition,
&pbv_arr, block);
if (new_def != NULL)
nir_if_rewrite_condition(following_if, nir_src_for_ssa(new_def));
}
/* Handle phi sources that source from this block. We have to do this
* as a separate pass because the phi builder assumes that uses and
* defs are processed in an order that respects dominance. When we have
* loops, a phi source may be a back-edge so we have to handle it as if
* it were one of the last instructions in the predecessor block.
*/
nir_foreach_phi_src_leaving_block(block,
rewrite_instr_src_from_phi_builder,
&pbv_arr);
}
nir_phi_builder_finish(pb);
ralloc_free(mem_ctx);
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
}
static nir_instr *
find_resume_instr(nir_function_impl *impl, unsigned call_idx)
{
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
if (resume->intrinsic != nir_intrinsic_rt_resume)
continue;
if (nir_intrinsic_call_idx(resume) == call_idx)
return &resume->instr;
}
}
unreachable("Couldn't find resume instruction");
}
/* Walk the CF tree and duplicate the contents of every loop, one half runs on
* resume and the other half is for any post-resume loop iterations. We are
* careful in our duplication to ensure that resume_instr is in the resume
* half of the loop, though a copy of resume_instr will remain in the other
* half as well in case the same shader call happens twice.
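*
* Illustrative sketch of the transform (pseudocode, not exact NIR):
*
*    loop {
*       body;                // contains resume_instr
*    }
*
* becomes
*
*    resume_reg = true;      // stored once at the top of the shader
*    loop {
*       if (resume_reg) {
*          body;             // original half; resume_instr sets resume_reg = false
*       } else {
*          body;             // clone, runs post-resume iterations
*       }
*    }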
*/
static bool
duplicate_loop_bodies(nir_function_impl *impl, nir_instr *resume_instr)
{
nir_ssa_def *resume_reg = NULL;
for (nir_cf_node *node = resume_instr->block->cf_node.parent;
node->type != nir_cf_node_function; node = node->parent) {
if (node->type != nir_cf_node_loop)
continue;
nir_loop *loop = nir_cf_node_as_loop(node);
assert(!nir_loop_has_continue_construct(loop));
nir_builder b = nir_builder_create(impl);
if (resume_reg == NULL) {
/* We only create resume_reg if we encounter a loop. This way we can
* avoid re-validating the shader and calling nir_lower_reg_intrinsics_to_ssa
* in the case where it's just if-ladders.
*/
resume_reg = nir_decl_reg(&b, 1, 1, 0);
/* Initialize resume to true at the start of the shader, right after
* the register is declared.
*/
b.cursor = nir_after_instr(resume_reg->parent_instr);
nir_store_reg(&b, nir_imm_true(&b), resume_reg);
/* Set resume to false right after the resume instruction */
b.cursor = nir_after_instr(resume_instr);
nir_store_reg(&b, nir_imm_false(&b), resume_reg);
}
/* Before we go any further, make sure that everything which exits the
* loop or continues around to the top of the loop does so through
* registers. We're about to duplicate the loop body and we'll have
* serious trouble if we don't do this.
*/
nir_convert_loop_to_lcssa(loop);
nir_lower_phis_to_regs_block(nir_loop_first_block(loop));
nir_lower_phis_to_regs_block(
nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
nir_cf_list cf_list;
nir_cf_list_extract(&cf_list, &loop->body);
nir_if *_if = nir_if_create(impl->function->shader);
b.cursor = nir_after_cf_list(&loop->body);
_if->condition = nir_src_for_ssa(nir_load_reg(&b, resume_reg));
nir_cf_node_insert(nir_after_cf_list(&loop->body), &_if->cf_node);
nir_cf_list clone;
nir_cf_list_clone(&clone, &cf_list, &loop->cf_node, NULL);
/* Insert the clone in the else and the original in the then so that
* the resume_instr remains valid even after the duplication.
*/
nir_cf_reinsert(&cf_list, nir_before_cf_list(&_if->then_list));
nir_cf_reinsert(&clone, nir_before_cf_list(&_if->else_list));
}
if (resume_reg != NULL)
nir_metadata_preserve(impl, nir_metadata_none);
return resume_reg != NULL;
}
static bool
cf_node_contains_block(nir_cf_node *node, nir_block *block)
{
for (nir_cf_node *n = &block->cf_node; n != NULL; n = n->parent) {
if (n == node)
return true;
}
return false;
}
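/* Rewrite every phi in @block to the value it takes when entered from
* @pred. Used by flatten_resume_if_ladder() once we know that, on the path
* to the resume instruction, @pred is the only predecessor ever taken.
*/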
static void
rewrite_phis_to_pred(nir_block *block, nir_block *pred)
{
nir_foreach_phi(phi, block) {
ASSERTED bool found = false;
nir_foreach_phi_src(phi_src, phi) {
if (phi_src->pred == pred) {
found = true;
assert(phi_src->src.is_ssa);
nir_ssa_def_rewrite_uses(&phi->dest.ssa, phi_src->src.ssa);
break;
}
}
assert(found);
}
}
static bool
cursor_is_after_jump(nir_cursor cursor)
{
switch (cursor.option) {
case nir_cursor_before_instr:
case nir_cursor_before_block:
return false;
case nir_cursor_after_instr:
return cursor.instr->type == nir_instr_type_jump;
case nir_cursor_after_block:
return nir_block_ends_in_jump(cursor.block);
}
unreachable("Invalid cursor option");
}
/** Flattens if ladders leading up to a resume
*
* Given a resume_instr, this function flattens any if ladders leading to the
* resume instruction and deletes any code that cannot be encountered on a
* direct path to the resume instruction. This way we get, for the most part,
* straight-line control-flow up to the resume instruction.
*
* While we do this flattening, we also move any code which is in the remat
* set up to the top of the function or to the top of the resume portion of
* the current loop. We don't worry about control-flow as we do this because
* phis will never be in the remat set (see can_remat_instr) and so nothing
* control-dependent will ever need to be re-materialized. It is possible
* that this algorithm will preserve too many instructions by moving them to
* the top but we leave that for DCE to clean up. Any code not in the remat
* set is deleted because it's either unused in the continuation or else
* unspilled from a previous continuation and the unspill code is after the
* resume instruction.
*
* If, for instance, we have something like this:
*
* // block 0
* if (cond1) {
* // block 1
* } else {
* // block 2
* if (cond2) {
* // block 3
* resume;
* if (cond3) {
* // block 4
* }
* } else {
* // block 5
* }
* }
*
* then we know, because the resume instruction had to be encountered,
* that cond1 = false and cond2 = true and we lower as follows:
*
* // block 0
* // block 2
* // block 3
* resume;
* if (cond3) {
* // block 4
* }
*
* As you can see, the code in blocks 1 and 5 was removed because there is no
* path from the start of the shader to the resume instruction that executes
* blocks 1 or 5. Any remat code from blocks 0, 2, and 3 is preserved and
* moved to the top. If the resume instruction is inside a loop then we know
* a priori that it is of the form
*
* loop {
* if (resume) {
* // Contents containing resume_instr
* } else {
* // Second copy of contents
* }
* }
*
* In this case, we only descend into the first half of the loop. The second
* half is left alone as that portion is only ever executed after the resume
* instruction.
*/
static bool
flatten_resume_if_ladder(nir_builder *b,
nir_cf_node *parent_node,
struct exec_list *child_list,
bool child_list_contains_cursor,
nir_instr *resume_instr,
struct sized_bitset *remat)
{
nir_cf_list cf_list;
/* If our child list contains the cursor instruction then we start out
* before the cursor instruction. We need to know this so that we can skip
* moving instructions which are already before the cursor.
*/
bool before_cursor = child_list_contains_cursor;
nir_cf_node *resume_node = NULL;
foreach_list_typed_safe(nir_cf_node, child, node, child_list) {
switch (child->type) {
case nir_cf_node_block: {
nir_block *block = nir_cf_node_as_block(child);
if (b->cursor.option == nir_cursor_before_block &&
b->cursor.block == block) {
assert(before_cursor);
before_cursor = false;
}
nir_foreach_instr_safe(instr, block) {
if ((b->cursor.option == nir_cursor_before_instr ||
b->cursor.option == nir_cursor_after_instr) &&
b->cursor.instr == instr) {
assert(nir_cf_node_is_first(&block->cf_node));
assert(before_cursor);
before_cursor = false;
continue;
}
if (instr == resume_instr)
goto found_resume;
if (!before_cursor && can_remat_instr(instr, remat)) {
nir_instr_remove(instr);
nir_instr_insert(b->cursor, instr);
b->cursor = nir_after_instr(instr);
nir_ssa_def *def = nir_instr_ssa_def(instr);
BITSET_SET(remat->set, def->index);
}
}
if (b->cursor.option == nir_cursor_after_block &&
b->cursor.block == block) {
assert(before_cursor);
before_cursor = false;
}
break;
}
case nir_cf_node_if: {
assert(!before_cursor);
nir_if *_if = nir_cf_node_as_if(child);
if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->then_list,
false, resume_instr, remat)) {
resume_node = child;
rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
nir_if_last_then_block(_if));
goto found_resume;
}
if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->else_list,
false, resume_instr, remat)) {
resume_node = child;
rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
nir_if_last_else_block(_if));
goto found_resume;
}
break;
}
case nir_cf_node_loop: {
assert(!before_cursor);
nir_loop *loop = nir_cf_node_as_loop(child);
assert(!nir_loop_has_continue_construct(loop));
if (cf_node_contains_block(&loop->cf_node, resume_instr->block)) {
/* Thanks to our loop body duplication pass, every level of loop
* containing the resume instruction contains exactly three nodes:
* two blocks and an if. We don't want to lower away this if
* because it's the resume selection if. The resume half is
* always the then_list so that's what we want to flatten.
*/
nir_block *header = nir_loop_first_block(loop);
nir_if *_if = nir_cf_node_as_if(nir_cf_node_next(&header->cf_node));
/* We want to place anything re-materialized from inside the loop
* at the top of the resume half of the loop.
*/
nir_builder bl = nir_builder_at(nir_before_cf_list(&_if->then_list));
ASSERTED bool found =
flatten_resume_if_ladder(&bl, &_if->cf_node, &_if->then_list,
true, resume_instr, remat);
assert(found);
resume_node = child;
goto found_resume;
} else {
ASSERTED bool found =
flatten_resume_if_ladder(b, &loop->cf_node, &loop->body,
false, resume_instr, remat);
assert(!found);
}
break;
}
case nir_cf_node_function:
unreachable("Unsupported CF node type");
}
}
assert(!before_cursor);
/* If we got here, we didn't find the resume node or instruction. */
return false;
found_resume:
/* If we got here then we found either the resume node or the resume
* instruction in this CF list.
*/
if (resume_node) {
/* If the resume instruction is buried inside one of our child CF
* nodes, resume_node now points to that child.
*/
if (resume_node->type == nir_cf_node_if) {
/* Thanks to the recursive call, all of the interesting contents of
* resume_node have been copied before the cursor. We just need to
* copy the stuff after resume_node.
*/
nir_cf_extract(&cf_list, nir_after_cf_node(resume_node),
nir_after_cf_list(child_list));
} else {
/* The loop contains its own cursor and still has useful stuff in it.
* We want to move everything after and including the loop to before
* the cursor.
*/
assert(resume_node->type == nir_cf_node_loop);
nir_cf_extract(&cf_list, nir_before_cf_node(resume_node),
nir_after_cf_list(child_list));
}
} else {
/* If we found the resume instruction in one of our blocks, grab
* everything after it in the entire list (not just the one block), and
* place it before the cursor instr.
*/
nir_cf_extract(&cf_list, nir_after_instr(resume_instr),
nir_after_cf_list(child_list));
}
/* If the resume instruction is in the first block of the child_list,
* and the cursor is still before that block, the nir_cf_extract() may
* extract the block object pointed to by the cursor, and instead create
* a new one for the code before the resume. In such a case the cursor
* will be broken, as it will point to a block which is no longer
* in a function.
*
* Luckily, in both cases when this is possible, the intended cursor
* position is right before the child_list, so we can fix the cursor here.
*/
if (child_list_contains_cursor &&
b->cursor.option == nir_cursor_before_block &&
b->cursor.block->cf_node.parent == NULL)
b->cursor = nir_before_cf_list(child_list);
if (cursor_is_after_jump(b->cursor)) {
/* If the resume instruction is in a loop, it's possible cf_list ends
* in a break or continue instruction, in which case we don't want to
* insert anything. It's also possible we have an early return if
* someone hasn't lowered those yet. In either case, nothing after that
* point executes in this context so we can delete it.
*/
nir_cf_delete(&cf_list);
} else {
b->cursor = nir_cf_reinsert(&cf_list, b->cursor);
}
if (!resume_node) {
/* We want the resume to be the first "interesting" instruction */
nir_instr_remove(resume_instr);
nir_instr_insert(nir_before_cf_list(&b->impl->body), resume_instr);
}
/* We've copied everything interesting out of this CF list to before the
* cursor. Delete everything else.
*/
if (child_list_contains_cursor) {
nir_cf_extract(&cf_list, b->cursor, nir_after_cf_list(child_list));
} else {
nir_cf_list_extract(&cf_list, child_list);
}
nir_cf_delete(&cf_list);
return true;
}
typedef bool (*wrap_instr_callback)(nir_instr *instr);
static bool
wrap_instr(nir_builder *b, nir_instr *instr, void *data)
{
wrap_instr_callback callback = data;
if (!callback(instr))
return false;
b->cursor = nir_before_instr(instr);
nir_if *_if = nir_push_if(b, nir_imm_true(b));
nir_pop_if(b, NULL);
nir_cf_list cf_list;
nir_cf_extract(&cf_list, nir_before_instr(instr), nir_after_instr(instr));
nir_cf_reinsert(&cf_list, nir_before_block(nir_if_first_then_block(_if)));
return true;
}
/* This pass wraps jump instructions in a dummy if block so that when
* flatten_resume_if_ladder() does its job, it doesn't move a jump instruction
* directly in front of another instruction which the NIR control flow helpers
* do not allow.
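*
* Illustrative sketch: a bare
*
*    break;
*
* becomes
*
*    if (true) {
*       break;
*    }
*
* which nir_opt_if() can trivially clean up again once the flattening is
* done.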
*/
static bool
wrap_instrs(nir_shader *shader, wrap_instr_callback callback)
{
return nir_shader_instructions_pass(shader, wrap_instr,
nir_metadata_none, callback);
}
static bool
instr_is_jump(nir_instr *instr)
{
return instr->type == nir_instr_type_jump;
}
static nir_instr *
lower_resume(nir_shader *shader, int call_idx)
{
wrap_instrs(shader, instr_is_jump);
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_instr *resume_instr = find_resume_instr(impl, call_idx);
if (duplicate_loop_bodies(impl, resume_instr)) {
nir_validate_shader(shader, "after duplicate_loop_bodies in "
"nir_lower_shader_calls");
/* If we duplicated the bodies of any loops, run reg_intrinsics_to_ssa to
* get rid of all those pesky registers we just added.
*/
NIR_PASS_V(shader, nir_lower_reg_intrinsics_to_ssa);
}
/* Re-index nir_ssa_def::index. We don't care about actual liveness in
* this pass, but to reuse the same helpers as the spilling pass we need
* to make sure that live_index is something sane. It's used
* constantly for determining if an SSA value has been added since the
* start of the pass.
*/
nir_index_ssa_defs(impl);
void *mem_ctx = ralloc_context(shader);
/* Used to track which things may have been assumed to be re-materialized
* by the spilling pass and which we shouldn't delete.
*/
struct sized_bitset remat = bitset_create(mem_ctx, impl->ssa_alloc);
/* Position a builder cursor at the top of the function body to use as we
* extract and re-insert stuff into the CFG.
*/
nir_builder b = nir_builder_at(nir_before_cf_list(&impl->body));
ASSERTED bool found =
flatten_resume_if_ladder(&b, &impl->cf_node, &impl->body,
true, resume_instr, &remat);
assert(found);
ralloc_free(mem_ctx);
nir_metadata_preserve(impl, nir_metadata_none);
nir_validate_shader(shader, "after flatten_resume_if_ladder in "
"nir_lower_shader_calls");
return resume_instr;
}
static void
replace_resume_with_halt(nir_shader *shader, nir_instr *keep)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_builder b = nir_builder_create(impl);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr == keep)
continue;
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
if (resume->intrinsic != nir_intrinsic_rt_resume)
continue;
/* If this is some other resume, then we've kicked off a ray or
* bindless thread and we don't want to go any further in this
* shader. Insert a halt so that NIR will delete any instructions
* dominated by this call instruction including the scratch_load
* instructions we inserted.
*/
nir_cf_list cf_list;
nir_cf_extract(&cf_list, nir_after_instr(&resume->instr),
nir_after_block(block));
nir_cf_delete(&cf_list);
b.cursor = nir_instr_remove(&resume->instr);
nir_jump(&b, nir_jump_halt);
break;
}
}
}
struct lower_scratch_state {
nir_address_format address_format;
};
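/* Lower load_stack/store_stack either to global memory access relative to
* the scratch base pointer (nir_address_format_64bit_global) or to plain
* scratch load/store at the intrinsic's base offset
* (nir_address_format_32bit_offset).
*/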
static bool
lower_stack_instr_to_scratch(struct nir_builder *b, nir_instr *instr, void *data)
{
struct lower_scratch_state *state = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *stack = nir_instr_as_intrinsic(instr);
switch (stack->intrinsic) {
case nir_intrinsic_load_stack: {
b->cursor = nir_instr_remove(instr);
nir_ssa_def *data, *old_data = nir_instr_ssa_def(instr);
if (state->address_format == nir_address_format_64bit_global) {
nir_ssa_def *addr = nir_iadd_imm(b,
nir_load_scratch_base_ptr(b, 1, 64, 1),
nir_intrinsic_base(stack));
data = nir_load_global(b, addr,
nir_intrinsic_align_mul(stack),
stack->dest.ssa.num_components,
stack->dest.ssa.bit_size);
} else {
assert(state->address_format == nir_address_format_32bit_offset);
data = nir_load_scratch(b,
old_data->num_components,
old_data->bit_size,
nir_imm_int(b, nir_intrinsic_base(stack)),
.align_mul = nir_intrinsic_align_mul(stack));
}
nir_ssa_def_rewrite_uses(old_data, data);
break;
}
case nir_intrinsic_store_stack: {
b->cursor = nir_instr_remove(instr);
nir_ssa_def *data = stack->src[0].ssa;
if (state->address_format == nir_address_format_64bit_global) {
nir_ssa_def *addr = nir_iadd_imm(b,
nir_load_scratch_base_ptr(b, 1, 64, 1),
nir_intrinsic_base(stack));
nir_store_global(b, addr,
nir_intrinsic_align_mul(stack),
data,
BITFIELD_MASK(data->num_components));
} else {
assert(state->address_format == nir_address_format_32bit_offset);
nir_store_scratch(b, data,
nir_imm_int(b, nir_intrinsic_base(stack)),
.align_mul = nir_intrinsic_align_mul(stack),
.write_mask = BITFIELD_MASK(data->num_components));
}
break;
}
default:
return false;
}
return true;
}
static bool
nir_lower_stack_to_scratch(nir_shader *shader,
nir_address_format address_format)
{
struct lower_scratch_state state = {
.address_format = address_format,
};
return nir_shader_instructions_pass(shader,
lower_stack_instr_to_scratch,
nir_metadata_block_index |
nir_metadata_dominance,
&state);
}
static bool
opt_remove_respills_instr(struct nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *store_intrin = nir_instr_as_intrinsic(instr);
if (store_intrin->intrinsic != nir_intrinsic_store_stack)
return false;
nir_instr *value_instr = store_intrin->src[0].ssa->parent_instr;
if (value_instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *load_intrin = nir_instr_as_intrinsic(value_instr);
if (load_intrin->intrinsic != nir_intrinsic_load_stack)
return false;
if (nir_intrinsic_base(load_intrin) != nir_intrinsic_base(store_intrin))
return false;
nir_instr_remove(&store_intrin->instr);
return true;
}
/* After shader split, look at stack load/store operations. If we're loading
* and storing the same value at the same location, we can drop the store
* instruction.
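*
* Illustrative sketch (NIR-like pseudocode, hypothetical base):
*
*    %v = @load_stack (base=32, ...)
*    @store_stack (%v) (base=32, ...)   <- removed as a redundant respill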
*/
static bool
nir_opt_remove_respills(nir_shader *shader)
{
return nir_shader_instructions_pass(shader,
opt_remove_respills_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
}
static void
add_use_mask(struct hash_table_u64 *offset_to_mask,
unsigned offset, unsigned mask)
{
uintptr_t old_mask = (uintptr_t)
_mesa_hash_table_u64_search(offset_to_mask, offset);
_mesa_hash_table_u64_insert(offset_to_mask, offset,
(void *)(uintptr_t)(old_mask | mask));
}
/* When splitting the shaders, we might have inserted stores & loads of
* vec4s, because a live value has 4 components. But sometimes only some
* components of that vec4 are used after the scratch load. This pass
* removes the unused components of scratch load/stores.
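*
* Illustrative sketch (hypothetical value_id and masks): if only .x and .z
* of a stored vec4 are ever read back,
*
*    @store_stack (%v.xyzw) (value_id=3, write_mask=0xf)
*    %l = @load_stack (value_id=3)          ; 4 components
*
* is trimmed to
*
*    @store_stack (%v.xz) (value_id=3, write_mask=0x3)
*    %l = @load_stack (value_id=3)          ; 2 components, uses reswizzled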
*/
static bool
nir_opt_trim_stack_values(nir_shader *shader)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
struct hash_table_u64 *value_id_to_mask = _mesa_hash_table_u64_create(NULL);
bool progress = false;
/* Find all the loads and how their value is being used */
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_stack)
continue;
const unsigned value_id = nir_intrinsic_value_id(intrin);
const unsigned mask =
nir_ssa_def_components_read(nir_instr_ssa_def(instr));
add_use_mask(value_id_to_mask, value_id, mask);
}
}
/* For each store, if it stores more than is being used, trim it.
* Otherwise, remove it from the hash table.
*/
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_stack)
continue;
const unsigned value_id = nir_intrinsic_value_id(intrin);
const unsigned write_mask = nir_intrinsic_write_mask(intrin);
const unsigned read_mask = (uintptr_t)
_mesa_hash_table_u64_search(value_id_to_mask, value_id);
/* Already removed from the table, nothing to do */
if (read_mask == 0)
continue;
/* Matching read/write mask, nothing to do, remove from the table. */
if (write_mask == read_mask) {
_mesa_hash_table_u64_remove(value_id_to_mask, value_id);
continue;
}
nir_builder b = nir_builder_at(nir_before_instr(instr));
nir_ssa_def *value = nir_channels(&b, intrin->src[0].ssa, read_mask);
nir_instr_rewrite_src_ssa(instr, &intrin->src[0], value);
intrin->num_components = util_bitcount(read_mask);
nir_intrinsic_set_write_mask(intrin, (1u << intrin->num_components) - 1);
progress = true;
}
}
/* For each load remaining in the hash table (only the ones we changed the
* number of components of), apply trimming/reswizzling.
*/
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_stack)
continue;
const unsigned value_id = nir_intrinsic_value_id(intrin);
unsigned read_mask = (uintptr_t)
_mesa_hash_table_u64_search(value_id_to_mask, value_id);
if (read_mask == 0)
continue;
unsigned swiz_map[NIR_MAX_VEC_COMPONENTS] = { 0, };
unsigned swiz_count = 0;
u_foreach_bit(idx, read_mask)
swiz_map[idx] = swiz_count++;
nir_ssa_def *def = nir_instr_ssa_def(instr);
nir_foreach_use_safe(use_src, def) {
if (use_src->parent_instr->type == nir_instr_type_alu) {
nir_alu_instr *alu = nir_instr_as_alu(use_src->parent_instr);
nir_alu_src *alu_src = exec_node_data(nir_alu_src, use_src, src);
unsigned write_mask = alu->dest.write_mask;
u_foreach_bit(idx, write_mask)
alu_src->swizzle[idx] = swiz_map[alu_src->swizzle[idx]];
} else if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *use_intrin =
nir_instr_as_intrinsic(use_src->parent_instr);
assert(nir_intrinsic_has_write_mask(use_intrin));
unsigned write_mask = nir_intrinsic_write_mask(use_intrin);
unsigned new_write_mask = 0;
u_foreach_bit(idx, write_mask)
new_write_mask |= 1 << swiz_map[idx];
nir_intrinsic_set_write_mask(use_intrin, new_write_mask);
} else {
unreachable("invalid instruction type");
}
}
intrin->dest.ssa.num_components = intrin->num_components = swiz_count;
progress = true;
}
}
nir_metadata_preserve(impl,
progress ?
(nir_metadata_dominance |
nir_metadata_block_index |
nir_metadata_loop_analysis) :
nir_metadata_all);
_mesa_hash_table_u64_destroy(value_id_to_mask);
return progress;
}
struct scratch_item {
unsigned old_offset;
unsigned new_offset;
unsigned bit_size;
unsigned num_components;
unsigned value;
unsigned call_idx;
};
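/* qsort comparator: descending bit size first, then ascending value_id
* within the same size. E.g. hypothetical items {32-bit, id 5},
* {64-bit, id 7}, {32-bit, id 2} sort as {64-bit, id 7}, {32-bit, id 2},
* {32-bit, id 5}.
*/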
static int
sort_scratch_item_by_size_and_value_id(const void *_item1, const void *_item2)
{
const struct scratch_item *item1 = _item1;
const struct scratch_item *item2 = _item2;
/* By ascending value_id */
if (item1->bit_size == item2->bit_size)
return (int) item1->value - (int) item2->value;
/* By descending size */
return (int) item2->bit_size - (int) item1->bit_size;
}
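/* For each call, sort the spilled values by bit size so that wider values
* come first, then repack them tightly from start_call_scratch upward.
* Illustrative packing (hypothetical sizes, start_call_scratch = 0): a
* 16-byte value followed by two 4-byte values lands at offsets 0, 16 and
* 20, and the resulting size is rounded up to stack_alignment.
*/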
static bool
nir_opt_sort_and_pack_stack(nir_shader *shader,
unsigned start_call_scratch,
unsigned stack_alignment,
unsigned num_calls)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
void *mem_ctx = ralloc_context(NULL);
struct hash_table_u64 *value_id_to_item =
_mesa_hash_table_u64_create(mem_ctx);
struct util_dynarray ops;
util_dynarray_init(&ops, mem_ctx);
for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) {
_mesa_hash_table_u64_clear(value_id_to_item);
util_dynarray_clear(&ops);
/* Find all the stack loads and their offsets. */
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_stack)
continue;
if (nir_intrinsic_call_idx(intrin) != call_idx)
continue;
const unsigned value_id = nir_intrinsic_value_id(intrin);
nir_ssa_def *def = nir_instr_ssa_def(instr);
assert(_mesa_hash_table_u64_search(value_id_to_item,
value_id) == NULL);
struct scratch_item item = {
.old_offset = nir_intrinsic_base(intrin),
.bit_size = def->bit_size,
.num_components = def->num_components,
.value = value_id,
};
util_dynarray_append(&ops, struct scratch_item, item);
_mesa_hash_table_u64_insert(value_id_to_item, value_id, (void *)(uintptr_t)true);
}
}
/* Sort scratch items by bit size and value_id. */
qsort(util_dynarray_begin(&ops),
util_dynarray_num_elements(&ops, struct scratch_item),
sizeof(struct scratch_item),
sort_scratch_item_by_size_and_value_id);
/* Reorder things on the stack */
_mesa_hash_table_u64_clear(value_id_to_item);
unsigned scratch_size = start_call_scratch;
util_dynarray_foreach(&ops, struct scratch_item, item) {
item->new_offset = ALIGN(scratch_size, item->bit_size / 8);
scratch_size = item->new_offset + (item->bit_size * item->num_components) / 8;
_mesa_hash_table_u64_insert(value_id_to_item, item->value, item);
}
shader->scratch_size = ALIGN(scratch_size, stack_alignment);
/* Update offsets in the instructions */
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_stack:
case nir_intrinsic_store_stack: {
if (nir_intrinsic_call_idx(intrin) != call_idx)
continue;
struct scratch_item *item =
_mesa_hash_table_u64_search(value_id_to_item,
nir_intrinsic_value_id(intrin));
assert(item);
nir_intrinsic_set_base(intrin, item->new_offset);
break;
}
case nir_intrinsic_rt_trace_ray:
case nir_intrinsic_rt_execute_callable:
case nir_intrinsic_rt_resume:
if (nir_intrinsic_call_idx(intrin) != call_idx)
continue;
nir_intrinsic_set_stack_size(intrin, shader->scratch_size);
break;
default:
break;
}
}
}
}
ralloc_free(mem_ctx);
nir_shader_preserve_all_metadata(shader);
return true;
}
static unsigned
nir_block_loop_depth(nir_block *block)
{
nir_cf_node *node = &block->cf_node;
unsigned loop_depth = 0;
while (node != NULL) {
if (node->type == nir_cf_node_loop)
loop_depth++;
node = node->parent;
}
return loop_depth;
}
/* Find the last block dominating all the uses of a SSA value. */
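/* Blocks are visited in reverse index order, so the first candidate that
* dominates every use, without sitting deeper in the loop nest than the
* defining block, is the latest legal placement.
*/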
static nir_block *
find_last_dominant_use_block(nir_function_impl *impl, nir_ssa_def *value)
{
nir_block *old_block = value->parent_instr->block;
unsigned old_block_loop_depth = nir_block_loop_depth(old_block);
nir_foreach_block_reverse_safe(block, impl) {
bool fits = true;
/* Reached the block the value currently lives in, keep it there. */
if (block == old_block)
return block;
/* Don't move instructions deeper into loops; this would generate more
* memory traffic.
*/
unsigned block_loop_depth = nir_block_loop_depth(block);
if (block_loop_depth > old_block_loop_depth)
continue;
nir_foreach_if_use(src, value) {
nir_block *block_before_if =
nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node));
if (!nir_block_dominates(block, block_before_if)) {
fits = false;
break;
}
}
if (!fits)
continue;
nir_foreach_use(src, value) {
if (src->parent_instr->type == nir_instr_type_phi &&
block == src->parent_instr->block) {
fits = false;
break;
}
if (!nir_block_dominates(block, src->parent_instr->block)) {
fits = false;
break;
}
}
if (!fits)
continue;
return block;
}
unreachable("Cannot find block");
}
/* Put the scratch loads in the branches where they're needed. */
static bool
nir_opt_stack_loads(nir_shader *shader)
{
bool progress = false;
nir_foreach_function_impl(impl, shader) {
nir_metadata_require(impl, nir_metadata_dominance |
nir_metadata_block_index);
bool func_progress = false;
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_stack)
continue;
nir_ssa_def *value = &intrin->dest.ssa;
nir_block *new_block = find_last_dominant_use_block(impl, value);
if (new_block == block)
continue;
/* Move the scratch load into the new block, after the phis. */
nir_instr_remove(instr);
nir_instr_insert(nir_before_block_after_phis(new_block), instr);
func_progress = true;
}
}
nir_metadata_preserve(impl,
func_progress ? (nir_metadata_block_index |
nir_metadata_dominance |
nir_metadata_loop_analysis) :
nir_metadata_all);
progress |= func_progress;
}
return progress;
}
static bool
split_stack_components_instr(struct nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_stack &&
intrin->intrinsic != nir_intrinsic_store_stack)
return false;
if (intrin->intrinsic == nir_intrinsic_load_stack &&
intrin->dest.ssa.num_components == 1)
return false;
if (intrin->intrinsic == nir_intrinsic_store_stack &&
intrin->src[0].ssa->num_components == 1)
return false;
b->cursor = nir_before_instr(instr);
if (intrin->intrinsic == nir_intrinsic_load_stack) {
nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS] = { 0, };
for (unsigned c = 0; c < intrin->dest.ssa.num_components; c++) {
components[c] = nir_load_stack(b, 1, intrin->dest.ssa.bit_size,
.base = nir_intrinsic_base(intrin) +
c * intrin->dest.ssa.bit_size / 8,
.call_idx = nir_intrinsic_call_idx(intrin),
.value_id = nir_intrinsic_value_id(intrin),
.align_mul = nir_intrinsic_align_mul(intrin));
}
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
nir_vec(b, components,
intrin->dest.ssa.num_components));
} else {
assert(intrin->intrinsic == nir_intrinsic_store_stack);
for (unsigned c = 0; c < intrin->src[0].ssa->num_components; c++) {
nir_store_stack(b, nir_channel(b, intrin->src[0].ssa, c),
.base = nir_intrinsic_base(intrin) +
c * intrin->src[0].ssa->bit_size / 8,
.call_idx = nir_intrinsic_call_idx(intrin),
.align_mul = nir_intrinsic_align_mul(intrin),
.value_id = nir_intrinsic_value_id(intrin),
.write_mask = 0x1);
}
}
nir_instr_remove(instr);
return true;
}
/* Break the load_stack/store_stack intrinsics into single components. This
* helps the vectorizer to pack components.
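*
* Illustrative sketch (hypothetical base): a 2-component 32-bit store
*
*    @store_stack (%v.xy) (base=8, write_mask=0x3)
*
* becomes
*
*    @store_stack (%v.x) (base=8,  write_mask=0x1)
*    @store_stack (%v.y) (base=12, write_mask=0x1)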
*/
static bool
nir_split_stack_components(nir_shader *shader)
{
return nir_shader_instructions_pass(shader,
split_stack_components_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
}
struct stack_op_vectorizer_state {
nir_should_vectorize_mem_func driver_callback;
void *driver_data;
};
static bool
should_vectorize(unsigned align_mul,
unsigned align_offset,
unsigned bit_size,
unsigned num_components,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{
/* We only care about those intrinsics */
if ((low->intrinsic != nir_intrinsic_load_stack &&
low->intrinsic != nir_intrinsic_store_stack) ||
(high->intrinsic != nir_intrinsic_load_stack &&
high->intrinsic != nir_intrinsic_store_stack))
return false;
struct stack_op_vectorizer_state *state = data;
return state->driver_callback(align_mul, align_offset,
bit_size, num_components,
low, high, state->driver_data);
}
/** Lower shader call instructions to split shaders.
*
* Shader calls can be split into an initial shader and a series of "resume"
* shaders. When the shader is first invoked, it is the initial shader which
* is executed. At any point in the initial shader or any one of the resume
* shaders, a shader call operation may be performed. The possible shader call
* operations are:
*
* - trace_ray
* - report_ray_intersection
* - execute_callable
*
* When a shader call operation is performed, we push all live values to the
* stack, call rt_trace_ray/rt_execute_callable and then kill the shader. Once
* the operation we invoked is complete, a callee shader will return execution
* to the respective resume shader. The resume shader pops the contents off
* the stack and picks up where the calling shader left off.
*
* Stack management is assumed to be done after this pass. Call
* instructions and their resumes get annotated with stack information that
* should be enough for the backend to implement proper stack management.
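*
* Illustrative sketch: a shader with two shader call sites comes back as an
* initial shader running up to the first call plus two resume shaders (one
* per call), each of which unspills the stack and continues after its call
* site.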
*/
bool
nir_lower_shader_calls(nir_shader *shader,
const nir_lower_shader_calls_options *options,
nir_shader ***resume_shaders_out,
uint32_t *num_resume_shaders_out,
void *mem_ctx)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
int num_calls = 0;
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr_is_shader_call(instr))
num_calls++;
}
}
if (num_calls == 0) {
nir_shader_preserve_all_metadata(shader);
*num_resume_shaders_out = 0;
return false;
}
/* Some intrinsics not only can't be re-materialized but aren't preserved
* when moving to the continuation shader. We have to move them to the top
* to ensure they get spilled as needed.
*/
{
bool progress = false;
NIR_PASS(progress, shader, move_system_values_to_top);
if (progress)
NIR_PASS(progress, shader, nir_opt_cse);
}
/* Deref chains contain metadata information that is needed by other passes
* after this one. If we don't rematerialize the derefs in the blocks where
* they're used here, the following lowerings will insert phis which can
* prevent other passes from chasing deref chains. Additionally, derefs need
* to be rematerialized after shader call instructions to avoid spilling.
*/
{
bool progress = false;
NIR_PASS(progress, shader, wrap_instrs, instr_is_shader_call);
nir_rematerialize_derefs_in_use_blocks_impl(impl);
if (progress)
NIR_PASS(_, shader, nir_opt_dead_cf);
}
/* Save the start point of the call stack in scratch */
unsigned start_call_scratch = shader->scratch_size;
NIR_PASS_V(shader, spill_ssa_defs_and_lower_shader_calls,
num_calls, options);
NIR_PASS_V(shader, nir_opt_remove_phis);
NIR_PASS_V(shader, nir_opt_trim_stack_values);
NIR_PASS_V(shader, nir_opt_sort_and_pack_stack,
start_call_scratch, options->stack_alignment, num_calls);
/* Make N copies of our shader */
nir_shader **resume_shaders = ralloc_array(mem_ctx, nir_shader *, num_calls);
for (unsigned i = 0; i < num_calls; i++) {
resume_shaders[i] = nir_shader_clone(mem_ctx, shader);
/* Give them a recognizable name */
resume_shaders[i]->info.name =
ralloc_asprintf(mem_ctx, "%s%sresume_%u",
shader->info.name ? shader->info.name : "",
shader->info.name ? "-" : "",
i);
}
replace_resume_with_halt(shader, NULL);
nir_opt_dce(shader);
nir_opt_dead_cf(shader);
for (unsigned i = 0; i < num_calls; i++) {
nir_instr *resume_instr = lower_resume(resume_shaders[i], i);
replace_resume_with_halt(resume_shaders[i], resume_instr);
/* Remove the dummy blocks added by flatten_resume_if_ladder() */
nir_opt_if(resume_shaders[i], nir_opt_if_optimize_phi_true_false);
nir_opt_dce(resume_shaders[i]);
nir_opt_dead_cf(resume_shaders[i]);
nir_opt_remove_phis(resume_shaders[i]);
}
for (unsigned i = 0; i < num_calls; i++)
NIR_PASS_V(resume_shaders[i], nir_opt_remove_respills);
if (options->localized_loads) {
/* Once loads have been combined we can try to put them closer to where
* they're needed.
*/
for (unsigned i = 0; i < num_calls; i++)
NIR_PASS_V(resume_shaders[i], nir_opt_stack_loads);
}
struct stack_op_vectorizer_state vectorizer_state = {
.driver_callback = options->vectorizer_callback,
.driver_data = options->vectorizer_data,
};
nir_load_store_vectorize_options vect_opts = {
.modes = nir_var_shader_temp,
.callback = should_vectorize,
.cb_data = &vectorizer_state,
};
if (options->vectorizer_callback != NULL) {
NIR_PASS_V(shader, nir_split_stack_components);
NIR_PASS_V(shader, nir_opt_load_store_vectorize, &vect_opts);
}
NIR_PASS_V(shader, nir_lower_stack_to_scratch, options->address_format);
nir_opt_cse(shader);
for (unsigned i = 0; i < num_calls; i++) {
if (options->vectorizer_callback != NULL) {
NIR_PASS_V(resume_shaders[i], nir_split_stack_components);
NIR_PASS_V(resume_shaders[i], nir_opt_load_store_vectorize, &vect_opts);
}
NIR_PASS_V(resume_shaders[i], nir_lower_stack_to_scratch,
options->address_format);
nir_opt_cse(resume_shaders[i]);
}
*resume_shaders_out = resume_shaders;
*num_resume_shaders_out = num_calls;
return true;
}