2010-08-27 13:59:49 -07:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
|
|
|
* DEALINGS IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "glsl_types.h"
|
|
|
|
|
#include "loop_analysis.h"
|
|
|
|
|
#include "ir_hierarchical_visitor.h"
|
|
|
|
|
|
2014-04-08 19:58:36 -07:00
|
|
|
#include "main/mtypes.h"
|
|
|
|
|
|
2013-09-20 11:03:44 -07:00
|
|
|
namespace {
|
|
|
|
|
|
2010-08-27 13:59:49 -07:00
|
|
|
class loop_unroll_visitor : public ir_hierarchical_visitor {
|
|
|
|
|
public:
|
2014-04-08 19:58:36 -07:00
|
|
|
loop_unroll_visitor(loop_state *state,
|
|
|
|
|
const struct gl_shader_compiler_options *options)
|
2010-08-27 13:59:49 -07:00
|
|
|
{
|
|
|
|
|
this->state = state;
|
|
|
|
|
this->progress = false;
|
2014-04-08 19:58:36 -07:00
|
|
|
this->options = options;
|
2010-08-27 13:59:49 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual ir_visitor_status visit_leave(ir_loop *ir);
|
2013-11-28 22:12:08 -08:00
|
|
|
void simple_unroll(ir_loop *ir, int iterations);
|
|
|
|
|
void complex_unroll(ir_loop *ir, int iterations,
|
|
|
|
|
bool continue_from_then_branch);
|
|
|
|
|
void splice_post_if_instructions(ir_if *ir_if, exec_list *splice_dest);
|
2010-08-27 13:59:49 -07:00
|
|
|
|
|
|
|
|
loop_state *state;
|
|
|
|
|
|
|
|
|
|
bool progress;
|
2014-04-08 19:58:36 -07:00
|
|
|
const struct gl_shader_compiler_options *options;
|
2010-08-27 13:59:49 -07:00
|
|
|
};
|
|
|
|
|
|
2013-09-20 11:03:44 -07:00
|
|
|
} /* anonymous namespace */
|
2010-08-27 13:59:49 -07:00
|
|
|
|
2010-12-01 15:06:47 -08:00
|
|
|
static bool
|
|
|
|
|
is_break(ir_instruction *ir)
|
|
|
|
|
{
|
|
|
|
|
return ir != NULL && ir->ir_type == ir_type_loop_jump
|
|
|
|
|
&& ((ir_loop_jump *) ir)->is_break();
|
|
|
|
|
}
|
|
|
|
|
|
2012-02-21 13:37:49 -08:00
|
|
|
class loop_unroll_count : public ir_hierarchical_visitor {
|
|
|
|
|
public:
|
|
|
|
|
int nodes;
|
glsl: Ignore loop-too-large heuristic if there's bad variable indexing.
Many shaders use a pattern such as:
for (int i = 0; i < NUM_LIGHTS; i++) {
...access a uniform array, or shader input/output array...
}
where NUM_LIGHTS is a small constant (such as 2, 4, or 8).
The expectation is that the compiler will unroll those loops, turning
the array access into constant indexing, which is more efficient, and
which may enable array splitting and other optimizations.
In many cases, our heuristic fails - either there's another tiny nested
loop inside, or the estimated number of instructions is just barely
beyond the threshold. So, we fail to unroll the loop, leaving the
variable indexing in place.
Drivers which don't support the particular flavor of variable indexing
will call lower_variable_index_to_cond_assign(), which generates piles
and piles of immensely inefficient code. We'd like to avoid generating
that.
This patch detects unsupported forms of variable-indexing in loops, where
the array index is a loop induction variable. In that case, it bypasses
the loop-too-large heuristic and forces unrolling.
Improves performance in various microbenchmarks: Gl32PSBump8 by 47%,
Gl32ShMapVsm by 80%, and Gl32ShMapPcf by 27%. No changes in shader-db.
v2: Check ir->array for being an array or matrix, rather than the
ir_dereference_array itself.
v3: Fix and expand statistics in commit message.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-04-08 18:09:43 -07:00
|
|
|
bool unsupported_variable_indexing;
|
glsl: Skip loop-too-large heuristic if indexing arrays of a certain size
A pattern in certain shaders is:
uniform vec4 colors[NUM_LIGHTS];
for (int i = 0; i < NUM_LIGHTS; i++) {
...use colors[i]...
}
In this case, the application author expects the shader compiler to
unroll the loop. By doing so, it replaces variable indexing of the
array with constant indexing, which is more efficient.
This patch extends the heuristic to see if arrays accessed within the
loop are indexed by an induction variable, and if the array size exactly
matches the number of loop iterations. If so, the application author
probably intended us to unroll it. If not, we rely on the existing
loop-too-large heuristic.
Improves performance in a phong shading microbenchmark by 2.88x, and a
shadow mapping microbenchmark by 1.63x. Without variable indexing, we
can upload the small uniform arrays as push constants instead of pull
constants, avoiding shader memory access. Affects several games, but
doesn't appear to impact their performance.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Acked-by: Kristian Høgsberg <krh@bitplanet.net>
2014-10-29 20:56:07 -07:00
|
|
|
bool array_indexed_by_induction_var_with_exact_iterations;
|
2014-04-08 15:33:27 -07:00
|
|
|
/* If there are nested loops, the node count will be inaccurate. */
|
|
|
|
|
bool nested_loop;
|
2012-02-21 13:37:49 -08:00
|
|
|
|
glsl: Ignore loop-too-large heuristic if there's bad variable indexing.
Many shaders use a pattern such as:
for (int i = 0; i < NUM_LIGHTS; i++) {
...access a uniform array, or shader input/output array...
}
where NUM_LIGHTS is a small constant (such as 2, 4, or 8).
The expectation is that the compiler will unroll those loops, turning
the array access into constant indexing, which is more efficient, and
which may enable array splitting and other optimizations.
In many cases, our heuristic fails - either there's another tiny nested
loop inside, or the estimated number of instructions is just barely
beyond the threshold. So, we fail to unroll the loop, leaving the
variable indexing in place.
Drivers which don't support the particular flavor of variable indexing
will call lower_variable_index_to_cond_assign(), which generates piles
and piles of immensely inefficient code. We'd like to avoid generating
that.
This patch detects unsupported forms of variable-indexing in loops, where
the array index is a loop induction variable. In that case, it bypasses
the loop-too-large heuristic and forces unrolling.
Improves performance in various microbenchmarks: Gl32PSBump8 by 47%,
Gl32ShMapVsm by 80%, and Gl32ShMapPcf by 27%. No changes in shader-db.
v2: Check ir->array for being an array or matrix, rather than the
ir_dereference_array itself.
v3: Fix and expand statistics in commit message.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-04-08 18:09:43 -07:00
|
|
|
loop_unroll_count(exec_list *list, loop_variable_state *ls,
|
|
|
|
|
const struct gl_shader_compiler_options *options)
|
|
|
|
|
: ls(ls), options(options)
|
2012-02-21 13:37:49 -08:00
|
|
|
{
|
|
|
|
|
nodes = 0;
|
2014-04-08 15:33:27 -07:00
|
|
|
nested_loop = false;
|
glsl: Ignore loop-too-large heuristic if there's bad variable indexing.
Many shaders use a pattern such as:
for (int i = 0; i < NUM_LIGHTS; i++) {
...access a uniform array, or shader input/output array...
}
where NUM_LIGHTS is a small constant (such as 2, 4, or 8).
The expectation is that the compiler will unroll those loops, turning
the array access into constant indexing, which is more efficient, and
which may enable array splitting and other optimizations.
In many cases, our heuristic fails - either there's another tiny nested
loop inside, or the estimated number of instructions is just barely
beyond the threshold. So, we fail to unroll the loop, leaving the
variable indexing in place.
Drivers which don't support the particular flavor of variable indexing
will call lower_variable_index_to_cond_assign(), which generates piles
and piles of immensely inefficient code. We'd like to avoid generating
that.
This patch detects unsupported forms of variable-indexing in loops, where
the array index is a loop induction variable. In that case, it bypasses
the loop-too-large heuristic and forces unrolling.
Improves performance in various microbenchmarks: Gl32PSBump8 by 47%,
Gl32ShMapVsm by 80%, and Gl32ShMapPcf by 27%. No changes in shader-db.
v2: Check ir->array for being an array or matrix, rather than the
ir_dereference_array itself.
v3: Fix and expand statistics in commit message.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-04-08 18:09:43 -07:00
|
|
|
unsupported_variable_indexing = false;
|
glsl: Skip loop-too-large heuristic if indexing arrays of a certain size
A pattern in certain shaders is:
uniform vec4 colors[NUM_LIGHTS];
for (int i = 0; i < NUM_LIGHTS; i++) {
...use colors[i]...
}
In this case, the application author expects the shader compiler to
unroll the loop. By doing so, it replaces variable indexing of the
array with constant indexing, which is more efficient.
This patch extends the heuristic to see if arrays accessed within the
loop are indexed by an induction variable, and if the array size exactly
matches the number of loop iterations. If so, the application author
probably intended us to unroll it. If not, we rely on the existing
loop-too-large heuristic.
Improves performance in a phong shading microbenchmark by 2.88x, and a
shadow mapping microbenchmark by 1.63x. Without variable indexing, we
can upload the small uniform arrays as push constants instead of pull
constants, avoiding shader memory access. Affects several games, but
doesn't appear to impact their performance.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Acked-by: Kristian Høgsberg <krh@bitplanet.net>
2014-10-29 20:56:07 -07:00
|
|
|
array_indexed_by_induction_var_with_exact_iterations = false;
|
2012-02-21 13:37:49 -08:00
|
|
|
|
|
|
|
|
run(list);
|
|
|
|
|
}
|
|
|
|
|
|
2014-03-27 14:17:53 -07:00
|
|
|
virtual ir_visitor_status visit_enter(ir_assignment *)
|
2012-02-21 13:37:49 -08:00
|
|
|
{
|
|
|
|
|
nodes++;
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
|
|
|
|
|
2014-03-27 14:17:53 -07:00
|
|
|
virtual ir_visitor_status visit_enter(ir_expression *)
|
2012-02-21 13:37:49 -08:00
|
|
|
{
|
|
|
|
|
nodes++;
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
|
|
|
|
|
2014-03-27 14:17:53 -07:00
|
|
|
virtual ir_visitor_status visit_enter(ir_loop *)
|
2012-02-21 13:37:49 -08:00
|
|
|
{
|
2014-04-08 15:33:27 -07:00
|
|
|
nested_loop = true;
|
2012-02-21 13:37:49 -08:00
|
|
|
return visit_continue;
|
|
|
|
|
}
|
glsl: Ignore loop-too-large heuristic if there's bad variable indexing.
Many shaders use a pattern such as:
for (int i = 0; i < NUM_LIGHTS; i++) {
...access a uniform array, or shader input/output array...
}
where NUM_LIGHTS is a small constant (such as 2, 4, or 8).
The expectation is that the compiler will unroll those loops, turning
the array access into constant indexing, which is more efficient, and
which may enable array splitting and other optimizations.
In many cases, our heuristic fails - either there's another tiny nested
loop inside, or the estimated number of instructions is just barely
beyond the threshold. So, we fail to unroll the loop, leaving the
variable indexing in place.
Drivers which don't support the particular flavor of variable indexing
will call lower_variable_index_to_cond_assign(), which generates piles
and piles of immensely inefficient code. We'd like to avoid generating
that.
This patch detects unsupported forms of variable-indexing in loops, where
the array index is a loop induction variable. In that case, it bypasses
the loop-too-large heuristic and forces unrolling.
Improves performance in various microbenchmarks: Gl32PSBump8 by 47%,
Gl32ShMapVsm by 80%, and Gl32ShMapPcf by 27%. No changes in shader-db.
v2: Check ir->array for being an array or matrix, rather than the
ir_dereference_array itself.
v3: Fix and expand statistics in commit message.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-04-08 18:09:43 -07:00
|
|
|
|
|
|
|
|
virtual ir_visitor_status visit_enter(ir_dereference_array *ir)
|
|
|
|
|
{
|
2015-06-09 13:33:39 +03:00
|
|
|
/* Force unroll in case of dynamic indexing with sampler arrays
|
|
|
|
|
* when EmitNoIndirectSampler is set.
|
|
|
|
|
*/
|
|
|
|
|
if (options->EmitNoIndirectSampler) {
|
|
|
|
|
if ((ir->array->type->is_array() &&
|
|
|
|
|
ir->array->type->contains_sampler()) &&
|
|
|
|
|
!ir->array_index->constant_expression_value()) {
|
|
|
|
|
unsupported_variable_indexing = true;
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
glsl: Ignore loop-too-large heuristic if there's bad variable indexing.
Many shaders use a pattern such as:
for (int i = 0; i < NUM_LIGHTS; i++) {
...access a uniform array, or shader input/output array...
}
where NUM_LIGHTS is a small constant (such as 2, 4, or 8).
The expectation is that the compiler will unroll those loops, turning
the array access into constant indexing, which is more efficient, and
which may enable array splitting and other optimizations.
In many cases, our heuristic fails - either there's another tiny nested
loop inside, or the estimated number of instructions is just barely
beyond the threshold. So, we fail to unroll the loop, leaving the
variable indexing in place.
Drivers which don't support the particular flavor of variable indexing
will call lower_variable_index_to_cond_assign(), which generates piles
and piles of immensely inefficient code. We'd like to avoid generating
that.
This patch detects unsupported forms of variable-indexing in loops, where
the array index is a loop induction variable. In that case, it bypasses
the loop-too-large heuristic and forces unrolling.
Improves performance in various microbenchmarks: Gl32PSBump8 by 47%,
Gl32ShMapVsm by 80%, and Gl32ShMapPcf by 27%. No changes in shader-db.
v2: Check ir->array for being an array or matrix, rather than the
ir_dereference_array itself.
v3: Fix and expand statistics in commit message.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-04-08 18:09:43 -07:00
|
|
|
/* Check for arrays variably-indexed by a loop induction variable.
|
|
|
|
|
* Unrolling the loop may convert that access into constant-indexing.
|
|
|
|
|
*
|
|
|
|
|
* Many drivers don't support particular kinds of variable indexing,
|
|
|
|
|
* and have to resort to using lower_variable_index_to_cond_assign to
|
|
|
|
|
* handle it. This results in huge amounts of horrible code, so we'd
|
|
|
|
|
* like to avoid that if possible. Here, we just note that it will
|
|
|
|
|
* happen.
|
|
|
|
|
*/
|
|
|
|
|
if ((ir->array->type->is_array() || ir->array->type->is_matrix()) &&
|
|
|
|
|
!ir->array_index->as_constant()) {
|
|
|
|
|
ir_variable *array = ir->array->variable_referenced();
|
|
|
|
|
loop_variable *lv = ls->get(ir->array_index->variable_referenced());
|
|
|
|
|
if (array && lv && lv->is_induction_var()) {
|
glsl: Skip loop-too-large heuristic if indexing arrays of a certain size
A pattern in certain shaders is:
uniform vec4 colors[NUM_LIGHTS];
for (int i = 0; i < NUM_LIGHTS; i++) {
...use colors[i]...
}
In this case, the application author expects the shader compiler to
unroll the loop. By doing so, it replaces variable indexing of the
array with constant indexing, which is more efficient.
This patch extends the heuristic to see if arrays accessed within the
loop are indexed by an induction variable, and if the array size exactly
matches the number of loop iterations. If so, the application author
probably intended us to unroll it. If not, we rely on the existing
loop-too-large heuristic.
Improves performance in a phong shading microbenchmark by 2.88x, and a
shadow mapping microbenchmark by 1.63x. Without variable indexing, we
can upload the small uniform arrays as push constants instead of pull
constants, avoiding shader memory access. Affects several games, but
doesn't appear to impact their performance.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Acked-by: Kristian Høgsberg <krh@bitplanet.net>
2014-10-29 20:56:07 -07:00
|
|
|
/* If an array is indexed by a loop induction variable, and the
|
|
|
|
|
* array size is exactly the number of loop iterations, this is
|
|
|
|
|
* probably a simple for-loop trying to access each element in
|
|
|
|
|
* turn; the application may expect it to be unrolled.
|
|
|
|
|
*/
|
|
|
|
|
if (int(array->type->length) == ls->limiting_terminator->iterations)
|
|
|
|
|
array_indexed_by_induction_var_with_exact_iterations = true;
|
|
|
|
|
|
glsl: Ignore loop-too-large heuristic if there's bad variable indexing.
Many shaders use a pattern such as:
for (int i = 0; i < NUM_LIGHTS; i++) {
...access a uniform array, or shader input/output array...
}
where NUM_LIGHTS is a small constant (such as 2, 4, or 8).
The expectation is that the compiler will unroll those loops, turning
the array access into constant indexing, which is more efficient, and
which may enable array splitting and other optimizations.
In many cases, our heuristic fails - either there's another tiny nested
loop inside, or the estimated number of instructions is just barely
beyond the threshold. So, we fail to unroll the loop, leaving the
variable indexing in place.
Drivers which don't support the particular flavor of variable indexing
will call lower_variable_index_to_cond_assign(), which generates piles
and piles of immensely inefficient code. We'd like to avoid generating
that.
This patch detects unsupported forms of variable-indexing in loops, where
the array index is a loop induction variable. In that case, it bypasses
the loop-too-large heuristic and forces unrolling.
Improves performance in various microbenchmarks: Gl32PSBump8 by 47%,
Gl32ShMapVsm by 80%, and Gl32ShMapPcf by 27%. No changes in shader-db.
v2: Check ir->array for being an array or matrix, rather than the
ir_dereference_array itself.
v3: Fix and expand statistics in commit message.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-04-08 18:09:43 -07:00
|
|
|
switch (array->data.mode) {
|
|
|
|
|
case ir_var_auto:
|
|
|
|
|
case ir_var_temporary:
|
|
|
|
|
case ir_var_const_in:
|
|
|
|
|
case ir_var_function_in:
|
|
|
|
|
case ir_var_function_out:
|
|
|
|
|
case ir_var_function_inout:
|
|
|
|
|
if (options->EmitNoIndirectTemp)
|
|
|
|
|
unsupported_variable_indexing = true;
|
|
|
|
|
break;
|
|
|
|
|
case ir_var_uniform:
|
|
|
|
|
if (options->EmitNoIndirectUniform)
|
|
|
|
|
unsupported_variable_indexing = true;
|
|
|
|
|
break;
|
|
|
|
|
case ir_var_shader_in:
|
|
|
|
|
if (options->EmitNoIndirectInput)
|
|
|
|
|
unsupported_variable_indexing = true;
|
|
|
|
|
break;
|
|
|
|
|
case ir_var_shader_out:
|
|
|
|
|
if (options->EmitNoIndirectOutput)
|
|
|
|
|
unsupported_variable_indexing = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
loop_variable_state *ls;
|
|
|
|
|
const struct gl_shader_compiler_options *options;
|
2012-02-21 13:37:49 -08:00
|
|
|
};
|
|
|
|
|
|
2010-12-01 15:06:47 -08:00
|
|
|
|
2013-11-28 22:12:08 -08:00
|
|
|
/**
|
|
|
|
|
* Unroll a loop which does not contain any jumps. For example, if the input
|
|
|
|
|
* is:
|
|
|
|
|
*
|
|
|
|
|
* (loop (...) ...instrs...)
|
|
|
|
|
*
|
|
|
|
|
* And the iteration count is 3, the output will be:
|
|
|
|
|
*
|
|
|
|
|
* ...instrs... ...instrs... ...instrs...
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
loop_unroll_visitor::simple_unroll(ir_loop *ir, int iterations)
|
|
|
|
|
{
|
|
|
|
|
void *const mem_ctx = ralloc_parent(ir);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < iterations; i++) {
|
|
|
|
|
exec_list copy_list;
|
|
|
|
|
|
|
|
|
|
copy_list.make_empty();
|
|
|
|
|
clone_ir_list(mem_ctx, ©_list, &ir->body_instructions);
|
|
|
|
|
|
|
|
|
|
ir->insert_before(©_list);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* The loop has been replaced by the unrolled copies. Remove the original
|
|
|
|
|
* loop from the IR sequence.
|
|
|
|
|
*/
|
|
|
|
|
ir->remove();
|
|
|
|
|
|
|
|
|
|
this->progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Unroll a loop whose last statement is an ir_if.  If \c
 * continue_from_then_branch is true, the loop is repeated only when the
 * "then" branch of the if is taken; otherwise it is repeated only when the
 * "else" branch of the if is taken.
 *
 * For example, if the input is:
 *
 *     (loop (...)
 *      ...body...
 *      (if (cond)
 *          (...then_instrs...)
 *          (...else_instrs...)))
 *
 * And the iteration count is 3, and \c continue_from_then_branch is true,
 * then the output will be:
 *
 *     ...body...
 *     (if (cond)
 *         (...then_instrs...
 *          ...body...
 *          (if (cond)
 *              (...then_instrs...
 *               ...body...
 *               (if (cond)
 *                   (...then_instrs...)
 *                   (...else_instrs...)))
 *              (...else_instrs...)))
 *         (...else_instrs))
 */
void
loop_unroll_visitor::complex_unroll(ir_loop *ir, int iterations,
                                    bool continue_from_then_branch)
{
   void *const mem_ctx = ralloc_parent(ir);
   /* Marks the position where the next body copy is spliced in.  It starts
    * as the loop itself; after each iteration it is a placeholder jump
    * nested inside the continuing branch of the copied ir_if.
    */
   ir_instruction *ir_to_replace = ir;

   for (int i = 0; i < iterations; i++) {
      exec_list copy_list;

      copy_list.make_empty();
      clone_ir_list(mem_ctx, &copy_list, &ir->body_instructions);

      /* The caller guarantees the loop body ends in an ir_if; the cloned
       * tail is the if whose continuing branch receives the next copy.
       */
      ir_if *ir_if = ((ir_instruction *) copy_list.get_tail())->as_if();
      assert(ir_if != NULL);

      /* Splice this body copy in at the current position and discard the
       * marker that occupied it (the loop on the first pass, a placeholder
       * jump on later passes).
       */
      ir_to_replace->insert_before(&copy_list);
      ir_to_replace->remove();

      /* placeholder that will be removed in the next iteration */
      ir_to_replace =
         new(mem_ctx) ir_loop_jump(ir_loop_jump::jump_continue);

      /* The next iteration's body goes in whichever branch would have
       * continued the loop.
       */
      exec_list *const list = (continue_from_then_branch)
         ? &ir_if->then_instructions : &ir_if->else_instructions;

      list->push_tail(ir_to_replace);
   }

   /* Drop the final placeholder: the innermost if branch needs no
    * continuation after the last unrolled iteration.
    */
   ir_to_replace->remove();

   this->progress = true;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Move all of the instructions which follow \c ir_if to the end of
|
|
|
|
|
* \c splice_dest.
|
|
|
|
|
*
|
|
|
|
|
* For example, in the code snippet:
|
|
|
|
|
*
|
|
|
|
|
* (if (cond)
|
|
|
|
|
* (...then_instructions...
|
|
|
|
|
* break)
|
|
|
|
|
* (...else_instructions...))
|
|
|
|
|
* ...post_if_instructions...
|
|
|
|
|
*
|
|
|
|
|
* If \c ir_if points to the "if" instruction, and \c splice_dest points to
|
|
|
|
|
* (...else_instructions...), the code snippet is transformed into:
|
|
|
|
|
*
|
|
|
|
|
* (if (cond)
|
|
|
|
|
* (...then_instructions...
|
|
|
|
|
* break)
|
|
|
|
|
* (...else_instructions...
|
|
|
|
|
* ...post_if_instructions...))
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
loop_unroll_visitor::splice_post_if_instructions(ir_if *ir_if,
|
|
|
|
|
exec_list *splice_dest)
|
|
|
|
|
{
|
|
|
|
|
while (!ir_if->get_next()->is_tail_sentinel()) {
|
|
|
|
|
ir_instruction *move_ir = (ir_instruction *) ir_if->get_next();
|
|
|
|
|
|
|
|
|
|
move_ir->remove();
|
|
|
|
|
splice_dest->push_tail(move_ir);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2010-08-27 13:59:49 -07:00
|
|
|
ir_visitor_status
|
|
|
|
|
loop_unroll_visitor::visit_leave(ir_loop *ir)
|
|
|
|
|
{
|
|
|
|
|
loop_variable_state *const ls = this->state->get(ir);
|
2010-09-07 17:03:43 +02:00
|
|
|
int iterations;
|
2010-08-27 13:59:49 -07:00
|
|
|
|
|
|
|
|
/* If we've entered a loop that hasn't been analyzed, something really,
|
|
|
|
|
* really bad has happened.
|
|
|
|
|
*/
|
|
|
|
|
if (ls == NULL) {
|
|
|
|
|
assert(ls != NULL);
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Don't try to unroll loops where the number of iterations is not known
|
|
|
|
|
* at compile-time.
|
|
|
|
|
*/
|
glsl/loops: Stop creating normatively bound loops in loop_controls.
Previously, when loop_controls analyzed a loop and found that it had a
fixed bound (known at compile time), it would remove all of the loop
terminators and instead set the loop's normative_bound field to force
the loop to execute the correct number of times.
This made loop unrolling easy, but it had a serious disadvantage.
Since most GPU's don't have a native mechanism for executing a loop a
fixed number of times, in order to implement the normative bound, the
back-ends would have to synthesize a new loop induction variable. As
a result, many loops wound up having two induction variables instead
of one. This caused extra register pressure and unnecessary
instructions.
This patch modifies loop_controls so that it doesn't set the loop's
normative_bound anymore. Instead it leaves one of the terminators in
the loop (the limiting terminator), so the back-end doesn't have to go
to any extra work to ensure the loop terminates at the right time.
This complicates loop unrolling slightly: when deciding whether a loop
can be unrolled, we have to account for the presence of the limiting
terminator. And when we do unroll the loop, we have to remove the
limiting terminator first.
For an example of how this results in more efficient back end code,
consider the loop:
for (int i = 0; i < 100; i++) {
total += i;
}
Previous to this patch, on i965, this loop would compile down to this
(vec4) native code:
mov(8) g4<1>.xD 0D
mov(8) g8<1>.xD 0D
loop:
cmp.ge.f0(8) null g8<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g8<1>.xD g8<4;4,1>.xD 1D
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
(notice that both g8 and g4 are loop induction variables; one is used
to terminate the loop, and the other is used to accumulate the total).
After this patch, the same loop compiles to:
mov(8) g4<1>.xD 0D
loop:
cmp.ge.f0(8) null g4<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2013-11-29 00:16:43 -08:00
|
|
|
if (ls->limiting_terminator == NULL)
|
2010-08-27 13:59:49 -07:00
|
|
|
return visit_continue;
|
|
|
|
|
|
glsl/loops: Stop creating normatively bound loops in loop_controls.
Previously, when loop_controls analyzed a loop and found that it had a
fixed bound (known at compile time), it would remove all of the loop
terminators and instead set the loop's normative_bound field to force
the loop to execute the correct number of times.
This made loop unrolling easy, but it had a serious disadvantage.
Since most GPU's don't have a native mechanism for executing a loop a
fixed number of times, in order to implement the normative bound, the
back-ends would have to synthesize a new loop induction variable. As
a result, many loops wound up having two induction variables instead
of one. This caused extra register pressure and unnecessary
instructions.
This patch modifies loop_controls so that it doesn't set the loop's
normative_bound anymore. Instead it leaves one of the terminators in
the loop (the limiting terminator), so the back-end doesn't have to go
to any extra work to ensure the loop terminates at the right time.
This complicates loop unrolling slightly: when deciding whether a loop
can be unrolled, we have to account for the presence of the limiting
terminator. And when we do unroll the loop, we have to remove the
limiting terminator first.
For an example of how this results in more efficient back end code,
consider the loop:
for (int i = 0; i < 100; i++) {
total += i;
}
Previous to this patch, on i965, this loop would compile down to this
(vec4) native code:
mov(8) g4<1>.xD 0D
mov(8) g8<1>.xD 0D
loop:
cmp.ge.f0(8) null g8<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g8<1>.xD g8<4;4,1>.xD 1D
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
(notice that both g8 and g4 are loop induction variables; one is used
to terminate the loop, and the other is used to accumulate the total).
After this patch, the same loop compiles to:
mov(8) g4<1>.xD 0D
loop:
cmp.ge.f0(8) null g4<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2013-11-29 00:16:43 -08:00
|
|
|
iterations = ls->limiting_terminator->iterations;
|
2013-11-29 00:11:12 -08:00
|
|
|
|
2014-04-08 19:58:36 -07:00
|
|
|
const int max_iterations = options->MaxUnrollIterations;
|
|
|
|
|
|
2010-08-27 13:59:49 -07:00
|
|
|
/* Don't try to unroll loops that have zillions of iterations either.
|
|
|
|
|
*/
|
2014-04-08 19:58:36 -07:00
|
|
|
if (iterations > max_iterations)
|
2010-08-27 13:59:49 -07:00
|
|
|
return visit_continue;
|
|
|
|
|
|
2012-01-25 17:35:01 +01:00
|
|
|
/* Don't try to unroll nested loops and loops with a huge body.
|
|
|
|
|
*/
|
glsl: Ignore loop-too-large heuristic if there's bad variable indexing.
Many shaders use a pattern such as:
for (int i = 0; i < NUM_LIGHTS; i++) {
...access a uniform array, or shader input/output array...
}
where NUM_LIGHTS is a small constant (such as 2, 4, or 8).
The expectation is that the compiler will unroll those loops, turning
the array access into constant indexing, which is more efficient, and
which may enable array splitting and other optimizations.
In many cases, our heuristic fails - either there's another tiny nested
loop inside, or the estimated number of instructions is just barely
beyond the threshold. So, we fail to unroll the loop, leaving the
variable indexing in place.
Drivers which don't support the particular flavor of variable indexing
will call lower_variable_index_to_cond_assign(), which generates piles
and piles of immensely inefficient code. We'd like to avoid generating
that.
This patch detects unsupported forms of variable-indexing in loops, where
the array index is a loop induction variable. In that case, it bypasses
the loop-too-large heuristic and forces unrolling.
Improves performance in various microbenchmarks: Gl32PSBump8 by 47%,
Gl32ShMapVsm by 80%, and Gl32ShMapPcf by 27%. No changes in shader-db.
v2: Check ir->array for being an array or matrix, rather than the
ir_dereference_array itself.
v3: Fix and expand statistics in commit message.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-04-08 18:09:43 -07:00
|
|
|
loop_unroll_count count(&ir->body_instructions, ls, options);
|
|
|
|
|
|
|
|
|
|
bool loop_too_large =
|
|
|
|
|
count.nested_loop || count.nodes * iterations > max_iterations * 5;
|
2012-02-21 13:37:49 -08:00
|
|
|
|
glsl: Skip loop-too-large heuristic if indexing arrays of a certain size
A pattern in certain shaders is:
uniform vec4 colors[NUM_LIGHTS];
for (int i = 0; i < NUM_LIGHTS; i++) {
...use colors[i]...
}
In this case, the application author expects the shader compiler to
unroll the loop. By doing so, it replaces variable indexing of the
array with constant indexing, which is more efficient.
This patch extends the heuristic to see if arrays accessed within the
loop are indexed by an induction variable, and if the array size exactly
matches the number of loop iterations. If so, the application author
probably intended us to unroll it. If not, we rely on the existing
loop-too-large heuristic.
Improves performance in a phong shading microbenchmark by 2.88x, and a
shadow mapping microbenchmark by 1.63x. Without variable indexing, we
can upload the small uniform arrays as push constants instead of pull
constants, avoiding shader memory access. Affects several games, but
doesn't appear to impact their performance.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Acked-by: Kristian Høgsberg <krh@bitplanet.net>
2014-10-29 20:56:07 -07:00
|
|
|
if (loop_too_large && !count.unsupported_variable_indexing &&
|
|
|
|
|
!count.array_indexed_by_induction_var_with_exact_iterations)
|
2012-02-21 13:37:49 -08:00
|
|
|
return visit_continue;
|
2012-01-25 17:35:01 +01:00
|
|
|
|
glsl/loops: Stop creating normatively bound loops in loop_controls.
Previously, when loop_controls analyzed a loop and found that it had a
fixed bound (known at compile time), it would remove all of the loop
terminators and instead set the loop's normative_bound field to force
the loop to execute the correct number of times.
This made loop unrolling easy, but it had a serious disadvantage.
Since most GPU's don't have a native mechanism for executing a loop a
fixed number of times, in order to implement the normative bound, the
back-ends would have to synthesize a new loop induction variable. As
a result, many loops wound up having two induction variables instead
of one. This caused extra register pressure and unnecessary
instructions.
This patch modifies loop_controls so that it doesn't set the loop's
normative_bound anymore. Instead it leaves one of the terminators in
the loop (the limiting terminator), so the back-end doesn't have to go
to any extra work to ensure the loop terminates at the right time.
This complicates loop unrolling slightly: when deciding whether a loop
can be unrolled, we have to account for the presence of the limiting
terminator. And when we do unroll the loop, we have to remove the
limiting terminator first.
For an example of how this results in more efficient back end code,
consider the loop:
for (int i = 0; i < 100; i++) {
total += i;
}
Previous to this patch, on i965, this loop would compile down to this
(vec4) native code:
mov(8) g4<1>.xD 0D
mov(8) g8<1>.xD 0D
loop:
cmp.ge.f0(8) null g8<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g8<1>.xD g8<4;4,1>.xD 1D
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
(notice that both g8 and g4 are loop induction variables; one is used
to terminate the loop, and the other is used to accumulate the total).
After this patch, the same loop compiles to:
mov(8) g4<1>.xD 0D
loop:
cmp.ge.f0(8) null g4<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2013-11-29 00:16:43 -08:00
|
|
|
/* Note: the limiting terminator contributes 1 to ls->num_loop_jumps.
|
|
|
|
|
* We'll be removing the limiting terminator before we unroll.
|
|
|
|
|
*/
|
|
|
|
|
assert(ls->num_loop_jumps > 0);
|
|
|
|
|
unsigned predicted_num_loop_jumps = ls->num_loop_jumps - 1;
|
|
|
|
|
|
|
|
|
|
if (predicted_num_loop_jumps > 1)
|
2010-08-27 13:59:49 -07:00
|
|
|
return visit_continue;
|
2010-09-07 17:03:43 +02:00
|
|
|
|
glsl/loops: Stop creating normatively bound loops in loop_controls.
Previously, when loop_controls analyzed a loop and found that it had a
fixed bound (known at compile time), it would remove all of the loop
terminators and instead set the loop's normative_bound field to force
the loop to execute the correct number of times.
This made loop unrolling easy, but it had a serious disadvantage.
Since most GPU's don't have a native mechanism for executing a loop a
fixed number of times, in order to implement the normative bound, the
back-ends would have to synthesize a new loop induction variable. As
a result, many loops wound up having two induction variables instead
of one. This caused extra register pressure and unnecessary
instructions.
This patch modifies loop_controls so that it doesn't set the loop's
normative_bound anymore. Instead it leaves one of the terminators in
the loop (the limiting terminator), so the back-end doesn't have to go
to any extra work to ensure the loop terminates at the right time.
This complicates loop unrolling slightly: when deciding whether a loop
can be unrolled, we have to account for the presence of the limiting
terminator. And when we do unroll the loop, we have to remove the
limiting terminator first.
For an example of how this results in more efficient back end code,
consider the loop:
for (int i = 0; i < 100; i++) {
total += i;
}
Previous to this patch, on i965, this loop would compile down to this
(vec4) native code:
mov(8) g4<1>.xD 0D
mov(8) g8<1>.xD 0D
loop:
cmp.ge.f0(8) null g8<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g8<1>.xD g8<4;4,1>.xD 1D
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
(notice that both g8 and g4 are loop induction variables; one is used
to terminate the loop, and the other is used to accumulate the total).
After this patch, the same loop compiles to:
mov(8) g4<1>.xD 0D
loop:
cmp.ge.f0(8) null g4<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2013-11-29 00:16:43 -08:00
|
|
|
if (predicted_num_loop_jumps == 0) {
|
|
|
|
|
ls->limiting_terminator->ir->remove();
|
2013-11-28 22:12:08 -08:00
|
|
|
simple_unroll(ir, iterations);
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
2010-09-07 17:03:43 +02:00
|
|
|
|
2013-11-28 22:12:08 -08:00
|
|
|
ir_instruction *last_ir = (ir_instruction *) ir->body_instructions.get_tail();
|
|
|
|
|
assert(last_ir != NULL);
|
2010-09-07 17:03:43 +02:00
|
|
|
|
2013-11-28 22:12:08 -08:00
|
|
|
if (is_break(last_ir)) {
|
|
|
|
|
/* If the only loop-jump is a break at the end of the loop, the loop
|
|
|
|
|
* will execute exactly once. Remove the break and use the simple
|
|
|
|
|
* unroller with an iteration count of 1.
|
|
|
|
|
*/
|
|
|
|
|
last_ir->remove();
|
2010-09-07 17:03:43 +02:00
|
|
|
|
glsl/loops: Stop creating normatively bound loops in loop_controls.
Previously, when loop_controls analyzed a loop and found that it had a
fixed bound (known at compile time), it would remove all of the loop
terminators and instead set the loop's normative_bound field to force
the loop to execute the correct number of times.
This made loop unrolling easy, but it had a serious disadvantage.
Since most GPU's don't have a native mechanism for executing a loop a
fixed number of times, in order to implement the normative bound, the
back-ends would have to synthesize a new loop induction variable. As
a result, many loops wound up having two induction variables instead
of one. This caused extra register pressure and unnecessary
instructions.
This patch modifies loop_controls so that it doesn't set the loop's
normative_bound anymore. Instead it leaves one of the terminators in
the loop (the limiting terminator), so the back-end doesn't have to go
to any extra work to ensure the loop terminates at the right time.
This complicates loop unrolling slightly: when deciding whether a loop
can be unrolled, we have to account for the presence of the limiting
terminator. And when we do unroll the loop, we have to remove the
limiting terminator first.
For an example of how this results in more efficient back end code,
consider the loop:
for (int i = 0; i < 100; i++) {
total += i;
}
Previous to this patch, on i965, this loop would compile down to this
(vec4) native code:
mov(8) g4<1>.xD 0D
mov(8) g8<1>.xD 0D
loop:
cmp.ge.f0(8) null g8<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g8<1>.xD g8<4;4,1>.xD 1D
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
(notice that both g8 and g4 are loop induction variables; one is used
to terminate the loop, and the other is used to accumulate the total).
After this patch, the same loop compiles to:
mov(8) g4<1>.xD 0D
loop:
cmp.ge.f0(8) null g4<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2013-11-29 00:16:43 -08:00
|
|
|
ls->limiting_terminator->ir->remove();
|
2013-11-28 22:12:08 -08:00
|
|
|
simple_unroll(ir, 1);
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
2010-09-07 17:03:43 +02:00
|
|
|
|
2014-06-24 21:34:05 -07:00
|
|
|
/* recognize loops in the form produced by ir_lower_jumps */
|
|
|
|
|
foreach_in_list(ir_instruction, cur_ir, &ir->body_instructions) {
|
glsl/loops: Stop creating normatively bound loops in loop_controls.
Previously, when loop_controls analyzed a loop and found that it had a
fixed bound (known at compile time), it would remove all of the loop
terminators and instead set the loop's normative_bound field to force
the loop to execute the correct number of times.
This made loop unrolling easy, but it had a serious disadvantage.
Since most GPU's don't have a native mechanism for executing a loop a
fixed number of times, in order to implement the normative bound, the
back-ends would have to synthesize a new loop induction variable. As
a result, many loops wound up having two induction variables instead
of one. This caused extra register pressure and unnecessary
instructions.
This patch modifies loop_controls so that it doesn't set the loop's
normative_bound anymore. Instead it leaves one of the terminators in
the loop (the limiting terminator), so the back-end doesn't have to go
to any extra work to ensure the loop terminates at the right time.
This complicates loop unrolling slightly: when deciding whether a loop
can be unrolled, we have to account for the presence of the limiting
terminator. And when we do unroll the loop, we have to remove the
limiting terminator first.
For an example of how this results in more efficient back end code,
consider the loop:
for (int i = 0; i < 100; i++) {
total += i;
}
Previous to this patch, on i965, this loop would compile down to this
(vec4) native code:
mov(8) g4<1>.xD 0D
mov(8) g8<1>.xD 0D
loop:
cmp.ge.f0(8) null g8<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g8<1>.xD g8<4;4,1>.xD 1D
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
(notice that both g8 and g4 are loop induction variables; one is used
to terminate the loop, and the other is used to accumulate the total).
After this patch, the same loop compiles to:
mov(8) g4<1>.xD 0D
loop:
cmp.ge.f0(8) null g4<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2013-11-29 00:16:43 -08:00
|
|
|
/* Skip the limiting terminator, since it will go away when we
|
|
|
|
|
* unroll.
|
|
|
|
|
*/
|
|
|
|
|
if (cur_ir == ls->limiting_terminator->ir)
|
|
|
|
|
continue;
|
|
|
|
|
|
2013-11-28 22:12:08 -08:00
|
|
|
ir_if *ir_if = cur_ir->as_if();
|
|
|
|
|
if (ir_if != NULL) {
|
|
|
|
|
/* Determine which if-statement branch, if any, ends with a
|
|
|
|
|
* break. The branch that did *not* have the break will get a
|
|
|
|
|
* temporary continue inserted in each iteration of the loop
|
|
|
|
|
* unroll.
|
|
|
|
|
*
|
|
|
|
|
* Note that since ls->num_loop_jumps is <= 1, it is impossible
|
|
|
|
|
* for both branches to end with a break.
|
|
|
|
|
*/
|
|
|
|
|
ir_instruction *ir_if_last =
|
|
|
|
|
(ir_instruction *) ir_if->then_instructions.get_tail();
|
2010-09-07 17:03:43 +02:00
|
|
|
|
2013-11-28 22:12:08 -08:00
|
|
|
if (is_break(ir_if_last)) {
|
glsl/loops: Stop creating normatively bound loops in loop_controls.
Previously, when loop_controls analyzed a loop and found that it had a
fixed bound (known at compile time), it would remove all of the loop
terminators and instead set the loop's normative_bound field to force
the loop to execute the correct number of times.
This made loop unrolling easy, but it had a serious disadvantage.
Since most GPU's don't have a native mechanism for executing a loop a
fixed number of times, in order to implement the normative bound, the
back-ends would have to synthesize a new loop induction variable. As
a result, many loops wound up having two induction variables instead
of one. This caused extra register pressure and unnecessary
instructions.
This patch modifies loop_controls so that it doesn't set the loop's
normative_bound anymore. Instead it leaves one of the terminators in
the loop (the limiting terminator), so the back-end doesn't have to go
to any extra work to ensure the loop terminates at the right time.
This complicates loop unrolling slightly: when deciding whether a loop
can be unrolled, we have to account for the presence of the limiting
terminator. And when we do unroll the loop, we have to remove the
limiting terminator first.
For an example of how this results in more efficient back end code,
consider the loop:
for (int i = 0; i < 100; i++) {
total += i;
}
Previous to this patch, on i965, this loop would compile down to this
(vec4) native code:
mov(8) g4<1>.xD 0D
mov(8) g8<1>.xD 0D
loop:
cmp.ge.f0(8) null g8<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g8<1>.xD g8<4;4,1>.xD 1D
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
(notice that both g8 and g4 are loop induction variables; one is used
to terminate the loop, and the other is used to accumulate the total).
After this patch, the same loop compiles to:
mov(8) g4<1>.xD 0D
loop:
cmp.ge.f0(8) null g4<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2013-11-29 00:16:43 -08:00
|
|
|
ls->limiting_terminator->ir->remove();
|
2013-11-28 22:12:08 -08:00
|
|
|
splice_post_if_instructions(ir_if, &ir_if->else_instructions);
|
|
|
|
|
ir_if_last->remove();
|
|
|
|
|
complex_unroll(ir, iterations, false);
|
|
|
|
|
return visit_continue;
|
|
|
|
|
} else {
|
|
|
|
|
ir_if_last =
|
|
|
|
|
(ir_instruction *) ir_if->else_instructions.get_tail();
|
|
|
|
|
|
|
|
|
|
if (is_break(ir_if_last)) {
|
glsl/loops: Stop creating normatively bound loops in loop_controls.
Previously, when loop_controls analyzed a loop and found that it had a
fixed bound (known at compile time), it would remove all of the loop
terminators and instead set the loop's normative_bound field to force
the loop to execute the correct number of times.
This made loop unrolling easy, but it had a serious disadvantage.
Since most GPU's don't have a native mechanism for executing a loop a
fixed number of times, in order to implement the normative bound, the
back-ends would have to synthesize a new loop induction variable. As
a result, many loops wound up having two induction variables instead
of one. This caused extra register pressure and unnecessary
instructions.
This patch modifies loop_controls so that it doesn't set the loop's
normative_bound anymore. Instead it leaves one of the terminators in
the loop (the limiting terminator), so the back-end doesn't have to go
to any extra work to ensure the loop terminates at the right time.
This complicates loop unrolling slightly: when deciding whether a loop
can be unrolled, we have to account for the presence of the limiting
terminator. And when we do unroll the loop, we have to remove the
limiting terminator first.
For an example of how this results in more efficient back end code,
consider the loop:
for (int i = 0; i < 100; i++) {
total += i;
}
Previous to this patch, on i965, this loop would compile down to this
(vec4) native code:
mov(8) g4<1>.xD 0D
mov(8) g8<1>.xD 0D
loop:
cmp.ge.f0(8) null g8<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g8<1>.xD g8<4;4,1>.xD 1D
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
(notice that both g8 and g4 are loop induction variables; one is used
to terminate the loop, and the other is used to accumulate the total).
After this patch, the same loop compiles to:
mov(8) g4<1>.xD 0D
loop:
cmp.ge.f0(8) null g4<4;4,1>.xD 100D
(+f0) if(8)
break(8)
endif(8)
add(8) g5<1>.xD g5<4;4,1>.xD g4<4;4,1>.xD
add(8) g4<1>.xD g4<4;4,1>.xD 1D
while(8) loop
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2013-11-29 00:16:43 -08:00
|
|
|
ls->limiting_terminator->ir->remove();
|
2013-11-28 22:12:08 -08:00
|
|
|
splice_post_if_instructions(ir_if, &ir_if->then_instructions);
|
|
|
|
|
ir_if_last->remove();
|
|
|
|
|
complex_unroll(ir, iterations, true);
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
2010-09-07 17:03:43 +02:00
|
|
|
}
|
2010-12-01 15:12:07 -08:00
|
|
|
}
|
2010-09-07 17:03:43 +02:00
|
|
|
}
|
2010-08-27 13:59:49 -07:00
|
|
|
|
2013-11-28 22:12:08 -08:00
|
|
|
/* Did not find the break statement. It must be in a complex if-nesting,
|
|
|
|
|
* so don't try to unroll.
|
2010-08-27 13:59:49 -07:00
|
|
|
*/
|
|
|
|
|
return visit_continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool
|
2014-04-08 19:58:36 -07:00
|
|
|
unroll_loops(exec_list *instructions, loop_state *ls,
|
|
|
|
|
const struct gl_shader_compiler_options *options)
|
2010-08-27 13:59:49 -07:00
|
|
|
{
|
2014-04-08 19:58:36 -07:00
|
|
|
loop_unroll_visitor v(ls, options);
|
2010-08-27 13:59:49 -07:00
|
|
|
|
|
|
|
|
v.run(instructions);
|
|
|
|
|
|
|
|
|
|
return v.progress;
|
|
|
|
|
}
|