/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
 *
 */

#include <algorithm>
#include <math.h>

#include "aco_ir.h"
#include "util/half_float.h"
#include "util/memstream.h"
#include "util/u_math.h"

namespace aco {

#ifndef NDEBUG
void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr)
{
   if (cond) {
      char *out;
      size_t outsize;
      struct u_memstream mem;
      u_memstream_open(&mem, &out, &outsize);
      FILE *const memf = u_memstream_get(&mem);

      fprintf(memf, "%s: ", msg);
      aco_print_instr(instr, memf);
      u_memstream_close(&mem);

      aco_perfwarn(program, out);
      free(out);

      if (debug_flags & DEBUG_PERFWARN)
         exit(1);
   }
}
#endif

/**
 * The optimizer works in 4 phases:
 * (1) The first pass collects information for each ssa-def,
 *     propagates reg->reg operands of the same type, inline constants
 *     and neg/abs input modifiers.
 * (2) The second pass combines instructions like mad, omod and clamp, and
 *     propagates SGPRs on VALU instructions.
 *     This pass depends on information collected in the first pass.
 * (3) The third pass goes backwards, and selects instructions,
 *     i.e. decides if a mad instruction is profitable and eliminates dead code.
 * (4) The fourth pass cleans up the sequence: literals get applied and dead
 *     instructions are removed from the sequence.
 */
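
/* Illustrative walk-through (added for clarity; the temporaries are made up):
 * given
 *    v1: %t = v_mul_f32 %a, %b
 *    v1: %r = v_add_f32 %t, %c
 * pass (1) labels %t with label_mul, pass (2) combines the pair into
 *    v1: %r = v_mad_f32 %a, %b, %c
 * and records a mad_info entry, pass (3) decides from the use counts whether
 * keeping the mad is profitable, and pass (4) applies pending literals and
 * drops instructions whose results are no longer used. */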

struct mad_info {
   aco_ptr<Instruction> add_instr;
   uint32_t mul_temp_id;
   uint16_t literal_idx;
   bool check_literal;

   mad_info(aco_ptr<Instruction> instr, uint32_t id)
   : add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false) {}
};

enum Label {
   label_vec = 1 << 0,
   label_constant_32bit = 1 << 1,
   /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
    * 32-bit operations but this shouldn't cause any issues because we don't
    * look through any conversions */
   label_abs = 1 << 2,
   label_neg = 1 << 3,
   label_mul = 1 << 4,
   label_temp = 1 << 5,
   label_literal = 1 << 6,
   label_mad = 1 << 7,
   label_omod2 = 1 << 8,
   label_omod4 = 1 << 9,
   label_omod5 = 1 << 10,
   label_clamp = 1 << 12,
   label_undefined = 1 << 14,
   label_vcc = 1 << 15,
   label_b2f = 1 << 16,
   label_add_sub = 1 << 17,
   label_bitwise = 1 << 18,
   label_minmax = 1 << 19,
   label_vopc = 1 << 20,
   label_uniform_bool = 1 << 21,
   label_constant_64bit = 1 << 22,
   label_uniform_bitwise = 1 << 23,
   label_scc_invert = 1 << 24,
   label_vcc_hint = 1 << 25,
   label_scc_needed = 1 << 26,
   label_b2i = 1 << 27,
   label_constant_16bit = 1 << 29,
   label_usedef = 1 << 30, /* generic label */
};

static constexpr uint64_t instr_usedef_labels = label_vec | label_mul | label_mad | label_add_sub |
                                                label_bitwise | label_uniform_bitwise | label_minmax | label_vopc | label_usedef;
static constexpr uint64_t instr_mod_labels = label_omod2 | label_omod4 | label_omod5 | label_clamp;

static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels;
static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool |
                                        label_scc_invert | label_b2i;
static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal;

static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect");
static_assert((instr_labels & val_labels) == 0, "labels cannot intersect");
static_assert((temp_labels & val_labels) == 0, "labels cannot intersect");

struct ssa_info {
   uint64_t label;
   union {
      uint32_t val;
      Temp temp;
      Instruction* instr;
   };

   ssa_info() : label(0) {}

   void add_label(Label new_label)
   {
      /* Since all the instr_usedef_labels use instr for the same thing
       * (indicating the defining instruction), there is usually no need to
       * clear any other instr labels. */
      if (new_label & instr_usedef_labels)
         label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */

      if (new_label & instr_mod_labels) {
         label &= ~instr_labels;
         label &= ~(temp_labels | val_labels); /* instr, temp and val alias */
      }

      if (new_label & temp_labels) {
         label &= ~temp_labels;
         label &= ~(instr_labels | val_labels); /* instr, temp and val alias */
      }

      uint32_t const_labels = label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit;
      if (new_label & const_labels) {
         label &= ~val_labels | const_labels;
         label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
      } else if (new_label & val_labels) {
         label &= ~val_labels;
         label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
      }

      label |= new_label;
   }
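
   /* Note (added for clarity, not from the original source): because temp,
    * instr and val share the union above, add_label() drops labels from the
    * other groups. E.g. calling set_mul(mul_instr) and later set_temp(tmp) on
    * the same ssa_info clears label_mul, so is_mul() cannot return true while
    * instr holds a stale pointer. */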

   void set_vec(Instruction* vec)
   {
      add_label(label_vec);
      instr = vec;
   }

   bool is_vec()
   {
      return label & label_vec;
   }

   void set_constant(chip_class chip, uint64_t constant)
   {
      Operand op16((uint16_t)constant);
      Operand op32((uint32_t)constant);
      add_label(label_literal);
      val = constant;

      if (chip >= GFX8 && !op16.isLiteral())
         add_label(label_constant_16bit);

      if (!op32.isLiteral() || ((uint32_t)constant == 0x3e22f983 && chip >= GFX8))
         add_label(label_constant_32bit);

      if (constant <= 64) {
         add_label(label_constant_64bit);
      } else if (constant >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         add_label(label_constant_64bit);
      } else if (constant == 0x3FE0000000000000) { /* 0.5 */
         add_label(label_constant_64bit);
      } else if (constant == 0xBFE0000000000000) { /* -0.5 */
         add_label(label_constant_64bit);
      } else if (constant == 0x3FF0000000000000) { /* 1.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0xBFF0000000000000) { /* -1.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0x4000000000000000) { /* 2.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0xC000000000000000) { /* -2.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0x4010000000000000) { /* 4.0 */
         add_label(label_constant_64bit);
      } else if (constant == 0xC010000000000000) { /* -4.0 */
         add_label(label_constant_64bit);
      }

      if (label & label_constant_64bit) {
         val = Operand(constant).constantValue();
         if (val != constant)
            label &= ~(label_literal | label_constant_16bit | label_constant_32bit);
      }
   }
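
   /* Worked example (added for clarity, not from the original source):
    * set_constant(GFX9, 0x3f800000), the bit pattern of 1.0f, keeps
    * label_literal, adds label_constant_32bit (1.0f is a 32-bit inline
    * constant) and label_constant_16bit ((uint16_t)0x3f800000 == 0, which is
    * inline), but not label_constant_64bit, because the value matches none of
    * the 64-bit inline-constant patterns checked above. */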

   bool is_constant(unsigned bits)
   {
      switch (bits) {
      case 8:
         return label & label_literal;
      case 16:
         return label & label_constant_16bit;
      case 32:
         return label & label_constant_32bit;
      case 64:
         return label & label_constant_64bit;
      }
      return false;
   }

   bool is_literal(unsigned bits)
   {
      bool is_lit = label & label_literal;
      switch (bits) {
      case 8:
         return false;
      case 16:
         return is_lit && ~(label & label_constant_16bit);
      case 32:
         return is_lit && ~(label & label_constant_32bit);
      case 64:
         return false;
      }
      return false;
   }

   bool is_constant_or_literal(unsigned bits)
   {
      if (bits == 64)
         return label & label_constant_64bit;
      else
         return label & label_literal;
   }

   void set_abs(Temp abs_temp)
   {
      add_label(label_abs);
      temp = abs_temp;
   }

   bool is_abs()
   {
      return label & label_abs;
   }

   void set_neg(Temp neg_temp)
   {
      add_label(label_neg);
      temp = neg_temp;
   }

   bool is_neg()
   {
      return label & label_neg;
   }

   void set_neg_abs(Temp neg_abs_temp)
   {
      add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
      temp = neg_abs_temp;
   }

   void set_mul(Instruction* mul)
   {
      add_label(label_mul);
      instr = mul;
   }

   bool is_mul()
   {
      return label & label_mul;
   }

   void set_temp(Temp tmp)
   {
      add_label(label_temp);
      temp = tmp;
   }

   bool is_temp()
   {
      return label & label_temp;
   }

   void set_mad(Instruction* mad, uint32_t mad_info_idx)
   {
      add_label(label_mad);
      mad->pass_flags = mad_info_idx;
      instr = mad;
   }

   bool is_mad()
   {
      return label & label_mad;
   }

   void set_omod2(Instruction* mul)
   {
      add_label(label_omod2);
      instr = mul;
   }

   bool is_omod2()
   {
      return label & label_omod2;
   }

   void set_omod4(Instruction* mul)
   {
      add_label(label_omod4);
      instr = mul;
   }

   bool is_omod4()
   {
      return label & label_omod4;
   }

   void set_omod5(Instruction* mul)
   {
      add_label(label_omod5);
      instr = mul;
   }

   bool is_omod5()
   {
      return label & label_omod5;
   }

   void set_clamp(Instruction *med3)
   {
      add_label(label_clamp);
      instr = med3;
   }

   bool is_clamp()
   {
      return label & label_clamp;
   }

   void set_undefined()
   {
      add_label(label_undefined);
   }

   bool is_undefined()
   {
      return label & label_undefined;
   }

   void set_vcc(Temp vcc)
   {
      add_label(label_vcc);
      temp = vcc;
   }

   bool is_vcc()
   {
      return label & label_vcc;
   }

   void set_b2f(Temp val)
   {
      add_label(label_b2f);
      temp = val;
   }

   bool is_b2f()
   {
      return label & label_b2f;
   }

   void set_add_sub(Instruction *add_sub_instr)
   {
      add_label(label_add_sub);
      instr = add_sub_instr;
   }

   bool is_add_sub()
   {
      return label & label_add_sub;
   }

   void set_bitwise(Instruction *bitwise_instr)
   {
      add_label(label_bitwise);
      instr = bitwise_instr;
   }

   bool is_bitwise()
   {
      return label & label_bitwise;
   }

   void set_uniform_bitwise()
   {
      add_label(label_uniform_bitwise);
   }

   bool is_uniform_bitwise()
   {
      return label & label_uniform_bitwise;
   }

   void set_minmax(Instruction *minmax_instr)
   {
      add_label(label_minmax);
      instr = minmax_instr;
   }

   bool is_minmax()
   {
      return label & label_minmax;
   }

   void set_vopc(Instruction *vopc_instr)
   {
      add_label(label_vopc);
      instr = vopc_instr;
   }

   bool is_vopc()
   {
      return label & label_vopc;
   }

   void set_scc_needed()
   {
      add_label(label_scc_needed);
   }

   bool is_scc_needed()
   {
      return label & label_scc_needed;
   }

   void set_scc_invert(Temp scc_inv)
   {
      add_label(label_scc_invert);
      temp = scc_inv;
   }

   bool is_scc_invert()
   {
      return label & label_scc_invert;
   }

   void set_uniform_bool(Temp uniform_bool)
   {
      add_label(label_uniform_bool);
      temp = uniform_bool;
   }

   bool is_uniform_bool()
   {
      return label & label_uniform_bool;
   }

   void set_vcc_hint()
   {
      add_label(label_vcc_hint);
   }

   bool is_vcc_hint()
   {
      return label & label_vcc_hint;
   }

   void set_b2i(Temp val)
   {
      add_label(label_b2i);
      temp = val;
   }

   bool is_b2i()
   {
      return label & label_b2i;
   }

   void set_usedef(Instruction *label_instr)
   {
      add_label(label_usedef);
      instr = label_instr;
   }

   bool is_usedef()
   {
      return label & label_usedef;
   }
};

struct opt_ctx {
   Program* program;
   std::vector<aco_ptr<Instruction>> instructions;
   ssa_info* info;
   std::pair<uint32_t,Temp> last_literal;
   std::vector<mad_info> mad_infos;
   std::vector<uint16_t> uses;
};

struct CmpInfo {
   aco_opcode ordered;
   aco_opcode unordered;
   aco_opcode ordered_swapped;
   aco_opcode unordered_swapped;
   aco_opcode inverse;
   aco_opcode f32;
   unsigned size;
};

ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info);

bool can_swap_operands(aco_ptr<Instruction>& instr)
{
   if (instr->operands[0].isConstant() ||
       (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
      return false;

   switch (instr->opcode) {
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64:
      return true;
   case aco_opcode::v_sub_f16:
      instr->opcode = aco_opcode::v_subrev_f16;
      return true;
   case aco_opcode::v_sub_f32:
      instr->opcode = aco_opcode::v_subrev_f32;
      return true;
   case aco_opcode::v_sub_co_u32:
      instr->opcode = aco_opcode::v_subrev_co_u32;
      return true;
   case aco_opcode::v_sub_u16:
      instr->opcode = aco_opcode::v_subrev_u16;
      return true;
   case aco_opcode::v_sub_u32:
      instr->opcode = aco_opcode::v_subrev_u32;
      return true;
   default: {
      CmpInfo info;
      get_cmp_info(instr->opcode, &info);
      if (info.ordered == instr->opcode) {
         instr->opcode = info.ordered_swapped;
         return true;
      }
      if (info.unordered == instr->opcode) {
         instr->opcode = info.unordered_swapped;
         return true;
      }
      return false;
   }
   }
}
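
/* Note (added for clarity, not from the original source): commutative opcodes
 * simply return true and the caller performs the actual operand swap;
 * non-commutative ones are rewritten so the semantics survive the swap, e.g.
 * v_sub_f32 becomes v_subrev_f32. Comparisons are flipped via get_cmp_info()
 * to their *_swapped counterparts. */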

bool can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
{
   if (instr->isVOP3())
      return true;

   if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10)
      return false;

   if (instr->isDPP() || instr->isSDWA())
      return false;

   return instr->opcode != aco_opcode::v_madmk_f32 &&
          instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 &&
          instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 &&
          instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 &&
          instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readlane_b32 &&
          instr->opcode != aco_opcode::v_writelane_b32 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32;
}

bool can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA() && ctx.program->chip_class < GFX9)
      return false;
   return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
          instr->opcode != aco_opcode::v_writelane_b32 &&
          instr->opcode != aco_opcode::v_writelane_b32_e64;
}

void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
   if (instr->isVOP3())
      return;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = asVOP3(tmp->format);
   instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   for (unsigned i = 0; i < instr->definitions.size(); i++) {
      instr->definitions[i] = tmp->definitions[i];
      if (instr->definitions[i].isTemp()) {
         ssa_info& info = ctx.info[instr->definitions[i].tempId()];
         if (info.label & instr_usedef_labels && info.instr == tmp.get())
            info.instr = instr.get();
      }
   }
   /* we don't need to update any instr_mod_labels because they either haven't
    * been applied yet or this instruction isn't dead and so they've been ignored */
}

/* only covers special cases */
bool alu_can_accept_constant(aco_opcode opcode, unsigned operand)
{
   switch (opcode) {
   case aco_opcode::v_interp_p2_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_writelane_b32:
   case aco_opcode::v_writelane_b32_e64:
   case aco_opcode::v_cndmask_b32:
      return operand != 2;
   case aco_opcode::s_addk_i32:
   case aco_opcode::s_mulk_i32:
   case aco_opcode::p_wqm:
   case aco_opcode::p_extract_vector:
   case aco_opcode::p_split_vector:
   case aco_opcode::v_readlane_b32:
   case aco_opcode::v_readlane_b32_e64:
   case aco_opcode::v_readfirstlane_b32:
      return operand != 0;
   default:
      return true;
   }
}

bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
{
   if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
       instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64)
      return operand != 1;
   return true;
}

/* check constant bus and literal limitations */
bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands)
{
   int limit = ctx.program->chip_class >= GFX10 ? 2 : 1;
   Operand literal32(s1);
   Operand literal64(s2);
   unsigned num_sgprs = 0;
   unsigned sgpr[] = {0, 0};

   for (unsigned i = 0; i < num_operands; i++) {
      Operand op = operands[i];

      if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
         /* two reads of the same SGPR count as 1 to the limit */
         if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
            if (num_sgprs < 2)
               sgpr[num_sgprs++] = op.tempId();
            limit--;
            if (limit < 0)
               return false;
         }
      } else if (op.isLiteral()) {
         if (ctx.program->chip_class < GFX10)
            return false;

         if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
            return false;
         if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
            return false;

         /* Any number of 32-bit literals counts as only 1 to the limit. Same
          * (but separately) for 64-bit literals. */
         if (op.size() == 1 && literal32.isUndefined()) {
            limit--;
            literal32 = op;
         } else if (op.size() == 2 && literal64.isUndefined()) {
            limit--;
            literal64 = op;
         }

         if (limit < 0)
            return false;
      }
   }

   return true;
}
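
/* Note (added for clarity, not from the original source): with the GFX10
 * limit of 2, operands {sgpr %a, sgpr %a, sgpr %b} pass because the repeated
 * %a counts only once, while {sgpr %a, sgpr %b, sgpr %c} fail. On GFX9 and
 * older the limit is 1 and literal operands are rejected outright. */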

bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow)
{
   Operand op = instr->operands[op_index];

   if (!op.isTemp())
      return false;
   Temp tmp = op.getTemp();
   if (!ctx.info[tmp.id()].is_add_sub())
      return false;

   Instruction *add_instr = ctx.info[tmp.id()].instr;

   switch (add_instr->opcode) {
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::s_add_i32:
   case aco_opcode::s_add_u32:
      break;
   default:
      return false;
   }
   if (prevent_overflow && !add_instr->definitions[0].isNUW())
      return false;

   if (add_instr->usesModifiers())
      return false;

   for (unsigned i = 0; i < 2; i++) {
      if (add_instr->operands[i].isConstant()) {
         *offset = add_instr->operands[i].constantValue();
      } else if (add_instr->operands[i].isTemp() &&
                 ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {
         *offset = ctx.info[add_instr->operands[i].tempId()].val;
      } else {
         continue;
      }
      if (!add_instr->operands[!i].isTemp())
         continue;

      uint32_t offset2 = 0;
      if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
         *offset += offset2;
      } else {
         *base = add_instr->operands[!i].getTemp();
      }
      return true;
   }

   return false;
}
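
/* Note (added for clarity; the temporaries are made up): for an address
 * defined as
 *    v1: %addr = v_add_u32 %base, 64
 * parse_base_offset() on the operand using %addr returns base=%base and
 * offset=64, folding chained constant additions recursively. The callers
 * below use this to move the constant part into MUBUF/DS/SMEM offset fields. */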

unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
{
   if (instr->format == Format::PSEUDO)
      return instr->operands[index].bytes() * 8u;
   else if (instr->opcode == aco_opcode::v_mad_u64_u32 || instr->opcode == aco_opcode::v_mad_i64_i32)
      return index == 2 ? 64 : 32;
   else if (instr->isVALU() || instr->isSALU())
      return instr_info.operand_size[(int)instr->opcode];
   else
      return 0;
}

Operand get_constant_op(opt_ctx &ctx, ssa_info info, uint32_t bits)
{
   if (bits == 8)
      return Operand((uint8_t)info.val);
   if (bits == 16)
      return Operand((uint16_t)info.val);
   // TODO: this function shouldn't be needed if we store Operand instead of value.
   Operand op(info.val, bits == 64);
   if (info.is_literal(32) && info.val == 0x3e22f983 && ctx.program->chip_class >= GFX8)
      op.setFixed(PhysReg{248}); /* 1/2 PI can be an inline constant on GFX8+ */
   return op;
}

bool fixed_to_exec(Operand op)
{
   return op.isFixed() && op.physReg() == exec;
}

void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
   if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) {
      ASSERTED bool all_const = false;
      for (Operand& op : instr->operands)
         all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));
      perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());

      ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 ||
                              instr->opcode == aco_opcode::s_mov_b64 ||
                              instr->opcode == aco_opcode::v_mov_b32;
      perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead", instr.get());
   }

   for (unsigned i = 0; i < instr->operands.size(); i++)
   {
      if (!instr->operands[i].isTemp())
         continue;

      ssa_info info = ctx.info[instr->operands[i].tempId()];
      /* propagate undef */
      if (info.is_undefined() && is_phi(instr))
         instr->operands[i] = Operand(instr->operands[i].regClass());
      /* propagate reg->reg of same type */
      if (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
         instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
         info = ctx.info[info.temp.id()];
      }

      /* SALU / PSEUDO: propagate inline constants */
      if (instr->isSALU() || instr->format == Format::PSEUDO) {
         bool is_subdword = false;
         // TODO: optimize SGPR propagation for subdword pseudo instructions on gfx9+
         if (instr->format == Format::PSEUDO) {
            is_subdword = std::any_of(instr->definitions.begin(), instr->definitions.end(),
                                      [] (const Definition& def) { return def.regClass().is_subdword();});
            is_subdword = is_subdword || std::any_of(instr->operands.begin(), instr->operands.end(),
                                                     [] (const Operand& op) { return op.bytes() % 4;});
            if (is_subdword && ctx.program->chip_class < GFX9)
               continue;
         }

         if (info.is_temp() && info.temp.type() == RegType::sgpr) {
            instr->operands[i].setTemp(info.temp);
            info = ctx.info[info.temp.id()];
         } else if (info.is_temp() && info.temp.type() == RegType::vgpr) {
            /* propagate vgpr if it can take it */
            switch (instr->opcode) {
            case aco_opcode::p_create_vector:
            case aco_opcode::p_split_vector:
            case aco_opcode::p_extract_vector:
            case aco_opcode::p_phi:
            case aco_opcode::p_parallelcopy: {
               const bool all_vgpr = std::none_of(instr->definitions.begin(), instr->definitions.end(),
                                                  [] (const Definition& def) { return def.getTemp().type() != RegType::vgpr;});
               if (all_vgpr) {
                  instr->operands[i] = Operand(info.temp);
                  info = ctx.info[info.temp.id()];
               }
               break;
            }
            default:
               break;
            }
         }
         unsigned bits = get_operand_size(instr, i);
         if ((info.is_constant(bits) || (info.is_literal(bits) && instr->format == Format::PSEUDO)) &&
             !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {
            instr->operands[i] = get_constant_op(ctx, info, bits);
            continue;
         }
      }

      /* VALU: propagate neg, abs & inline constants */
      else if (instr->isVALU()) {
         if (info.is_temp() && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) {
            instr->operands[i].setTemp(info.temp);
            info = ctx.info[info.temp.id()];
         }
         /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
         if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) && instr->operands.size() == 1) {
            instr->operands[i].setTemp(info.temp);
            info = ctx.info[info.temp.id()];
         }

         /* for instructions other than v_cndmask_b32, the size of the instruction should match the operand size */
         unsigned can_use_mod = instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
         can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode];

         if (instr->isSDWA())
            can_use_mod = can_use_mod && (static_cast<SDWA_instruction*>(instr.get())->sel[i] & sdwa_asuint) == sdwa_udword;
         else
            can_use_mod = can_use_mod && (instr->isDPP() || can_use_VOP3(ctx, instr));

         if (info.is_abs() && can_use_mod) {
            if (!instr->isDPP() && !instr->isSDWA())
               to_VOP3(ctx, instr);
            instr->operands[i] = Operand(info.temp);
            if (instr->isDPP())
               static_cast<DPP_instruction*>(instr.get())->abs[i] = true;
            else if (instr->isSDWA())
               static_cast<SDWA_instruction*>(instr.get())->abs[i] = true;
            else
               static_cast<VOP3A_instruction*>(instr.get())->abs[i] = true;
         }
         if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {
            instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
            instr->operands[i].setTemp(info.temp);
            continue;
         } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {
            instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
            instr->operands[i].setTemp(info.temp);
            continue;
         } else if (info.is_neg() && can_use_mod) {
            if (!instr->isDPP() && !instr->isSDWA())
               to_VOP3(ctx, instr);
            instr->operands[i].setTemp(info.temp);
            if (instr->isDPP())
               static_cast<DPP_instruction*>(instr.get())->neg[i] = true;
            else if (instr->isSDWA())
               static_cast<SDWA_instruction*>(instr.get())->neg[i] = true;
            else
               static_cast<VOP3A_instruction*>(instr.get())->neg[i] = true;
            continue;
         }
         unsigned bits = get_operand_size(instr, i);
         if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) &&
             (!instr->isSDWA() || ctx.program->chip_class >= GFX9)) {
            Operand op = get_constant_op(ctx, info, bits);
            perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
            if (i == 0 || instr->isSDWA() || instr->opcode == aco_opcode::v_readlane_b32 ||
                instr->opcode == aco_opcode::v_writelane_b32) {
               instr->operands[i] = op;
               continue;
            } else if (!instr->isVOP3() && can_swap_operands(instr)) {
               instr->operands[i] = instr->operands[0];
               instr->operands[0] = op;
               continue;
            } else if (can_use_VOP3(ctx, instr)) {
               to_VOP3(ctx, instr);
               instr->operands[i] = op;
               continue;
            }
         }
      }

      /* MUBUF: propagate constants and combine additions */
      else if (instr->format == Format::MUBUF) {
         MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get());
         Temp base;
         uint32_t offset;
         while (info.is_temp())
            info = ctx.info[info.temp.id()];

         /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
          * overflow for scratch accesses works only on GFX9+ and saddr overflow
          * never works. Since swizzling is the only thing that separates
          * scratch accesses and other accesses and swizzling changing how
          * addressing works significantly, this probably applies to swizzled
          * MUBUF accesses. */
         bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9;
         bool saddr_prevent_overflow = mubuf->swizzled;

         if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
            assert(!mubuf->idxen);
            instr->operands[1] = Operand(v1);
            mubuf->offset += info.val;
            mubuf->offen = false;
            continue;
         } else if (i == 2 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
            instr->operands[2] = Operand((uint32_t) 0);
            mubuf->offset += info.val;
            continue;
         } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) &&
                    base.regClass() == v1 && mubuf->offset + offset < 4096) {
            assert(!mubuf->idxen);
            instr->operands[1].setTemp(base);
            mubuf->offset += offset;
            continue;
         } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) &&
                    base.regClass() == s1 && mubuf->offset + offset < 4096) {
            instr->operands[i].setTemp(base);
            mubuf->offset += offset;
            continue;
         }
      }
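
      /* Note (added for clarity; the example is made up): if operand 1
       * (voffset) of an offen MUBUF load is defined as
       *    v1: %addr = v_add_u32 %base, 64
       * the branch above rewrites the operand to %base and folds the 64 into
       * mubuf->offset, provided the combined immediate stays below 4096. */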

      /* DS: combine additions */
      else if (instr->format == Format::DS) {

         DS_instruction *ds = static_cast<DS_instruction *>(instr.get());
         Temp base;
         uint32_t offset;
         bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
         if (has_usable_ds_offset &&
             i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
             base.regClass() == instr->operands[i].regClass() &&
             instr->opcode != aco_opcode::ds_swizzle_b32) {
            if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
                instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) {
               unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 0x7 : 0x3;
               unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 3 : 2;

               if ((offset & mask) == 0 &&
                   ds->offset0 + (offset >> shifts) <= 255 &&
                   ds->offset1 + (offset >> shifts) <= 255) {
                  instr->operands[i].setTemp(base);
                  ds->offset0 += offset >> shifts;
                  ds->offset1 += offset >> shifts;
               }
            } else {
               if (ds->offset0 + offset <= 65535) {
                  instr->operands[i].setTemp(base);
                  ds->offset0 += offset;
               }
            }
         }
      }

      /* SMEM: propagate constants and combine additions */
      else if (instr->format == Format::SMEM) {

         SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr.get());
         Temp base;
         uint32_t offset;
         bool prevent_overflow = smem->operands[0].size() > 2 || smem->prevent_overflow;
         if (i == 1 && info.is_constant_or_literal(32) &&
             ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) ||
              (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) ||
              (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
            instr->operands[i] = Operand(info.val);
            continue;
         } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
            bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
            if (soe &&
                (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) ||
                 ctx.info[smem->operands.back().tempId()].val != 0)) {
               continue;
            }
            if (soe) {
               smem->operands[1] = Operand(offset);
               smem->operands.back() = Operand(base);
            } else {
               SMEM_instruction *new_instr = create_instruction<SMEM_instruction>(smem->opcode, Format::SMEM, smem->operands.size() + 1, smem->definitions.size());
               new_instr->operands[0] = smem->operands[0];
               new_instr->operands[1] = Operand(offset);
               if (smem->definitions.empty())
                  new_instr->operands[2] = smem->operands[2];
               new_instr->operands.back() = Operand(base);
               if (!smem->definitions.empty())
                  new_instr->definitions[0] = smem->definitions[0];
               new_instr->sync = smem->sync;
               new_instr->glc = smem->glc;
               new_instr->dlc = smem->dlc;
               new_instr->nv = smem->nv;
               new_instr->disable_wqm = smem->disable_wqm;
               instr.reset(new_instr);
               smem = static_cast<SMEM_instruction *>(instr.get());
            }
            continue;
         }
      }

      else if (instr->format == Format::PSEUDO_BRANCH) {
         if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
            /* Flip the branch instruction to get rid of the scc_invert instruction */
            instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z;
            instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
         }
      }
   }

   /* if this instruction doesn't define anything, return */
   if (instr->definitions.empty())
      return;

   if ((uint16_t) instr->format & (uint16_t) Format::VOPC) {
      ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
      return;
   }

switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::p_create_vector: {
|
2020-05-06 17:24:38 +01:00
|
|
|
bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
|
|
|
|
|
instr->operands[0].regClass() == instr->definitions[0].regClass();
|
|
|
|
|
if (copy_prop) {
|
2020-04-21 17:37:44 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
2020-05-06 17:24:38 +01:00
|
|
|
break;
|
|
|
|
|
}
|
2020-04-21 17:37:44 +01:00
|
|
|
|
2020-09-18 11:52:35 +01:00
|
|
|
/* expand vector operands */
|
|
|
|
|
bool accept_subdword = instr->definitions[0].regClass().type() == RegType::vgpr &&
|
|
|
|
|
std::all_of(instr->operands.begin(), instr->operands.end(),
|
|
|
|
|
[&] (const Operand& op) { return !op.isLiteral() &&
|
|
|
|
|
(ctx.program->chip_class >= GFX9 || (op.hasRegClass() && op.regClass().type() == RegType::vgpr));});
|
|
|
|
|
|
|
|
|
|
std::vector<Operand> ops;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (const Operand& op : instr->operands) {
|
2020-09-18 11:52:35 +01:00
|
|
|
if (!op.isTemp() || !ctx.info[op.tempId()].is_vec()) {
|
|
|
|
|
ops.emplace_back(op);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
Instruction* vec = ctx.info[op.tempId()].instr;
|
|
|
|
|
bool is_subdword = std::any_of(vec->operands.begin(), vec->operands.end(),
|
|
|
|
|
[&] (const Operand& op) { return op.bytes() % 4; } );
|
|
|
|
|
|
|
|
|
|
if (accept_subdword || !is_subdword) {
|
|
|
|
|
for (const Operand& vec_op : vec->operands) {
|
|
|
|
|
ops.emplace_back(vec_op);
|
|
|
|
|
if (op.isLiteral() || (ctx.program->chip_class <= GFX8 &&
|
|
|
|
|
(!op.hasRegClass() || op.regClass().type() == RegType::sgpr)))
|
|
|
|
|
accept_subdword = false;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-09-18 11:52:35 +01:00
|
|
|
} else {
|
|
|
|
|
ops.emplace_back(op);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2020-04-24 11:58:17 +01:00
|
|
|
|
2020-09-18 11:52:35 +01:00
|
|
|
/* combine expanded operands to new vector */
|
|
|
|
|
if (ops.size() != instr->operands.size()) {
|
|
|
|
|
assert(ops.size() > instr->operands.size());
|
|
|
|
|
Definition def = instr->definitions[0];
|
|
|
|
|
instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, ops.size(), 1));
|
|
|
|
|
for (unsigned i = 0; i < ops.size(); i++) {
|
|
|
|
|
if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() &&
|
|
|
|
|
ctx.info[ops[i].tempId()].temp.type() == def.regClass().type())
|
|
|
|
|
ops[i].setTemp(ctx.info[ops[i].tempId()].temp);
|
|
|
|
|
instr->operands[i] = ops[i];
|
|
|
|
|
}
|
|
|
|
|
instr->definitions[0] = def;
|
|
|
|
|
} else {
|
|
|
|
|
for (unsigned i = 0; i < ops.size(); i++) {
|
|
|
|
|
assert(instr->operands[i] == ops[i]);
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-05-06 17:24:38 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::p_split_vector: {
|
2020-05-18 19:42:40 +01:00
|
|
|
ssa_info& info = ctx.info[instr->operands[0].tempId()];
|
|
|
|
|
|
|
|
|
|
if (info.is_constant_or_literal(32)) {
|
|
|
|
|
uint32_t val = info.val;
|
|
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);
|
|
|
|
|
ctx.info[def.tempId()].set_constant(ctx.program->chip_class, val & mask);
|
|
|
|
|
val >>= def.bytes() * 8u;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
} else if (!info.is_vec()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
2020-05-18 19:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
|
2020-04-10 13:09:54 +01:00
|
|
|
unsigned split_offset = 0;
|
|
|
|
|
unsigned vec_offset = 0;
|
|
|
|
|
unsigned vec_index = 0;
|
|
|
|
|
for (unsigned i = 0; i < instr->definitions.size(); split_offset += instr->definitions[i++].bytes()) {
|
|
|
|
|
while (vec_offset < split_offset && vec_index < vec->operands.size())
|
|
|
|
|
vec_offset += vec->operands[vec_index++].bytes();
|
|
|
|
|
|
|
|
|
|
if (vec_offset != split_offset || vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Operand vec_op = vec->operands[vec_index];
|
2019-09-17 13:22:17 +02:00
|
|
|
if (vec_op.isConstant()) {
|
2020-05-15 16:28:03 +01:00
|
|
|
ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class, vec_op.constantValue64());
|
2020-04-16 20:18:23 +01:00
|
|
|
} else if (vec_op.isUndefined()) {
|
|
|
|
|
ctx.info[instr->definitions[i].tempId()].set_undefined();
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
assert(vec_op.isTemp());
|
|
|
|
|
ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::p_extract_vector: { /* mov */
|
2020-05-18 19:42:40 +01:00
|
|
|
ssa_info& info = ctx.info[instr->operands[0].tempId()];
|
|
|
|
|
const unsigned index = instr->operands[1].constantValue();
|
|
|
|
|
const unsigned dst_offset = index * instr->definitions[0].bytes();
|
|
|
|
|
|
|
|
|
|
if (info.is_constant_or_literal(32)) {
|
|
|
|
|
uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, (info.val >> (dst_offset * 8u)) & mask);
|
|
|
|
|
break;
|
|
|
|
|
} else if (!info.is_vec()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
2020-05-18 19:42:40 +01:00
|
|
|
}
|
2020-04-10 11:52:13 +01:00
|
|
|
|
|
|
|
|
/* check if we index directly into a vector element */
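      /* Walk the operands of the originating p_create_vector and accumulate
       * byte offsets; the extract can only become a copy if some operand
       * starts exactly at dst_offset and has the same size as the definition,
       * e.g. roughly:
       *   v2: %v = p_create_vector %a, %b
       *   v1: %e = p_extract_vector %v, 1   ->   v1: %e = p_parallelcopy %b */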
|
2020-05-18 19:42:40 +01:00
|
|
|
Instruction* vec = info.instr;
|
2020-04-10 11:52:13 +01:00
|
|
|
unsigned offset = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-04-10 11:52:13 +01:00
|
|
|
for (const Operand& op : vec->operands) {
|
|
|
|
|
if (offset < dst_offset) {
|
|
|
|
|
offset += op.bytes();
|
|
|
|
|
continue;
|
|
|
|
|
} else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-11-13 11:14:51 +01:00
|
|
|
|
2020-04-10 11:52:13 +01:00
|
|
|
/* convert this extract into a copy instruction */
|
|
|
|
|
instr->opcode = aco_opcode::p_parallelcopy;
|
|
|
|
|
instr->operands.pop_back();
|
|
|
|
|
instr->operands[0] = op;
|
|
|
|
|
|
|
|
|
|
if (op.isConstant()) {
|
2020-05-15 16:28:03 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, op.constantValue64());
|
2020-04-16 20:18:23 +01:00
|
|
|
} else if (op.isUndefined()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_undefined();
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
2020-04-10 11:52:13 +01:00
|
|
|
assert(op.isTemp());
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(op.getTemp());
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-04-10 11:52:13 +01:00
|
|
|
break;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-10-15 15:18:40 +01:00
|
|
|
case aco_opcode::p_parallelcopy: /* propagate */
|
2020-10-15 14:49:34 +01:00
|
|
|
if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
|
|
|
|
|
instr->operands[0].regClass() != instr->definitions[0].regClass()) {
|
|
|
|
|
/* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so
|
|
|
|
|
* duplicate the vector instead.
|
|
|
|
|
*/
|
|
|
|
|
Instruction *vec = ctx.info[instr->operands[0].tempId()].instr;
|
|
|
|
|
aco_ptr<Instruction> old_copy = std::move(instr);
|
|
|
|
|
|
|
|
|
|
instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1));
|
|
|
|
|
instr->definitions[0] = old_copy->definitions[0];
|
|
|
|
|
std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin());
|
|
|
|
|
for (unsigned i = 0; i < vec->operands.size(); i++) {
|
|
|
|
|
Operand& op = instr->operands[i];
|
|
|
|
|
if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
|
|
|
|
|
ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
|
|
|
|
|
op.setTemp(ctx.info[op.tempId()].temp);
|
|
|
|
|
}
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
/* fallthrough */
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::p_as_uniform:
|
|
|
|
|
if (instr->definitions[0].isFixed()) {
|
|
|
|
|
/* don't copy-propagate copies into fixed registers */
|
2019-10-29 13:59:59 +00:00
|
|
|
} else if (instr->usesModifiers()) {
|
|
|
|
|
// TODO
|
2019-09-17 13:22:17 +02:00
|
|
|
} else if (instr->operands[0].isConstant()) {
|
2020-05-15 16:28:03 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, instr->operands[0].constantValue64());
|
2019-09-17 13:22:17 +02:00
|
|
|
} else if (instr->operands[0].isTemp()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
|
|
|
|
} else {
|
|
|
|
|
assert(instr->operands[0].isFixed());
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::p_is_helper:
|
|
|
|
|
if (!ctx.program->needs_wqm)
|
2020-05-15 16:28:03 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
2020-05-15 15:12:33 +01:00
|
|
|
case aco_opcode::v_mul_f16:
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::v_mul_f32: { /* omod */
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* TODO: try to move the negate/abs modifier to the consumer instead */
|
2019-10-29 13:59:59 +00:00
|
|
|
if (instr->usesModifiers())
|
|
|
|
|
break;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-05-15 15:12:33 +01:00
|
|
|
bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
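      /* A multiply by certain power-of-two constants can be folded into the
       * VOP3 output modifier of the instruction producing the other operand:
       *   x * 2.0 -> omod *2, x * 4.0 -> omod *4, x * 0.5 -> omod /2,
       * and x * 1.0 is a plain copy unless denormals must be flushed.
       * The constants below are the fp16/fp32 bit patterns of 2.0, 4.0,
       * 0.5 and 1.0; the label is recorded on the multiplied temp so the
       * producer can later apply the modifier and this multiply goes away. */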
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
|
2020-05-15 15:12:33 +01:00
|
|
|
if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
|
2020-05-15 15:12:33 +01:00
|
|
|
} else if (instr->operands[!i].constantValue() == (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
|
2020-05-15 15:12:33 +01:00
|
|
|
} else if (instr->operands[!i].constantValue() == (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
|
2020-05-15 15:12:33 +01:00
|
|
|
} else if (instr->operands[!i].constantValue() == (fp16 ? 0x3c00 : 0x3f800000) &&
|
|
|
|
|
!(fp16 ? block.fp_mode.must_flush_denorms16_64 : block.fp_mode.must_flush_denorms32)) { /* 1.0 */
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp());
|
|
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-05-15 15:12:33 +01:00
|
|
|
case aco_opcode::v_and_b32: { /* abs */
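      /* ANDing a float with 0x7fffffff (0x7fff for 16-bit) clears the sign
       * bit, i.e. computes |x|, so the result is labeled as abs of the source
       * and can later become a VOP3 input modifier on the consumer. The
       * v_xor_b32 case below does the same for negation by flipping the sign
       * bit with 0x80000000 / 0x8000. */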
|
|
|
|
|
if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
|
|
|
|
|
instr->operands[1].getTemp().type() == RegType::vgpr &&
|
|
|
|
|
((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x7FFFFFFFu)) ||
|
|
|
|
|
(instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x7FFFu))))
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_abs(instr->operands[1].getTemp());
|
|
|
|
|
else
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
|
|
|
|
|
break;
|
2020-05-15 15:12:33 +01:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::v_xor_b32: { /* neg */
|
2020-05-15 15:12:33 +01:00
|
|
|
if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
|
|
|
|
|
((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x80000000u)) ||
|
|
|
|
|
(instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x8000u)))) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.info[instr->operands[1].tempId()].is_neg()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
|
2019-12-13 16:59:54 +00:00
|
|
|
} else if (instr->operands[1].getTemp().type() == RegType::vgpr) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.info[instr->operands[1].tempId()].is_abs()) { /* neg(abs(x)) */
|
|
|
|
|
instr->operands[1].setTemp(ctx.info[instr->operands[1].tempId()].temp);
|
|
|
|
|
instr->opcode = aco_opcode::v_or_b32;
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_neg_abs(instr->operands[1].getTemp());
|
|
|
|
|
} else {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_neg(instr->operands[1].getTemp());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-05-15 15:12:33 +01:00
|
|
|
case aco_opcode::v_med3_f16:
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::v_med3_f32: { /* clamp */
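      /* med3(x, 0.0, 1.0) is exactly clamp(x) to [0, 1], so when one operand
       * is 0.0, one is 1.0 and the remaining one is a temp, the temp is
       * labeled so the clamp can later become the VOP3 clamp bit on its
       * producer. */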
|
|
|
|
|
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
|
2020-01-08 11:49:11 +01:00
|
|
|
if (vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
|
|
|
|
|
vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
|
|
|
|
|
vop3->omod != 0 || vop3->opsel != 0)
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
unsigned idx = 0;
|
|
|
|
|
bool found_zero = false, found_one = false;
|
2020-05-15 15:12:33 +01:00
|
|
|
bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 3; i++)
|
|
|
|
|
{
|
|
|
|
|
if (instr->operands[i].constantEquals(0))
|
|
|
|
|
found_zero = true;
|
2020-05-15 15:12:33 +01:00
|
|
|
else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
|
2019-09-17 13:22:17 +02:00
|
|
|
found_one = true;
|
|
|
|
|
else
|
|
|
|
|
idx = i;
|
|
|
|
|
}
|
2020-08-12 15:58:32 +01:00
|
|
|
if (found_zero && found_one && instr->operands[idx].isTemp())
|
|
|
|
|
ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_cndmask_b32:
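      /* v_cndmask_b32 dst, 0, src1, cond selects src1 per lane where cond is
       * set:
       *   src1 == -1   -> dst is the boolean mask itself (label vcc)
       *   src1 == 1.0  -> bool-to-float conversion (label b2f)
       *   src1 == 1    -> bool-to-int conversion (label b2i) */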
|
|
|
|
|
if (instr->operands[0].constantEquals(0) &&
|
2020-04-02 17:41:36 +02:00
|
|
|
instr->operands[1].constantEquals(0xFFFFFFFF))
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
|
|
|
|
|
else if (instr->operands[0].constantEquals(0) &&
|
2020-04-02 17:41:36 +02:00
|
|
|
instr->operands[1].constantEquals(0x3f800000u))
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
|
2020-04-02 17:41:36 +02:00
|
|
|
else if (instr->operands[0].constantEquals(0) &&
|
|
|
|
|
instr->operands[1].constantEquals(1))
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
|
2020-01-07 10:12:08 +01:00
|
|
|
|
|
|
|
|
ctx.info[instr->operands[2].tempId()].set_vcc_hint();
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
case aco_opcode::v_cmp_lg_u32:
|
|
|
|
|
if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
|
|
|
|
|
instr->operands[0].constantEquals(0) &&
|
|
|
|
|
instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_vcc())
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::p_phi:
|
|
|
|
|
case aco_opcode::p_linear_phi: {
|
|
|
|
|
/* lower_bool_phis() can create phis like this */
|
|
|
|
|
bool all_same_temp = instr->operands[0].isTemp();
|
|
|
|
|
/* this check is needed when moving uniform loop counters out of a divergent loop */
|
|
|
|
|
if (all_same_temp)
|
|
|
|
|
all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
|
|
|
|
|
for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
|
|
|
|
|
if (!instr->operands[i].isTemp() || instr->operands[i].tempId() != instr->operands[0].tempId())
|
|
|
|
|
all_same_temp = false;
|
|
|
|
|
}
|
|
|
|
|
if (all_same_temp) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
|
|
|
|
} else {
|
|
|
|
|
bool all_undef = instr->operands[0].isUndefined();
|
|
|
|
|
for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
|
|
|
|
|
if (!instr->operands[i].isUndefined())
|
|
|
|
|
all_undef = false;
|
|
|
|
|
}
|
|
|
|
|
if (all_undef)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_undefined();
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_add_u32:
|
|
|
|
|
case aco_opcode::v_add_co_u32:
|
2020-02-21 12:02:06 +00:00
|
|
|
case aco_opcode::v_add_co_u32_e64:
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::s_add_i32:
|
|
|
|
|
case aco_opcode::s_add_u32:
|
|
|
|
case aco_opcode::v_subbrev_co_u32:
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
|
|
|
|
|
break;
|
|
|
|
case aco_opcode::s_not_b32:
|
|
|
|
|
case aco_opcode::s_not_b64:
|
|
|
|
|
if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
|
|
|
|
|
ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].temp);
|
|
|
|
|
} else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
|
|
|
|
|
ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
|
|
|
|
|
}
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
|
|
|
|
|
break;
|
2019-11-22 11:57:45 +01:00
|
|
|
case aco_opcode::s_and_b32:
|
2019-11-05 11:41:00 +01:00
|
|
|
case aco_opcode::s_and_b64:
|
2020-01-28 12:04:48 +00:00
|
|
|
if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
|
2020-01-03 10:30:04 +01:00
|
|
|
if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
|
|
|
|
|
/* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a uniform bool into divergent */
|
|
|
|
|
ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].temp);
|
|
|
|
|
break;
|
|
|
|
|
} else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
|
|
|
|
|
/* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction already produces the same SCC */
|
|
|
|
|
ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
|
|
|
|
|
break;
|
2020-06-19 16:09:48 +01:00
|
|
|
} else if (ctx.info[instr->operands[0].tempId()].is_vopc()) {
|
|
|
|
|
Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr;
|
|
|
|
|
/* Remove superfluous s_and when the VOPC instruction uses the same exec and thus already produces the same result */
|
|
|
|
|
if (vopc_instr->pass_flags == instr->pass_flags) {
|
|
|
|
|
assert(instr->pass_flags > 0);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(vopc_instr->definitions[0].getTemp());
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-01-03 10:30:04 +01:00
|
|
|
}
|
2019-11-05 11:41:00 +01:00
|
|
|
}
|
|
|
|
|
/* fallthrough */
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::s_or_b32:
|
|
|
|
|
case aco_opcode::s_or_b64:
|
|
|
|
|
case aco_opcode::s_xor_b32:
|
|
|
|
|
case aco_opcode::s_xor_b64:
|
2020-01-03 10:30:04 +01:00
|
|
|
if (std::all_of(instr->operands.begin(), instr->operands.end(), [&ctx](const Operand& op) {
|
|
|
|
|
return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || ctx.info[op.tempId()].is_uniform_bitwise());
|
|
|
|
|
})) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
|
|
|
|
|
}
|
|
|
|
|
/* fallthrough */
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::s_lshl_b32:
|
|
|
|
|
case aco_opcode::v_or_b32:
|
|
|
|
|
case aco_opcode::v_lshlrev_b32:
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::v_min_f32:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_min_u32:
|
|
|
|
|
case aco_opcode::v_min_i32:
|
|
|
|
|
case aco_opcode::v_min_u16:
|
|
|
|
|
case aco_opcode::v_min_i16:
|
|
|
|
|
case aco_opcode::v_max_f32:
|
|
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_max_u32:
|
|
|
|
|
case aco_opcode::v_max_i32:
|
|
|
|
|
case aco_opcode::v_max_u16:
|
|
|
|
|
case aco_opcode::v_max_i16:
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
|
|
|
|
|
break;
|
2019-11-05 11:41:00 +01:00
|
|
|
case aco_opcode::s_cselect_b64:
|
2019-11-22 11:57:45 +01:00
|
|
|
case aco_opcode::s_cselect_b32:
|
2019-11-05 11:41:00 +01:00
|
|
|
if (instr->operands[0].constantEquals((unsigned) -1) &&
|
|
|
|
|
instr->operands[1].constantEquals(0)) {
|
|
|
|
|
/* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
|
|
|
|
|
}
|
|
|
|
if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
|
|
|
|
|
/* Flip the operands to get rid of the scc_invert instruction */
|
|
|
|
|
std::swap(instr->operands[0], instr->operands[1]);
|
|
|
|
|
instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::p_wqm:
|
|
|
|
|
if (instr->operands[0].isTemp() &&
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
|
|
|
|
}
|
2019-11-05 11:41:00 +01:00
|
|
|
break;
|
2019-09-17 13:22:17 +02:00
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2020-05-15 20:26:39 +01:00
|
|
|
info->ordered = aco_opcode::num_opcodes;
|
|
|
|
|
info->unordered = aco_opcode::num_opcodes;
|
2020-05-19 13:41:43 +01:00
|
|
|
info->ordered_swapped = aco_opcode::num_opcodes;
|
|
|
|
|
info->unordered_swapped = aco_opcode::num_opcodes;
|
2019-09-17 13:22:17 +02:00
|
|
|
switch (op) {
|
2020-05-19 13:41:43 +01:00
|
|
|
#define CMP2(ord, unord, ord_swap, unord_swap, sz) \
|
2020-05-15 20:26:39 +01:00
|
|
|
case aco_opcode::v_cmp_##ord##_f##sz:\
|
|
|
|
|
case aco_opcode::v_cmp_n##unord##_f##sz:\
|
|
|
|
|
info->ordered = aco_opcode::v_cmp_##ord##_f##sz;\
|
|
|
|
|
info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;\
|
2020-05-19 13:41:43 +01:00
|
|
|
info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;\
|
|
|
|
|
info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;\
|
2020-05-15 20:26:39 +01:00
|
|
|
info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz : aco_opcode::v_cmp_n##ord##_f##sz;\
|
|
|
|
|
info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 : aco_opcode::v_cmp_n##unord##_f32;\
|
|
|
|
|
info->size = sz;\
|
2019-09-17 13:22:17 +02:00
|
|
|
return true;
|
2020-05-19 13:41:43 +01:00
|
|
|
#define CMP(ord, unord, ord_swap, unord_swap) \
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 16)\
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 32)\
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 64)
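   /* Each CMP(ord, unord, ord_swap, unord_swap) row handles both the ordered
    * opcode and its unordered counterpart for f16/f32/f64; the *_swap entries
    * are the opcodes obtained when the two source operands are exchanged,
    * e.g. in the lt row, swapping operands turns v_cmp_lt into v_cmp_gt and
    * v_cmp_nge into v_cmp_nle. */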
|
|
|
|
|
CMP(lt, /*n*/ge, gt, /*n*/le)
|
|
|
|
|
CMP(eq, /*n*/lg, eq, /*n*/lg)
|
|
|
|
|
CMP(le, /*n*/gt, ge, /*n*/lt)
|
|
|
|
|
CMP(gt, /*n*/le, lt, /*n*/ge)
|
|
|
|
|
CMP(lg, /*n*/eq, lg, /*n*/eq)
|
|
|
|
|
CMP(ge, /*n*/lt, le, /*n*/gt)
|
2019-09-17 13:22:17 +02:00
|
|
|
#undef CMP
|
2020-05-15 20:26:39 +01:00
|
|
|
#undef CMP2
|
|
|
|
|
#define ORD_TEST(sz) \
|
|
|
|
|
case aco_opcode::v_cmp_u_f##sz:\
|
|
|
|
|
info->f32 = aco_opcode::v_cmp_u_f32;\
|
|
|
|
|
info->inverse = aco_opcode::v_cmp_o_f##sz;\
|
|
|
|
|
info->size = sz;\
|
|
|
|
|
return true;\
|
|
|
|
|
case aco_opcode::v_cmp_o_f##sz:\
|
|
|
|
|
info->f32 = aco_opcode::v_cmp_o_f32;\
|
|
|
|
|
info->inverse = aco_opcode::v_cmp_u_f##sz;\
|
|
|
|
|
info->size = sz;\
|
|
|
|
|
return true;
|
|
|
|
|
ORD_TEST(16)
|
|
|
|
|
ORD_TEST(32)
|
|
|
|
|
ORD_TEST(64)
|
|
|
|
|
#undef ORD_TEST
|
2019-09-17 13:22:17 +02:00
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aco_opcode get_ordered(aco_opcode op)
|
|
|
|
|
{
|
2020-05-15 20:26:39 +01:00
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aco_opcode get_unordered(aco_opcode op)
|
|
|
|
|
{
|
2020-05-15 20:26:39 +01:00
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aco_opcode get_inverse(aco_opcode op)
|
|
|
|
|
{
|
2020-05-15 20:26:39 +01:00
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aco_opcode get_f32_cmp(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned get_cmp_bitsize(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) ? info.size : 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool is_cmp(aco_opcode op)
|
|
|
|
|
{
|
2020-05-15 20:26:39 +01:00
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned original_temp_id(opt_ctx &ctx, Temp tmp)
|
|
|
|
|
{
|
|
|
|
|
if (ctx.info[tmp.id()].is_temp())
|
|
|
|
|
return ctx.info[tmp.id()].temp.id();
|
|
|
|
|
else
|
|
|
|
|
return tmp.id();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void decrease_uses(opt_ctx &ctx, Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (!--ctx.uses[instr->definitions[0].tempId()]) {
|
|
|
|
|
for (const Operand& op : instr->operands) {
|
|
|
|
|
if (op.isTemp())
|
|
|
|
|
ctx.uses[op.tempId()]--;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false)
|
|
|
|
|
{
|
2020-08-12 15:58:32 +01:00
|
|
|
if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels))
|
2019-09-17 13:22:17 +02:00
|
|
|
return nullptr;
|
|
|
|
|
if (!ignore_uses && ctx.uses[op.tempId()] > 1)
|
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
|
|
Instruction *instr = ctx.info[op.tempId()].instr;
|
|
|
|
|
|
|
|
|
|
if (instr->definitions.size() == 2) {
|
|
|
|
|
assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return nullptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return instr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
|
|
|
|
|
* s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
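/* neq(a, a) is true exactly when a is NaN, and eq(a, a) is true exactly when
 * a is not NaN, so OR-ing/AND-ing two such self-comparisons is the same as a
 * single unordered/ordered comparison of the two values, e.g. roughly:
 *   s2: %c = v_cmp_neq_f32 %a, %a
 *   s2: %d = v_cmp_neq_f32 %b, %b
 *   s2: %r = s_or_b64 %c, %d        ->    s2: %r = v_cmp_u_f32 %a, %b */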
|
|
|
|
|
bool combine_ordering_test(opt_ctx &ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2019-12-03 13:37:49 +00:00
|
|
|
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
bool neg[2] = {false, false};
|
|
|
|
|
bool abs[2] = {false, false};
|
2020-01-08 11:49:11 +01:00
|
|
|
uint8_t opsel = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction *op_instr[2];
|
|
|
|
|
Temp op[2];
|
|
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
unsigned bitsize = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
op_instr[i] = follow_operand(ctx, instr->operands[i], true);
|
|
|
|
|
if (!op_instr[i])
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
|
2020-05-15 20:26:39 +01:00
|
|
|
unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
|
|
|
|
|
return false;
|
|
|
|
|
if (bitsize && op_bitsize != bitsize)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (op_instr[i]->isVOP3()) {
|
|
|
|
|
VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(op_instr[i]);
|
2020-01-08 11:49:11 +01:00
|
|
|
if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
neg[i] = vop3->neg[0];
|
|
|
|
|
abs[i] = vop3->abs[0];
|
2020-01-08 11:49:11 +01:00
|
|
|
opsel |= (vop3->opsel & 1) << i;
|
2019-12-05 14:12:39 +00:00
|
|
|
} else if (op_instr[i]->isSDWA()) {
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Temp op0 = op_instr[i]->operands[0].getTemp();
|
|
|
|
|
Temp op1 = op_instr[i]->operands[1].getTemp();
|
|
|
|
|
if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
op[i] = op1;
|
2020-05-15 20:26:39 +01:00
|
|
|
bitsize = op_bitsize;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2019-11-20 16:31:43 +00:00
|
|
|
if (op[1].type() == RegType::sgpr)
|
|
|
|
|
std::swap(op[0], op[1]);
|
2019-11-20 16:42:17 +00:00
|
|
|
unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
|
|
|
|
|
if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1))
|
2019-11-20 16:31:43 +00:00
|
|
|
return false;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[op[0].id()]++;
|
|
|
|
|
ctx.uses[op[1].id()]++;
|
|
|
|
|
decrease_uses(ctx, op_instr[0]);
|
|
|
|
|
decrease_uses(ctx, op_instr[1]);
|
|
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
aco_opcode new_op = aco_opcode::num_opcodes;
|
|
|
|
|
switch (bitsize) {
|
|
|
|
|
case 16:
|
|
|
|
|
new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16;
|
|
|
|
|
break;
|
|
|
|
|
case 32:
|
|
|
|
|
new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
|
|
|
|
|
break;
|
|
|
|
|
case 64:
|
|
|
|
|
new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction *new_instr;
|
2019-11-20 16:42:17 +00:00
|
|
|
if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {
|
2019-09-17 13:22:17 +02:00
|
|
|
VOP3A_instruction *vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
vop3->neg[i] = neg[i];
|
|
|
|
|
vop3->abs[i] = abs[i];
|
|
|
|
|
}
|
2020-01-08 11:49:11 +01:00
|
|
|
vop3->opsel = opsel;
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr = static_cast<Instruction *>(vop3);
|
|
|
|
|
} else {
|
|
|
|
|
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
|
|
|
|
|
}
|
|
|
|
|
new_instr->operands[0] = Operand(op[0]);
|
|
|
|
|
new_instr->operands[1] = Operand(op[1]);
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
2020-06-19 16:09:48 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
|
|
|
|
|
* s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
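/* This is valid because the unordered variant of a comparison means "either
 * operand is NaN, or the ordered comparison holds", while the ordered variant
 * additionally requires both operands to be non-NaN; OR-ing with v_cmp_u and
 * AND-ing with v_cmp_o express exactly those two cases. */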
|
|
|
|
|
bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2019-12-03 13:37:49 +00:00
|
|
|
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
|
|
|
|
|
aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
Instruction *nan_test = follow_operand(ctx, instr->operands[0], true);
|
|
|
|
|
Instruction *cmp = follow_operand(ctx, instr->operands[1], true);
|
|
|
|
|
if (!nan_test || !cmp)
|
|
|
|
|
return false;
|
2019-12-05 14:12:39 +00:00
|
|
|
if (nan_test->isSDWA() || cmp->isSDWA())
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
if (get_f32_cmp(cmp->opcode) == expected_nan_test)
|
2019-09-17 13:22:17 +02:00
|
|
|
std::swap(nan_test, cmp);
|
2020-05-15 20:26:39 +01:00
|
|
|
else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
|
|
|
|
|
unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
|
|
|
|
|
unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
|
|
|
|
|
unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
|
|
|
|
|
if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)
|
|
|
|
|
return false;
|
|
|
|
|
if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
ctx.uses[cmp->operands[0].tempId()]++;
|
|
|
|
|
ctx.uses[cmp->operands[1].tempId()]++;
|
|
|
|
|
decrease_uses(ctx, nan_test);
|
|
|
|
|
decrease_uses(ctx, cmp);
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction *new_instr;
|
|
|
|
|
if (cmp->isVOP3()) {
|
|
|
|
|
VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
|
|
|
|
|
VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
|
|
|
|
|
memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
|
|
|
|
|
memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
|
|
|
|
|
new_vop3->clamp = cmp_vop3->clamp;
|
|
|
|
|
new_vop3->omod = cmp_vop3->omod;
|
2020-01-08 11:49:11 +01:00
|
|
|
new_vop3->opsel = cmp_vop3->opsel;
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr = new_vop3;
|
|
|
|
|
} else {
|
|
|
|
|
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
|
|
|
|
|
}
|
|
|
|
|
new_instr->operands[0] = cmp->operands[0];
|
|
|
|
|
new_instr->operands[1] = cmp->operands[1];
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
2020-06-19 16:09:48 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
|
|
|
|
|
* s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
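/* Same idea as above, but the NaN test only covers one operand; this is still
 * safe as long as the other operand is a constant known not to be NaN, which
 * is what the isnan() check on the constant below verifies. */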
|
|
|
|
|
bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2019-12-03 13:37:49 +00:00
|
|
|
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction *nan_test = follow_operand(ctx, instr->operands[0], true);
|
|
|
|
|
Instruction *cmp = follow_operand(ctx, instr->operands[1], true);
|
|
|
|
|
|
|
|
|
|
if (!nan_test || !cmp)
|
|
|
|
|
return false;
|
2019-12-05 14:12:39 +00:00
|
|
|
if (nan_test->isSDWA() || cmp->isSDWA())
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
|
2020-05-15 20:26:39 +01:00
|
|
|
if (get_f32_cmp(cmp->opcode) == expected_nan_test)
|
2019-09-17 13:22:17 +02:00
|
|
|
std::swap(nan_test, cmp);
|
2020-05-15 20:26:39 +01:00
|
|
|
else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
|
|
|
|
|
unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
|
|
|
|
|
if (prop_nan0 != prop_nan1)
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-11 16:57:11 +00:00
|
|
|
if (nan_test->isVOP3()) {
|
|
|
|
|
VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(nan_test);
|
|
|
|
|
if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
int constant_operand = -1;
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (cmp->operands[i].isTemp() && original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) {
|
|
|
|
|
constant_operand = !i;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (constant_operand == -1)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
uint32_t constant;
|
|
|
|
|
if (cmp->operands[constant_operand].isConstant()) {
|
|
|
|
|
constant = cmp->operands[constant_operand].constantValue();
|
|
|
|
|
} else if (cmp->operands[constant_operand].isTemp()) {
|
2019-11-22 14:17:27 +00:00
|
|
|
Temp tmp = cmp->operands[constant_operand].getTemp();
|
|
|
|
|
unsigned id = original_temp_id(ctx, tmp);
|
2020-05-15 16:28:03 +01:00
|
|
|
if (!ctx.info[id].is_constant_or_literal(32))
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
constant = ctx.info[id].val;
|
|
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
float constantf;
|
|
|
|
|
memcpy(&constantf, &constant, 4);
|
|
|
|
|
if (isnan(constantf))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (cmp->operands[0].isTemp())
|
|
|
|
|
ctx.uses[cmp->operands[0].tempId()]++;
|
|
|
|
|
if (cmp->operands[1].isTemp())
|
|
|
|
|
ctx.uses[cmp->operands[1].tempId()]++;
|
|
|
|
|
decrease_uses(ctx, nan_test);
|
|
|
|
|
decrease_uses(ctx, cmp);
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction *new_instr;
|
|
|
|
|
if (cmp->isVOP3()) {
|
|
|
|
|
VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
|
|
|
|
|
VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
|
|
|
|
|
memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
|
|
|
|
|
memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
|
|
|
|
|
new_vop3->clamp = cmp_vop3->clamp;
|
|
|
|
|
new_vop3->omod = cmp_vop3->omod;
|
2020-01-08 11:49:11 +01:00
|
|
|
new_vop3->opsel = cmp_vop3->opsel;
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr = new_vop3;
|
|
|
|
|
} else {
|
|
|
|
|
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
|
|
|
|
|
}
|
|
|
|
|
new_instr->operands[0] = cmp->operands[0];
|
|
|
|
|
new_instr->operands[1] = cmp->operands[1];
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
2020-06-19 16:09:48 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-16 15:35:14 +00:00
|
|
|
/* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */
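/* s_andn2 with exec computes exec & ~cmp, i.e. the set of active lanes where
 * the comparison failed, which is the same mask a VOPC instruction with the
 * inverse opcode would produce under the same exec mask. */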
|
2019-09-17 13:22:17 +02:00
|
|
|
bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2019-12-16 15:35:14 +00:00
|
|
|
if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
2019-12-16 15:35:14 +00:00
|
|
|
if (ctx.uses[instr->definitions[1].tempId()])
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
2019-12-16 15:35:14 +00:00
|
|
|
Instruction *cmp = follow_operand(ctx, instr->operands[1]);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!cmp)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
aco_opcode new_opcode = get_inverse(cmp->opcode);
|
2020-05-19 13:26:21 +01:00
|
|
|
if (new_opcode == aco_opcode::num_opcodes)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (cmp->operands[0].isTemp())
|
|
|
|
|
ctx.uses[cmp->operands[0].tempId()]++;
|
|
|
|
|
if (cmp->operands[1].isTemp())
|
|
|
|
|
ctx.uses[cmp->operands[1].tempId()]++;
|
|
|
|
|
decrease_uses(ctx, cmp);
|
|
|
|
|
|
2019-12-16 15:35:14 +00:00
|
|
|
/* This creates a new instruction instead of modifying the existing
|
|
|
|
|
* comparison so that the comparison is done with the correct exec mask. */
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction *new_instr;
|
|
|
|
|
if (cmp->isVOP3()) {
|
|
|
|
|
VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
|
|
|
|
|
VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp);
|
|
|
|
|
memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs));
|
|
|
|
|
memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg));
|
|
|
|
|
new_vop3->clamp = cmp_vop3->clamp;
|
|
|
|
|
new_vop3->omod = cmp_vop3->omod;
|
2020-01-08 11:49:11 +01:00
|
|
|
new_vop3->opsel = cmp_vop3->opsel;
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr = new_vop3;
|
2019-12-05 14:12:39 +00:00
|
|
|
} else if (cmp->isSDWA()) {
|
|
|
|
|
SDWA_instruction *new_sdwa = create_instruction<SDWA_instruction>(
|
|
|
|
|
new_opcode, (Format)((uint16_t)Format::SDWA | (uint16_t)Format::VOPC), 2, 1);
|
|
|
|
|
SDWA_instruction *cmp_sdwa = static_cast<SDWA_instruction*>(cmp);
|
|
|
|
|
memcpy(new_sdwa->abs, cmp_sdwa->abs, sizeof(new_sdwa->abs));
|
|
|
|
|
memcpy(new_sdwa->sel, cmp_sdwa->sel, sizeof(new_sdwa->sel));
|
|
|
|
|
memcpy(new_sdwa->neg, cmp_sdwa->neg, sizeof(new_sdwa->neg));
|
|
|
|
|
new_sdwa->dst_sel = cmp_sdwa->dst_sel;
|
|
|
|
|
new_sdwa->dst_preserve = cmp_sdwa->dst_preserve;
|
|
|
|
|
new_sdwa->clamp = cmp_sdwa->clamp;
|
|
|
|
|
new_sdwa->omod = cmp_sdwa->omod;
|
|
|
|
|
new_instr = new_sdwa;
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);
|
|
|
|
|
}
|
|
|
|
|
new_instr->operands[0] = cmp->operands[0];
|
|
|
|
|
new_instr->operands[1] = cmp->operands[1];
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
2020-06-19 16:09:48 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* op1(op2(1, 2), 0) if swap = false
|
|
|
|
|
* op1(0, op2(1, 2)) if swap = true */
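/* The digits in shuffle_str name the three source values (0 = op1's other
 * operand, 1/2 = op2's operands) and the position of each digit in the string
 * gives the operand slot it ends up in for the combined VOP3 instruction,
 * e.g. "012" keeps the order (0, 1, 2) while "120" would produce (1, 2, 0). */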
|
|
|
|
|
bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2,
|
|
|
|
|
Instruction* op1_instr, bool swap, const char *shuffle_str,
|
2020-01-08 11:49:11 +01:00
|
|
|
Operand operands[3], bool neg[3], bool abs[3], uint8_t *opsel,
|
|
|
|
|
bool *op1_clamp, uint8_t *op1_omod,
|
2019-09-17 13:22:17 +02:00
|
|
|
bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel)
|
|
|
|
|
{
|
|
|
|
|
/* checks */
|
|
|
|
|
if (op1_instr->opcode != op1)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
Instruction *op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
|
|
|
|
|
if (!op2_instr || op2_instr->opcode != op2)
|
|
|
|
|
return false;
|
2020-01-28 12:04:48 +00:00
|
|
|
if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1]))
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
VOP3A_instruction *op1_vop3 = op1_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op1_instr) : NULL;
|
|
|
|
|
VOP3A_instruction *op2_vop3 = op2_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op2_instr) : NULL;
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
if (op1_instr->isSDWA() || op2_instr->isSDWA())
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* don't support inbetween clamp/omod */
|
|
|
|
|
if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* get operands and modifiers and check inbetween modifiers */
|
|
|
|
|
*op1_clamp = op1_vop3 ? op1_vop3->clamp : false;
|
|
|
|
|
*op1_omod = op1_vop3 ? op1_vop3->omod : 0u;
|
|
|
|
|
|
|
|
|
|
if (inbetween_neg)
|
|
|
|
|
*inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;
|
|
|
|
|
else if (op1_vop3 && op1_vop3->neg[swap])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (inbetween_abs)
|
|
|
|
|
*inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;
|
|
|
|
|
else if (op1_vop3 && op1_vop3->abs[swap])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (inbetween_opsel)
|
2020-01-08 11:49:11 +01:00
|
|
|
*inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << swap) : false;
|
|
|
|
|
else if (op1_vop3 && op1_vop3->opsel & (1 << swap))
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
int shuffle[3];
|
|
|
|
|
shuffle[shuffle_str[0] - '0'] = 0;
|
|
|
|
|
shuffle[shuffle_str[1] - '0'] = 1;
|
|
|
|
|
shuffle[shuffle_str[2] - '0'] = 2;
|
|
|
|
|
|
|
|
|
|
operands[shuffle[0]] = op1_instr->operands[!swap];
|
|
|
|
|
neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false;
|
|
|
|
|
abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;
|
2020-01-08 11:49:11 +01:00
|
|
|
if (op1_vop3 && op1_vop3->opsel & (1 << !swap))
|
|
|
|
|
*opsel |= 1 << shuffle[0];
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
operands[shuffle[i + 1]] = op2_instr->operands[i];
|
|
|
|
|
neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;
|
|
|
|
|
abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;
|
2020-01-08 11:49:11 +01:00
|
|
|
if (op2_vop3 && op2_vop3->opsel & (1 << i))
|
|
|
|
|
*opsel |= 1 << shuffle[i + 1];
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* check operands */
|
2019-11-22 14:50:41 +00:00
|
|
|
if (!check_vop3_operands(ctx, 3, operands))
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
|
2020-01-08 11:49:11 +01:00
|
|
|
Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel,
|
2019-09-17 13:22:17 +02:00
|
|
|
bool clamp, unsigned omod)
|
|
|
|
|
{
|
|
|
|
|
VOP3A_instruction *new_instr = create_instruction<VOP3A_instruction>(opcode, Format::VOP3A, 3, 1);
|
|
|
|
|
memcpy(new_instr->abs, abs, sizeof(bool[3]));
|
|
|
|
|
memcpy(new_instr->neg, neg, sizeof(bool[3]));
|
|
|
|
|
new_instr->clamp = clamp;
|
|
|
|
|
new_instr->omod = omod;
|
2020-01-08 11:49:11 +01:00
|
|
|
new_instr->opsel = opsel;
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr->operands[0] = operands[0];
|
|
|
|
|
new_instr->operands[1] = operands[1];
|
|
|
|
|
new_instr->operands[2] = operands[2];
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops)
|
|
|
|
|
{
|
|
|
|
|
for (unsigned swap = 0; swap < 2; swap++) {
|
|
|
|
|
if (!((1 << swap) & ops))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Operand operands[3];
|
2020-01-08 11:49:11 +01:00
|
|
|
bool neg[3], abs[3], clamp;
|
|
|
|
|
uint8_t opsel = 0, omod = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (match_op3_for_vop3(ctx, instr->opcode, op2,
|
|
|
|
|
instr.get(), swap, shuffle,
|
2020-01-08 11:49:11 +01:00
|
|
|
operands, neg, abs, &opsel,
|
2019-09-17 13:22:17 +02:00
|
|
|
&clamp, &omod, NULL, NULL, NULL)) {
|
|
|
|
|
ctx.uses[instr->operands[swap].tempId()]--;
|
|
|
|
|
create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-22 20:32:11 +00:00
|
|
|
bool combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)
|
|
|
|
|
{
|
|
|
|
|
if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* min(-max(a, b), c) -> min3(-a, -b, c) *
|
|
|
|
|
* max(-min(a, b), c) -> max3(-a, -b, c) */
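/* Relies on negation distributing over min/max: -max(a, b) == min(-a, -b),
 * so the inner opposite-direction min/max plus the neg modifier on its result
 * can be rewritten as a three-operand min3/max3 with neg set on the two inner
 * operands instead. */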
|
|
|
|
|
for (unsigned swap = 0; swap < 2; swap++) {
|
|
|
|
|
Operand operands[3];
|
|
|
|
|
bool neg[3], abs[3], clamp;
|
|
|
|
|
uint8_t opsel = 0, omod = 0;
|
|
|
|
|
bool inbetween_neg;
|
|
|
|
|
if (match_op3_for_vop3(ctx, instr->opcode, opposite,
|
|
|
|
|
instr.get(), swap, "012",
|
|
|
|
|
operands, neg, abs, &opsel,
|
|
|
|
|
&clamp, &omod, &inbetween_neg, NULL, NULL) &&
|
|
|
|
|
inbetween_neg) {
|
|
|
|
|
ctx.uses[instr->operands[swap].tempId()]--;
|
|
|
|
|
neg[1] = true;
|
|
|
|
|
neg[2] = true;
|
|
|
|
|
create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
|
|
|
|
|
* s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
|
|
|
|
|
* s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
|
|
|
|
|
* s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
|
|
|
|
|
* s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
|
|
|
|
|
* s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
|
|
|
|
|
bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
/* checks */
|
|
|
|
|
if (!instr->operands[0].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
Instruction *op2_instr = follow_operand(ctx, instr->operands[0]);
|
|
|
|
|
if (!op2_instr)
|
|
|
|
|
return false;
|
|
|
|
|
switch (op2_instr->opcode) {
|
|
|
|
|
case aco_opcode::s_and_b32:
|
|
|
|
|
case aco_opcode::s_or_b32:
|
|
|
|
|
case aco_opcode::s_xor_b32:
|
|
|
|
|
case aco_opcode::s_and_b64:
|
|
|
|
|
case aco_opcode::s_or_b64:
|
|
|
|
|
case aco_opcode::s_xor_b64:
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* create instruction */
|
|
|
|
|
std::swap(instr->definitions[0], op2_instr->definitions[0]);
|
2020-01-28 12:32:09 +01:00
|
|
|
std::swap(instr->definitions[1], op2_instr->definitions[1]);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
|
|
|
|
ctx.info[op2_instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
|
|
|
|
switch (op2_instr->opcode) {
|
|
|
|
|
case aco_opcode::s_and_b32:
|
|
|
|
|
op2_instr->opcode = aco_opcode::s_nand_b32;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_or_b32:
|
|
|
|
|
op2_instr->opcode = aco_opcode::s_nor_b32;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_xor_b32:
|
|
|
|
|
op2_instr->opcode = aco_opcode::s_xnor_b32;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_and_b64:
|
|
|
|
|
op2_instr->opcode = aco_opcode::s_nand_b64;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_or_b64:
|
|
|
|
|
op2_instr->opcode = aco_opcode::s_nor_b64;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_xor_b64:
|
|
|
|
|
op2_instr->opcode = aco_opcode::s_xnor_b64;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
|
|
|
|
|
* s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
|
|
|
|
|
* s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
|
|
|
|
|
* s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
|
|
|
|
|
bool combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2020-02-05 11:19:06 +01:00
|
|
|
if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
Instruction *op2_instr = follow_operand(ctx, instr->operands[i]);
|
|
|
|
|
if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && op2_instr->opcode != aco_opcode::s_not_b64))
|
|
|
|
|
continue;
|
2020-01-28 12:05:26 +00:00
|
|
|
if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0]))
|
2020-01-28 12:04:48 +00:00
|
|
|
continue;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-11-22 14:34:24 +00:00
|
|
|
if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
|
|
|
|
|
instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
|
|
|
|
|
continue;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
instr->operands[0] = instr->operands[!i];
|
|
|
|
|
instr->operands[1] = op2_instr->operands[0];
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::s_and_b32:
|
|
|
|
|
instr->opcode = aco_opcode::s_andn2_b32;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_or_b32:
|
|
|
|
|
instr->opcode = aco_opcode::s_orn2_b32;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_and_b64:
|
|
|
|
|
instr->opcode = aco_opcode::s_andn2_b64;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_or_b64:
|
|
|
|
|
instr->opcode = aco_opcode::s_orn2_b64;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
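/* s_lshl1_add_u32 .. s_lshl4_add_u32 (introduced with GFX9) fold a left shift
 * by 1..4 into the add in a single SALU instruction, which is why the shift
 * amount is restricted to the range [1, 4] below. */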
|
|
|
|
|
bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2020-01-28 12:05:26 +00:00
|
|
|
if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
Instruction *op2_instr = follow_operand(ctx, instr->operands[i], true);
|
2020-01-28 12:05:26 +00:00
|
|
|
if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
|
|
|
|
|
ctx.uses[op2_instr->definitions[1].tempId()])
|
2020-01-28 12:04:48 +00:00
|
|
|
continue;
|
|
|
|
|
if (!op2_instr->operands[1].isConstant() || fixed_to_exec(op2_instr->operands[0]))
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
uint32_t shift = op2_instr->operands[1].constantValue();
|
|
|
|
|
if (shift < 1 || shift > 4)
|
|
|
|
|
continue;
|
|
|
|
|
|
2019-11-22 14:34:24 +00:00
|
|
|
if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
|
|
|
|
|
instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
|
|
|
|
|
continue;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
instr->operands[1] = instr->operands[!i];
|
|
|
|
|
instr->operands[0] = op2_instr->operands[0];
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
|
|
|
|
instr->opcode = ((aco_opcode[]){aco_opcode::s_lshl1_add_u32,
|
|
|
|
|
aco_opcode::s_lshl2_add_u32,
|
|
|
|
|
aco_opcode::s_lshl3_add_u32,
|
|
|
|
|
aco_opcode::s_lshl4_add_u32})[shift - 1];
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-02 17:41:36 +02:00
|
|
|
bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
|
|
|
|
|
{
|
|
|
|
|
if (instr->usesModifiers())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (!((1 << i) & ops))
|
|
|
|
|
continue;
|
|
|
|
|
if (instr->operands[i].isTemp() &&
|
|
|
|
|
ctx.info[instr->operands[i].tempId()].is_b2i() &&
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()] == 1) {
|
|
|
|
|
|
|
|
|
|
aco_ptr<Instruction> new_instr;
|
|
|
|
|
if (instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
|
|
|
|
|
new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));
|
|
|
|
|
} else if (ctx.program->chip_class >= GFX10 ||
|
|
|
|
|
(instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
|
|
|
|
|
new_instr.reset(create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
|
|
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
if (instr->definitions.size() == 2) {
|
|
|
|
|
new_instr->definitions[1] = instr->definitions[1];
|
|
|
|
|
} else {
|
|
|
|
|
new_instr->definitions[1] =
|
|
|
|
|
Definition(ctx.program->allocateTmp(ctx.program->lane_mask));
|
|
|
|
|
/* Make sure the uses vector is large enough and the number of
|
|
|
|
|
* uses properly initialized to 0.
|
|
|
|
|
*/
|
|
|
|
|
ctx.uses.push_back(0);
|
|
|
|
|
}
|
2020-04-02 17:41:36 +02:00
|
|
|
new_instr->definitions[1].setHint(vcc);
|
|
|
|
|
new_instr->operands[0] = Operand(0u);
|
|
|
|
|
new_instr->operands[1] = instr->operands[!i];
|
|
|
|
|
new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
|
|
|
|
|
instr = std::move(new_instr);
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
|
2020-04-02 17:41:36 +02:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only)
|
|
|
|
|
{
|
|
|
|
|
switch (op) {
|
|
|
|
|
#define MINMAX(type, gfx9) \
|
|
|
|
|
case aco_opcode::v_min_##type:\
|
|
|
|
|
case aco_opcode::v_max_##type:\
|
|
|
|
|
case aco_opcode::v_med3_##type:\
|
|
|
|
|
*min = aco_opcode::v_min_##type;\
|
|
|
|
|
*max = aco_opcode::v_max_##type;\
|
|
|
|
|
*med3 = aco_opcode::v_med3_##type;\
|
|
|
|
|
*min3 = aco_opcode::v_min3_##type;\
|
|
|
|
|
*max3 = aco_opcode::v_max3_##type;\
|
|
|
|
|
*some_gfx9_only = gfx9;\
|
|
|
|
|
return true;
|
|
|
|
|
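/* the 16-bit min3/max3/med3 variants only exist on GFX9+, hence the gfx9 flag */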
MINMAX(f32, false)
|
|
|
|
|
MINMAX(u32, false)
|
|
|
|
|
MINMAX(i32, false)
|
|
|
|
|
MINMAX(f16, true)
|
|
|
|
|
MINMAX(u16, true)
|
|
|
|
|
MINMAX(i16, true)
|
|
|
|
|
#undef MINMAX
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb
|
|
|
|
|
* v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb */
|
|
|
|
|
bool combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr,
|
|
|
|
|
aco_opcode min, aco_opcode max, aco_opcode med)
|
|
|
|
|
{
|
2019-11-22 17:50:29 +00:00
|
|
|
/* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
|
|
|
|
|
* FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
|
|
|
|
|
* minVal > maxVal, which means we can always select it to a v_med3_f32 */
|
2019-09-17 13:22:17 +02:00
|
|
|
aco_opcode other_op;
|
|
|
|
|
if (instr->opcode == min)
|
|
|
|
|
other_op = max;
|
|
|
|
|
else if (instr->opcode == max)
|
|
|
|
|
other_op = min;
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned swap = 0; swap < 2; swap++) {
|
|
|
|
|
Operand operands[3];
|
2019-11-22 20:58:59 +00:00
|
|
|
bool neg[3], abs[3], clamp;
|
2020-01-08 11:49:11 +01:00
|
|
|
uint8_t opsel = 0, omod = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap,
|
2020-01-08 11:49:11 +01:00
|
|
|
"012", operands, neg, abs, &opsel,
|
2019-11-22 20:58:59 +00:00
|
|
|
&clamp, &omod, NULL, NULL, NULL)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
int const0_idx = -1, const1_idx = -1;
|
|
|
|
|
uint32_t const0 = 0, const1 = 0;
|
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
|
|
|
uint32_t val;
|
|
|
|
|
if (operands[i].isConstant()) {
|
|
|
|
|
val = operands[i].constantValue();
|
2020-05-15 16:28:03 +01:00
|
|
|
} else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
val = ctx.info[operands[i].tempId()].val;
|
|
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (const0_idx >= 0) {
|
|
|
|
|
const1_idx = i;
|
|
|
|
|
const1 = val;
|
|
|
|
|
} else {
|
|
|
|
|
const0_idx = i;
|
|
|
|
|
const0 = val;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (const0_idx < 0 || const1_idx < 0)
|
|
|
|
|
continue;
|
|
|
|
|
|
2020-01-08 11:49:11 +01:00
|
|
|
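/* opsel selects the high half of a 16-bit operand, so use the upper 16 bits of the packed constant */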
if (opsel & (1 << const0_idx))
|
2019-09-17 13:22:17 +02:00
|
|
|
const0 >>= 16;
|
2020-01-08 11:49:11 +01:00
|
|
|
if (opsel & (1 << const1_idx))
|
2019-09-17 13:22:17 +02:00
|
|
|
const1 >>= 16;
|
|
|
|
|
|
|
|
|
|
int lower_idx = const0_idx;
|
|
|
|
|
switch (min) {
|
|
|
|
|
case aco_opcode::v_min_f32:
|
|
|
|
|
case aco_opcode::v_min_f16: {
|
|
|
|
|
float const0_f, const1_f;
|
|
|
|
|
if (min == aco_opcode::v_min_f32) {
|
|
|
|
|
memcpy(&const0_f, &const0, 4);
|
|
|
|
|
memcpy(&const1_f, &const1, 4);
|
|
|
|
|
} else {
|
|
|
|
|
const0_f = _mesa_half_to_float(const0);
|
|
|
|
|
const1_f = _mesa_half_to_float(const1);
|
|
|
|
|
}
|
|
|
|
|
if (abs[const0_idx]) const0_f = fabsf(const0_f);
|
|
|
|
|
if (abs[const1_idx]) const1_f = fabsf(const1_f);
|
|
|
|
|
if (neg[const0_idx]) const0_f = -const0_f;
|
|
|
|
|
if (neg[const1_idx]) const1_f = -const1_f;
|
|
|
|
|
lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_min_u32: {
|
|
|
|
|
lower_idx = const0 < const1 ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_min_u16: {
|
|
|
|
|
lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_min_i32: {
|
|
|
|
|
int32_t const0_i = const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
|
|
|
|
|
int32_t const1_i = const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
|
|
|
|
|
lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_min_i16: {
|
|
|
|
|
int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
|
|
|
|
|
int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
|
|
|
|
|
lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;
|
|
|
|
|
|
|
|
|
|
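/* only combine when the lower and upper bounds ended up in positions where the min/max pair really behaves like med3 */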
if (instr->opcode == min) {
|
|
|
|
|
if (upper_idx != 0 || lower_idx == 0)
|
|
|
|
|
return false;
|
|
|
|
|
} else {
|
|
|
|
|
if (upper_idx == 0 || lower_idx != 0)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ctx.uses[instr->operands[swap].tempId()]--;
|
|
|
|
|
create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void apply_sgprs(opt_ctx &ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2019-11-20 16:42:17 +00:00
|
|
|
bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_lshrrev_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_ashrrev_i64;
|
|
|
|
|
|
2019-11-22 14:55:25 +00:00
|
|
|
/* find candidates and create the set of sgprs already read */
|
|
|
|
|
unsigned sgpr_ids[2] = {0, 0};
|
|
|
|
|
uint32_t operand_mask = 0;
|
|
|
|
|
bool has_literal = false;
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
if (instr->operands[i].isLiteral())
|
|
|
|
|
has_literal = true;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!instr->operands[i].isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
if (instr->operands[i].getTemp().type() == RegType::sgpr) {
|
2019-11-22 14:55:25 +00:00
|
|
|
if (instr->operands[i].tempId() != sgpr_ids[0])
|
|
|
|
|
sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
ssa_info& info = ctx.info[instr->operands[i].tempId()];
|
2019-11-22 14:55:25 +00:00
|
|
|
if (info.is_temp() && info.temp.type() == RegType::sgpr)
|
|
|
|
|
operand_mask |= 1u << i;
|
|
|
|
|
}
|
|
|
|
|
unsigned max_sgprs = 1;
|
2019-11-20 16:42:17 +00:00
|
|
|
if (ctx.program->chip_class >= GFX10 && !is_shift64)
|
|
|
|
|
max_sgprs = 2;
|
2019-11-22 14:55:25 +00:00
|
|
|
if (has_literal)
|
|
|
|
|
max_sgprs--;
|
|
|
|
|
|
|
|
|
|
unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
|
|
|
|
|
|
|
|
|
|
/* keep on applying sgprs until there is nothing left to be done */
|
|
|
|
|
while (operand_mask) {
|
|
|
|
|
uint32_t sgpr_idx = 0;
|
|
|
|
|
uint32_t sgpr_info_id = 0;
|
|
|
|
|
uint32_t mask = operand_mask;
|
|
|
|
|
/* choose a sgpr */
|
|
|
|
|
while (mask) {
|
|
|
|
|
unsigned i = u_bit_scan(&mask);
|
2019-09-17 13:22:17 +02:00
|
|
|
uint16_t uses = ctx.uses[instr->operands[i].tempId()];
|
|
|
|
|
if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
|
|
|
|
|
sgpr_idx = i;
|
|
|
|
|
sgpr_info_id = instr->operands[i].tempId();
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-11-22 14:55:25 +00:00
|
|
|
operand_mask &= ~(1u << sgpr_idx);
|
|
|
|
|
|
|
|
|
|
/* Applying two sgprs requires making it VOP3, so don't do it unless it's
|
|
|
|
|
* definitively beneficial.
|
|
|
|
|
* TODO: this is too conservative because later the use count could be reduced to 1 */
|
2019-12-05 14:12:39 +00:00
|
|
|
if (num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() && !instr->isSDWA())
|
2019-11-22 14:55:25 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
Temp sgpr = ctx.info[sgpr_info_id].temp;
|
|
|
|
|
bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
|
|
|
|
|
if (new_sgpr && num_sgprs >= max_sgprs)
|
|
|
|
|
continue;
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA()) {
|
2019-11-22 14:55:25 +00:00
|
|
|
instr->operands[sgpr_idx] = Operand(sgpr);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else if (can_swap_operands(instr)) {
|
|
|
|
|
instr->operands[sgpr_idx] = instr->operands[0];
|
2019-11-22 14:55:25 +00:00
|
|
|
instr->operands[0] = Operand(sgpr);
|
|
|
|
|
/* swap bits using a 4-entry LUT */
|
|
|
|
|
uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
|
|
|
|
|
operand_mask = (operand_mask & ~0x3) | swapped;
|
2019-11-20 16:42:17 +00:00
|
|
|
} else if (can_use_VOP3(ctx, instr)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
to_VOP3(ctx, instr);
|
2019-11-22 14:55:25 +00:00
|
|
|
instr->operands[sgpr_idx] = Operand(sgpr);
|
|
|
|
|
} else {
|
|
|
|
|
continue;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-01-17 11:35:20 +00:00
|
|
|
if (new_sgpr)
|
|
|
|
|
sgpr_ids[num_sgprs++] = sgpr.id();
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[sgpr_info_id]--;
|
2019-11-22 14:55:25 +00:00
|
|
|
ctx.uses[sgpr.id()]++;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
template <typename T>
|
|
|
|
|
bool apply_omod_clamp_helper(opt_ctx &ctx, T *instr, ssa_info& def_info)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-12-05 14:12:39 +00:00
|
|
|
if (!def_info.is_clamp() && (instr->clamp || instr->omod))
|
2020-08-12 15:58:32 +01:00
|
|
|
return false;
|
2019-12-17 14:55:24 +00:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
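/* hardware omod encoding: 1 = *2.0, 2 = *4.0, 3 = *0.5 */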
if (def_info.is_omod2())
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->omod = 1;
|
2020-08-12 15:58:32 +01:00
|
|
|
else if (def_info.is_omod4())
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->omod = 2;
|
2020-08-12 15:58:32 +01:00
|
|
|
else if (def_info.is_omod5())
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->omod = 3;
|
2020-08-12 15:58:32 +01:00
|
|
|
else if (def_info.is_clamp())
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->clamp = true;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
return true;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
|
|
|
|
|
bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
|
|
|
|
|
!instr_info.can_use_output_modifiers[(int)instr->opcode])
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
bool can_vop3 = can_use_VOP3(ctx, instr);
|
|
|
|
|
if (!instr->isSDWA() && !can_vop3)
|
2020-08-12 15:58:32 +01:00
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-11-09 20:51:45 +00:00
|
|
|
/* omod has no effect if denormals are enabled */
|
2020-08-12 15:58:32 +01:00
|
|
|
bool can_use_omod = (instr->definitions[0].bytes() == 4 ? block.fp_mode.denorm32 : block.fp_mode.denorm16_64) == 0;
|
2019-12-05 14:12:39 +00:00
|
|
|
can_use_omod = can_use_omod && (can_vop3 || ctx.program->chip_class >= GFX9); /* SDWA omod is GFX9+ */
|
|
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
|
|
|
|
|
if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
|
|
|
|
|
return false;
|
|
|
|
|
/* if the omod/clamp instruction is dead, then the single user of this
|
|
|
|
|
* instruction is a different instruction */
|
|
|
|
|
if (!ctx.uses[def_info.instr->definitions[0].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* MADs/FMAs are created later, so we don't have to update the original add */
|
|
|
|
|
assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
if (instr->isSDWA()) {
|
|
|
|
|
if (!apply_omod_clamp_helper(ctx, static_cast<SDWA_instruction *>(instr.get()), def_info))
|
|
|
|
|
return false;
|
|
|
|
|
} else {
|
|
|
|
|
to_VOP3(ctx, instr);
|
|
|
|
|
if (!apply_omod_clamp_helper(ctx, static_cast<VOP3A_instruction *>(instr.get()), def_info))
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2020-08-12 15:58:32 +01:00
|
|
|
|
|
|
|
|
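/* take over the definition of the omod/clamp instruction so it becomes dead and can be removed */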
std::swap(instr->definitions[0], def_info.instr->definitions[0]);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label &= label_clamp;
|
|
|
|
|
ctx.uses[def_info.instr->definitions[0].tempId()]--;
|
|
|
|
|
|
|
|
|
|
return true;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-11-03 18:50:32 +01:00
|
|
|
/* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
|
|
|
|
|
bool combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->usesModifiers())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
Instruction *op_instr = follow_operand(ctx, instr->operands[i], true);
|
|
|
|
|
if (op_instr &&
|
|
|
|
|
op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&
|
|
|
|
|
op_instr->operands[0].constantEquals(0) &&
|
|
|
|
|
op_instr->operands[1].constantEquals(0) &&
|
|
|
|
|
!op_instr->usesModifiers()) {
|
|
|
|
|
|
|
|
|
|
aco_ptr<Instruction> new_instr;
|
|
|
|
|
if (instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
|
|
|
|
|
new_instr.reset(create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));
|
|
|
|
|
} else if (ctx.program->chip_class >= GFX10 ||
|
|
|
|
|
(instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
|
|
|
|
|
new_instr.reset(create_instruction<VOP3A_instruction>(aco_opcode::v_cndmask_b32, asVOP3(Format::VOP2), 3, 1));
|
|
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
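/* the new v_cndmask reads the carry operand directly, so add a use if the subbrev stays alive */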
if (ctx.uses[instr->operands[i].tempId()])
|
|
|
|
|
ctx.uses[op_instr->operands[2].tempId()]++;
|
|
|
|
|
|
|
|
|
|
new_instr->operands[0] = Operand(0u);
|
|
|
|
|
new_instr->operands[1] = instr->operands[!i];
|
|
|
|
|
new_instr->operands[2] = Operand(op_instr->operands[2]);
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
instr = std::move(new_instr);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
|
|
|
|
|
// this would mean that we'd have to fix the instruction uses during value propagation
|
|
|
|
|
|
2019-11-09 20:51:45 +00:00
|
|
|
void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-11-19 13:38:34 +01:00
|
|
|
if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (instr->isVALU()) {
|
2019-12-05 14:12:39 +00:00
|
|
|
if (can_apply_sgprs(ctx, instr))
|
2019-09-24 13:32:56 +01:00
|
|
|
apply_sgprs(ctx, instr);
|
2020-08-12 15:58:32 +01:00
|
|
|
while (apply_omod_clamp(ctx, block, instr)) ;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-01-07 10:12:08 +01:00
|
|
|
if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) {
|
|
|
|
|
instr->definitions[0].setHint(vcc);
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
if (instr->isSDWA())
|
|
|
|
|
return;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* TODO: There are still some peephole optimizations that could be done:
|
|
|
|
|
* - abs(a - b) -> s_absdiff_i32
|
|
|
|
|
* - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
|
|
|
|
|
* - patterns for v_alignbit_b32 and v_alignbyte_b32
|
|
|
|
|
* These probably aren't too interesting though.
|
|
|
|
|
* There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
|
|
|
|
|
* probably more useful than the previously mentioned optimizations.
|
|
|
|
|
* The various comparison optimizations also currently only work with 32-bit
|
|
|
|
|
* floats. */
|
|
|
|
|
|
|
|
|
|
/* neg(mul(a, b)) -> mul(neg(a), b) */
|
|
|
|
|
if (ctx.info[instr->definitions[0].tempId()].is_neg() && ctx.uses[instr->operands[1].tempId()] == 1) {
|
|
|
|
|
Temp val = ctx.info[instr->definitions[0].tempId()].temp;
|
|
|
|
|
|
|
|
|
|
if (!ctx.info[val.id()].is_mul())
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Instruction* mul_instr = ctx.info[val.id()].instr;
|
|
|
|
|
|
|
|
|
|
if (mul_instr->operands[0].isLiteral())
|
|
|
|
|
return;
|
|
|
|
|
if (mul_instr->isVOP3() && static_cast<VOP3A_instruction*>(mul_instr)->clamp)
|
|
|
|
|
return;
|
2019-12-05 14:12:39 +00:00
|
|
|
if (mul_instr->isSDWA())
|
|
|
|
|
return;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* convert to mul(neg(a), b) */
|
|
|
|
|
ctx.uses[mul_instr->definitions[0].tempId()]--;
|
|
|
|
|
Definition def = instr->definitions[0];
|
|
|
|
|
/* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */
|
|
|
|
|
bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
|
2020-05-15 15:12:33 +01:00
|
|
|
instr.reset(create_instruction<VOP3A_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1));
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[0] = mul_instr->operands[0];
|
|
|
|
|
instr->operands[1] = mul_instr->operands[1];
|
|
|
|
|
instr->definitions[0] = def;
|
|
|
|
|
VOP3A_instruction* new_mul = static_cast<VOP3A_instruction*>(instr.get());
|
|
|
|
|
if (mul_instr->isVOP3()) {
|
|
|
|
|
VOP3A_instruction* mul = static_cast<VOP3A_instruction*>(mul_instr);
|
|
|
|
|
new_mul->neg[0] = mul->neg[0] && !is_abs;
|
|
|
|
|
new_mul->neg[1] = mul->neg[1] && !is_abs;
|
|
|
|
|
new_mul->abs[0] = mul->abs[0] || is_abs;
|
|
|
|
|
new_mul->abs[1] = mul->abs[1] || is_abs;
|
|
|
|
|
new_mul->omod = mul->omod;
|
|
|
|
|
}
|
|
|
|
|
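/* fold the outer neg by toggling the negate modifier on the first multiplication source */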
new_mul->neg[0] ^= true;
|
|
|
|
|
new_mul->clamp = false;
|
|
|
|
|
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
|
|
|
|
|
return;
|
|
|
|
|
}
|
2020-05-15 14:03:15 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* combine mul+add -> mad */
|
2020-05-15 14:03:15 +01:00
|
|
|
bool mad32 = instr->opcode == aco_opcode::v_add_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_sub_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_subrev_f32;
|
2020-05-14 21:09:36 +01:00
|
|
|
bool mad16 = instr->opcode == aco_opcode::v_add_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_sub_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_subrev_f16;
|
|
|
|
|
if (mad16 || mad32) {
|
2020-06-16 17:43:01 +01:00
|
|
|
bool need_fma = mad32 ? (block.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) :
|
2020-05-14 21:09:36 +01:00
|
|
|
(block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
|
2020-05-15 14:03:15 +01:00
|
|
|
if (need_fma && instr->definitions[0].isPrecise())
|
|
|
|
|
return;
|
2020-05-14 21:09:36 +01:00
|
|
|
if (need_fma && mad32 && !ctx.program->has_fast_fma32)
|
2020-05-15 14:03:15 +01:00
|
|
|
return;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
uint32_t uses_src0 = UINT32_MAX;
|
|
|
|
|
uint32_t uses_src1 = UINT32_MAX;
|
|
|
|
|
Instruction* mul_instr = nullptr;
|
|
|
|
|
unsigned add_op_idx;
|
|
|
|
|
/* check if any of the operands is a multiplication */
|
2020-05-15 14:03:15 +01:00
|
|
|
ssa_info *op0_info = instr->operands[0].isTemp() ? &ctx.info[instr->operands[0].tempId()] : NULL;
|
|
|
|
|
ssa_info *op1_info = instr->operands[1].isTemp() ? &ctx.info[instr->operands[1].tempId()] : NULL;
|
|
|
|
|
if (op0_info && op0_info->is_mul() && (!need_fma || !op0_info->instr->definitions[0].isPrecise()))
|
2019-09-17 13:22:17 +02:00
|
|
|
uses_src0 = ctx.uses[instr->operands[0].tempId()];
|
2020-05-15 14:03:15 +01:00
|
|
|
if (op1_info && op1_info->is_mul() && (!need_fma || !op1_info->instr->definitions[0].isPrecise()))
|
2019-09-17 13:22:17 +02:00
|
|
|
uses_src1 = ctx.uses[instr->operands[1].tempId()];
|
|
|
|
|
|
|
|
|
|
/* find the 'best' mul instruction to combine with the add */
|
|
|
|
|
if (uses_src0 < uses_src1) {
|
2020-05-15 14:03:15 +01:00
|
|
|
mul_instr = op0_info->instr;
|
2019-09-17 13:22:17 +02:00
|
|
|
add_op_idx = 1;
|
|
|
|
|
} else if (uses_src1 < uses_src0) {
|
2020-05-15 14:03:15 +01:00
|
|
|
mul_instr = op1_info->instr;
|
2019-09-17 13:22:17 +02:00
|
|
|
add_op_idx = 0;
|
|
|
|
|
} else if (uses_src0 != UINT32_MAX) {
|
|
|
|
|
/* tiebreaker: quite random what to pick */
|
2020-05-15 14:03:15 +01:00
|
|
|
if (op0_info->instr->operands[0].isLiteral()) {
|
|
|
|
|
mul_instr = op1_info->instr;
|
2019-09-17 13:22:17 +02:00
|
|
|
add_op_idx = 0;
|
|
|
|
|
} else {
|
2020-05-15 14:03:15 +01:00
|
|
|
mul_instr = op0_info->instr;
|
2019-09-17 13:22:17 +02:00
|
|
|
add_op_idx = 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (mul_instr) {
|
|
|
|
|
Operand op[3] = {Operand(v1), Operand(v1), Operand(v1)};
|
|
|
|
|
bool neg[3] = {false, false, false};
|
|
|
|
|
bool abs[3] = {false, false, false};
|
|
|
|
|
unsigned omod = 0;
|
|
|
|
|
bool clamp = false;
|
|
|
|
|
op[0] = mul_instr->operands[0];
|
|
|
|
|
op[1] = mul_instr->operands[1];
|
|
|
|
|
op[2] = instr->operands[add_op_idx];
|
2019-11-22 14:50:41 +00:00
|
|
|
// TODO: would be better to check this before selecting a mul instr?
|
|
|
|
|
if (!check_vop3_operands(ctx, 3, op))
|
|
|
|
|
return;
|
2019-12-05 14:12:39 +00:00
|
|
|
if (mul_instr->isSDWA())
|
|
|
|
|
return;
|
2019-11-22 14:50:41 +00:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
if (mul_instr->isVOP3()) {
|
|
|
|
|
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (mul_instr);
|
|
|
|
|
neg[0] = vop3->neg[0];
|
|
|
|
|
neg[1] = vop3->neg[1];
|
|
|
|
|
abs[0] = vop3->abs[0];
|
|
|
|
|
abs[1] = vop3->abs[1];
|
|
|
|
|
/* we cannot use these modifiers between mul and add */
|
|
|
|
|
if (vop3->clamp || vop3->omod)
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* convert to mad */
|
|
|
|
|
ctx.uses[mul_instr->definitions[0].tempId()]--;
|
|
|
|
|
if (ctx.uses[mul_instr->definitions[0].tempId()]) {
|
|
|
|
|
if (op[0].isTemp())
|
|
|
|
|
ctx.uses[op[0].tempId()]++;
|
|
|
|
|
if (op[1].isTemp())
|
|
|
|
|
ctx.uses[op[1].tempId()]++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->isVOP3()) {
|
|
|
|
|
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (instr.get());
|
|
|
|
|
neg[2] = vop3->neg[add_op_idx];
|
|
|
|
|
abs[2] = vop3->abs[add_op_idx];
|
|
|
|
|
omod = vop3->omod;
|
|
|
|
|
clamp = vop3->clamp;
|
|
|
|
|
/* abs of the multiplication result */
|
|
|
|
|
if (vop3->abs[1 - add_op_idx]) {
|
|
|
|
|
neg[0] = false;
|
|
|
|
|
neg[1] = false;
|
|
|
|
|
abs[0] = true;
|
|
|
|
|
abs[1] = true;
|
|
|
|
|
}
|
|
|
|
|
/* neg of the multiplication result */
|
|
|
|
|
neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx];
|
|
|
|
|
}
|
2020-05-14 21:09:36 +01:00
|
|
|
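/* fold the subtraction into the mad/fma by negating the corresponding source */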
if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
|
2019-09-17 13:22:17 +02:00
|
|
|
neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
|
2020-05-14 21:09:36 +01:00
|
|
|
else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16)
|
2019-09-17 13:22:17 +02:00
|
|
|
neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
|
|
|
|
|
|
2020-05-15 14:03:15 +01:00
|
|
|
aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
|
2020-05-14 21:09:36 +01:00
|
|
|
if (mad16)
|
|
|
|
|
mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) :
|
|
|
|
|
(ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16);
|
2020-05-15 14:03:15 +01:00
|
|
|
|
|
|
|
|
aco_ptr<VOP3A_instruction> mad{create_instruction<VOP3A_instruction>(mad_op, Format::VOP3A, 3, 1)};
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 3; i++)
|
|
|
|
|
{
|
|
|
|
|
mad->operands[i] = op[i];
|
|
|
|
|
mad->neg[i] = neg[i];
|
|
|
|
|
mad->abs[i] = abs[i];
|
|
|
|
|
}
|
|
|
|
|
mad->omod = omod;
|
|
|
|
|
mad->clamp = clamp;
|
|
|
|
|
mad->definitions[0] = instr->definitions[0];
|
|
|
|
|
|
|
|
|
|
/* mark this ssa_def to be re-checked for profitability and literals */
|
2019-11-22 15:18:38 +00:00
|
|
|
ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId());
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1);
|
|
|
|
|
instr.reset(mad.release());
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
/* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
|
|
|
|
|
else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) {
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()] == 1 &&
|
|
|
|
|
instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) {
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;
|
|
|
|
|
|
|
|
|
|
aco_ptr<VOP2_instruction> new_instr{create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
|
|
|
|
|
new_instr->operands[0] = Operand(0u);
|
|
|
|
|
new_instr->operands[1] = instr->operands[!i];
|
|
|
|
|
new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
instr.reset(new_instr.release());
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) {
|
2020-04-20 19:16:48 +01:00
|
|
|
if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ;
|
|
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ;
|
|
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ;
|
2019-09-17 13:22:17 +02:00
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ;
|
2020-04-24 00:21:46 +01:00
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_or_b32, "120", 1 | 2)) ;
|
2019-09-17 13:22:17 +02:00
|
|
|
else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_or_b32, "210", 1 | 2);
|
2020-06-04 14:36:00 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) {
|
|
|
|
|
if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2)) ;
|
|
|
|
|
else combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2);
|
2020-04-02 17:41:36 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::v_add_u32) {
|
|
|
|
|
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
|
|
|
|
|
else if (ctx.program->chip_class >= GFX9) {
|
|
|
|
|
if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
|
|
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
|
|
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
|
|
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
|
|
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ;
|
|
|
|
|
else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_add_u32, "120", 1 | 2)) ;
|
|
|
|
|
else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_add_u32, "210", 1 | 2);
|
|
|
|
|
}
|
|
|
|
|
} else if (instr->opcode == aco_opcode::v_add_co_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_add_co_u32_e64) {
|
|
|
|
|
combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2);
|
|
|
|
|
} else if (instr->opcode == aco_opcode::v_sub_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_sub_co_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_sub_co_u32_e64) {
|
|
|
|
|
combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2);
|
|
|
|
|
} else if (instr->opcode == aco_opcode::v_subrev_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_subrev_co_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
|
|
|
|
|
combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) {
|
|
|
|
|
combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2);
|
|
|
|
|
} else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) {
|
|
|
|
|
combine_salu_lshl_add(ctx, instr);
|
2019-12-16 15:35:14 +00:00
|
|
|
} else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
|
2019-09-17 13:22:17 +02:00
|
|
|
combine_salu_not_bitwise(ctx, instr);
|
2019-12-03 13:37:49 +00:00
|
|
|
} else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (combine_ordering_test(ctx, instr)) ;
|
|
|
|
|
else if (combine_comparison_ordering(ctx, instr)) ;
|
|
|
|
|
else if (combine_constant_comparison_ordering(ctx, instr)) ;
|
|
|
|
|
else combine_salu_n2(ctx, instr);
|
2020-11-03 18:50:32 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::v_and_b32) {
|
|
|
|
|
combine_and_subbrev(ctx, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
aco_opcode min, max, min3, max3, med3;
|
|
|
|
|
bool some_gfx9_only;
|
|
|
|
|
if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) &&
|
|
|
|
|
(!some_gfx9_only || ctx.program->chip_class >= GFX9)) {
|
2019-11-22 20:32:11 +00:00
|
|
|
if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, instr->opcode == min ? min3 : max3)) ;
|
2019-09-17 13:22:17 +02:00
|
|
|
else combine_clamp(ctx, instr, min, max, med3);
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-12-16 15:35:14 +00:00
|
|
|
|
|
|
|
|
/* do this after combine_salu_n2() */
|
|
|
|
|
if (instr->opcode == aco_opcode::s_andn2_b32 || instr->opcode == aco_opcode::s_andn2_b64)
|
|
|
|
|
combine_inverse_comparison(ctx, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-01-16 19:32:31 +01:00
|
|
|
bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|
|
|
|
{
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::s_and_b32:
|
|
|
|
|
case aco_opcode::s_and_b64:
|
|
|
|
|
instr->opcode = aco_opcode::s_and_b32;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_or_b32:
|
|
|
|
|
case aco_opcode::s_or_b64:
|
|
|
|
|
instr->opcode = aco_opcode::s_or_b32;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_xor_b32:
|
|
|
|
|
case aco_opcode::s_xor_b64:
|
|
|
|
|
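/* for 0/1 inputs, s_absdiff_i32 sets SCC exactly when the operands differ, which is the xor of the two booleans */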
instr->opcode = aco_opcode::s_absdiff_i32;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
/* Don't transform other instructions. They are very unlikely to appear here. */
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (Operand &op : instr->operands) {
|
|
|
|
|
ctx.uses[op.tempId()]--;
|
|
|
|
|
|
|
|
|
|
if (ctx.info[op.tempId()].is_uniform_bool()) {
|
|
|
|
|
/* Just use the uniform boolean temp. */
|
|
|
|
|
op.setTemp(ctx.info[op.tempId()].temp);
|
|
|
|
|
} else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
|
|
|
|
|
/* Use the SCC definition of the predecessor instruction.
|
|
|
|
|
* This allows the predecessor to get picked up by the same optimization (if it has no divergent users),
|
|
|
|
|
* and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed.
|
|
|
|
|
*/
|
|
|
|
|
Instruction *pred_instr = ctx.info[op.tempId()].instr;
|
|
|
|
|
assert(pred_instr->definitions.size() >= 2);
|
|
|
|
|
assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc);
|
|
|
|
|
op.setTemp(pred_instr->definitions[1].getTemp());
|
|
|
|
|
} else {
|
|
|
|
|
unreachable("Invalid operand on uniform bitwise instruction.");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ctx.uses[op.tempId()]++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
|
|
|
|
|
assert(instr->operands[0].regClass() == s1);
|
|
|
|
|
assert(instr->operands[1].regClass() == s1);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-11-02 16:44:04 +01:00
|
|
|
void select_mul_u32_u24(opt_ctx &ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->usesModifiers())
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* Only valid if the accumulator is zero (this is selected by isel to
|
|
|
|
|
* combine more v_add_u32+v_mad_u32_u16 together), but the optimizer
|
|
|
|
|
* falls back here when that's not possible.
|
|
|
|
|
*/
|
|
|
|
|
if (!instr->operands[2].constantEquals(0))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* Only valid if the upper 16-bits of both operands are zero (because
|
|
|
|
|
* v_mul_u32_u24 doesn't mask them).
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (instr->operands[i].isTemp() && !instr->operands[i].is16bit())
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool swap = false;
|
|
|
|
|
|
|
|
|
|
/* VOP2 instructions can only take constants/sgprs in operand 0. */
|
|
|
|
|
if ((instr->operands[1].isConstant() ||
|
|
|
|
|
(instr->operands[1].hasRegClass() &&
|
|
|
|
|
instr->operands[1].regClass().type() == RegType::sgpr))) {
|
|
|
|
|
swap = true;
|
|
|
|
|
if ((instr->operands[0].isConstant() ||
|
|
|
|
|
(instr->operands[0].hasRegClass() &&
|
|
|
|
|
instr->operands[0].regClass().type() == RegType::sgpr))) {
|
|
|
|
|
/* VOP2 can't take constants/sgprs in both operands; keep v_mad_u32_u16 because
|
|
|
|
|
* v_mul_u32_u24 has no advantages.
|
|
|
|
|
*/
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
VOP2_instruction *new_instr = create_instruction<VOP2_instruction>(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1);
|
|
|
|
|
new_instr->operands[0] = instr->operands[swap];
|
|
|
|
|
new_instr->operands[1] = instr->operands[!swap];
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
const uint32_t threshold = 4;
|
|
|
|
|
|
2019-12-16 13:30:10 +00:00
|
|
|
if (is_dead(ctx.uses, instr.get())) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr.reset();
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-09 21:20:10 +00:00
|
|
|
/* convert split_vector into a copy or extract_vector if only one definition is ever used */
|
2019-09-17 13:22:17 +02:00
|
|
|
if (instr->opcode == aco_opcode::p_split_vector) {
|
|
|
|
|
unsigned num_used = 0;
|
|
|
|
|
unsigned idx = 0;
|
2020-04-10 13:09:54 +01:00
|
|
|
unsigned split_offset = 0;
|
|
|
|
|
for (unsigned i = 0, offset = 0; i < instr->definitions.size(); offset += instr->definitions[i++].bytes()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.uses[instr->definitions[i].tempId()]) {
|
|
|
|
|
num_used++;
|
|
|
|
|
idx = i;
|
2020-04-10 13:09:54 +01:00
|
|
|
split_offset = offset;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2019-12-09 21:20:10 +00:00
|
|
|
bool done = false;
|
|
|
|
|
if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
|
|
|
|
|
ctx.uses[instr->operands[0].tempId()] == 1) {
|
|
|
|
|
Instruction *vec = ctx.info[instr->operands[0].tempId()].instr;
|
|
|
|
|
|
|
|
|
|
unsigned off = 0;
|
|
|
|
|
Operand op;
|
|
|
|
|
for (Operand& vec_op : vec->operands) {
|
2020-04-10 13:09:54 +01:00
|
|
|
if (off == split_offset) {
|
2019-12-09 21:20:10 +00:00
|
|
|
op = vec_op;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-04-10 13:09:54 +01:00
|
|
|
off += vec_op.bytes();
|
2019-12-09 21:20:10 +00:00
|
|
|
}
|
2020-04-10 13:09:54 +01:00
|
|
|
if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
|
2019-12-09 21:20:10 +00:00
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
|
|
|
|
for (Operand& vec_op : vec->operands) {
|
|
|
|
|
if (vec_op.isTemp())
|
|
|
|
|
ctx.uses[vec_op.tempId()]--;
|
|
|
|
|
}
|
|
|
|
|
if (op.isTemp())
|
|
|
|
|
ctx.uses[op.tempId()]++;
|
|
|
|
|
|
|
|
|
|
aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
|
|
|
|
|
extract->operands[0] = op;
|
|
|
|
|
extract->definitions[0] = instr->definitions[idx];
|
|
|
|
|
instr.reset(extract.release());
|
|
|
|
|
|
|
|
|
|
done = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-10 13:09:54 +01:00
|
|
|
if (!done && num_used == 1 &&
|
|
|
|
|
instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
|
|
|
|
|
split_offset % instr->definitions[idx].bytes() == 0) {
|
2019-09-17 13:22:17 +02:00
|
|
|
aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
|
|
|
|
|
extract->operands[0] = instr->operands[0];
|
2020-04-10 13:09:54 +01:00
|
|
|
extract->operands[1] = Operand((uint32_t) split_offset / instr->definitions[idx].bytes());
|
2019-09-17 13:22:17 +02:00
|
|
|
extract->definitions[0] = instr->definitions[idx];
|
|
|
|
|
instr.reset(extract.release());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-22 15:18:38 +00:00
|
|
|
mad_info* mad_info = NULL;
|
2020-05-15 14:03:15 +01:00
|
|
|
if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
|
2020-06-01 11:27:53 +01:00
|
|
|
mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
|
2019-11-22 15:18:38 +00:00
|
|
|
/* re-check mad instructions */
|
|
|
|
|
if (ctx.uses[mad_info->mul_temp_id]) {
|
|
|
|
|
ctx.uses[mad_info->mul_temp_id]++;
|
2019-11-20 19:09:25 +00:00
|
|
|
if (instr->operands[0].isTemp())
|
|
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
|
|
|
|
if (instr->operands[1].isTemp())
|
|
|
|
|
ctx.uses[instr->operands[1].tempId()]--;
|
2019-11-22 15:18:38 +00:00
|
|
|
instr.swap(mad_info->add_instr);
|
|
|
|
|
mad_info = NULL;
|
|
|
|
|
}
|
|
|
|
|
/* check literals */
|
|
|
|
|
else if (!instr->usesModifiers()) {
|
2020-05-15 14:03:15 +01:00
|
|
|
/* FMA can only take literals on GFX10+ */
|
2020-05-14 21:09:36 +01:00
|
|
|
if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
|
|
|
|
|
ctx.program->chip_class < GFX10)
|
2020-05-15 14:03:15 +01:00
|
|
|
return;
|
|
|
|
|
|
2019-11-22 15:18:38 +00:00
|
|
|
bool sgpr_used = false;
|
2019-09-17 13:22:17 +02:00
|
|
|
uint32_t literal_idx = 0;
|
|
|
|
|
uint32_t literal_uses = UINT32_MAX;
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++)
|
|
|
|
|
{
|
2019-11-22 15:18:38 +00:00
|
|
|
if (instr->operands[i].isConstant() && i > 0) {
|
|
|
|
|
literal_uses = UINT32_MAX;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!instr->operands[i].isTemp())
|
|
|
|
|
continue;
|
2020-05-15 16:28:03 +01:00
|
|
|
unsigned bits = get_operand_size(instr, i);
|
2019-11-22 15:18:38 +00:00
|
|
|
/* if one of the operands is an sgpr (and it's not the 1st operand, or we're on pre-GFX10), we cannot add a literal somewhere else */
|
|
|
|
|
if (instr->operands[i].getTemp().type() == RegType::sgpr && (i > 0 || ctx.program->chip_class < GFX10)) {
|
2020-05-15 16:28:03 +01:00
|
|
|
if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
literal_uses = ctx.uses[instr->operands[i].tempId()];
|
|
|
|
|
literal_idx = i;
|
|
|
|
|
} else {
|
|
|
|
|
literal_uses = UINT32_MAX;
|
|
|
|
|
}
|
2019-11-22 15:18:38 +00:00
|
|
|
sgpr_used = true;
|
|
|
|
|
/* don't break because we still need to check constants */
|
|
|
|
|
} else if (!sgpr_used &&
|
2020-05-15 16:28:03 +01:00
|
|
|
ctx.info[instr->operands[i].tempId()].is_literal(bits) &&
|
2019-11-22 15:18:38 +00:00
|
|
|
ctx.uses[instr->operands[i].tempId()] < literal_uses) {
|
2019-09-17 13:22:17 +02:00
|
|
|
literal_uses = ctx.uses[instr->operands[i].tempId()];
|
|
|
|
|
literal_idx = i;
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-04-01 18:09:43 +02:00
|
|
|
|
|
|
|
|
/* Limit the number of literals to apply to not increase the code
|
|
|
|
|
* size too much, but always apply literals for v_mad->v_madak
|
|
|
|
|
* because both instructions are 64-bit and this doesn't increase
|
|
|
|
|
* code size.
|
|
|
|
|
* TODO: try to apply the literals earlier to lower the number of
|
|
|
|
|
* uses below threshold
|
|
|
|
|
*/
|
|
|
|
|
if (literal_uses < threshold || literal_idx == 2) {
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[instr->operands[literal_idx].tempId()]--;
|
2019-11-22 15:18:38 +00:00
|
|
|
mad_info->check_literal = true;
|
|
|
|
|
mad_info->literal_idx = literal_idx;
|
|
|
|
|
return;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-01-16 19:32:31 +01:00
|
|
|
/* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */
|
|
|
|
|
if (instr->format == Format::PSEUDO_BRANCH &&
|
|
|
|
|
instr->operands.size() &&
|
|
|
|
|
instr->operands[0].isTemp()) {
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].set_scc_needed();
|
|
|
|
|
return;
|
|
|
|
|
} else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_cselect_b32) &&
|
|
|
|
|
instr->operands[2].isTemp()) {
|
|
|
|
|
ctx.info[instr->operands[2].tempId()].set_scc_needed();
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* check for literals */
|
2019-11-22 13:43:39 +00:00
|
|
|
if (!instr->isSALU() && !instr->isVALU())
|
|
|
|
|
return;
|
|
|
|
|
|
2020-01-16 19:32:31 +01:00
|
|
|
/* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
|
|
|
|
|
if (instr->definitions.size() &&
|
|
|
|
|
ctx.uses[instr->definitions[0].tempId()] == 0 &&
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
|
|
|
|
|
bool transform_done = to_uniform_bool_instr(ctx, instr);
|
|
|
|
|
|
|
|
|
|
if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
|
|
|
|
|
/* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */
|
|
|
|
|
uint32_t def0_id = instr->definitions[0].getTemp().id();
|
|
|
|
|
uint32_t def1_id = instr->definitions[1].getTemp().id();
|
|
|
|
|
instr->definitions[0].setTemp(Temp(def1_id, s1));
|
|
|
|
|
instr->definitions[1].setTemp(Temp(def0_id, s1));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-02 16:44:04 +01:00
|
|
|
if (instr->opcode == aco_opcode::v_mad_u32_u16)
|
|
|
|
|
select_mul_u32_u24(ctx, instr);
|
|
|
|
|
|
2019-11-20 16:42:17 +00:00
|
|
|
if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
|
2019-11-22 13:43:39 +00:00
|
|
|
return; /* some encodings can't ever take literals */
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* we do not apply the literals yet as we don't know if it is profitable */
|
2019-11-22 13:43:39 +00:00
|
|
|
Operand current_literal(s1);
|
|
|
|
|
|
|
|
|
|
unsigned literal_id = 0;
|
|
|
|
|
unsigned literal_uses = UINT32_MAX;
|
|
|
|
|
Operand literal(s1);
|
2019-11-20 16:42:17 +00:00
|
|
|
unsigned num_operands = 1;
|
|
|
|
|
if (instr->isSALU() || (ctx.program->chip_class >= GFX10 && can_use_VOP3(ctx, instr)))
|
|
|
|
|
num_operands = instr->operands.size();
|
2020-01-24 17:37:11 +00:00
|
|
|
/* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
|
|
|
|
|
else if (instr->isVALU() && instr->operands.size() >= 3)
|
|
|
|
|
return;
|
2019-11-22 13:43:39 +00:00
|
|
|
|
|
|
|
|
unsigned sgpr_ids[2] = {0, 0};
|
|
|
|
|
bool is_literal_sgpr = false;
|
|
|
|
|
uint32_t mask = 0;
|
|
|
|
|
|
|
|
|
|
/* choose a literal to apply */
|
|
|
|
|
for (unsigned i = 0; i < num_operands; i++) {
|
|
|
|
|
Operand op = instr->operands[i];
|
2020-05-15 16:28:03 +01:00
|
|
|
unsigned bits = get_operand_size(instr, i);
|
2020-01-23 20:03:40 +00:00
|
|
|
|
|
|
|
|
if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
|
|
|
|
|
op.tempId() != sgpr_ids[0])
|
|
|
|
|
sgpr_ids[!!sgpr_ids[0]] = op.tempId();
|
|
|
|
|
|
2019-11-22 13:43:39 +00:00
|
|
|
if (op.isLiteral()) {
|
|
|
|
|
current_literal = op;
|
|
|
|
|
continue;
|
2020-05-15 16:28:03 +01:00
|
|
|
} else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
|
2019-11-22 13:43:39 +00:00
|
|
|
continue;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-11-22 13:43:39 +00:00
|
|
|
|
2020-01-16 16:54:35 +01:00
|
|
|
if (!alu_can_accept_constant(instr->opcode, i))
|
2019-11-22 13:43:39 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (ctx.uses[op.tempId()] < literal_uses) {
|
|
|
|
|
is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
|
|
|
|
|
mask = 0;
|
|
|
|
|
literal = Operand(ctx.info[op.tempId()].val);
|
|
|
|
|
literal_uses = ctx.uses[op.tempId()];
|
|
|
|
|
literal_id = op.tempId();
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-11-22 13:43:39 +00:00
|
|
|
|
|
|
|
|
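/* remember in which operand positions the chosen literal appears */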
mask |= (op.tempId() == literal_id) << i;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2019-11-22 13:43:39 +00:00
|
|
|
|
|
|
|
|
/* don't go over the constant bus limit */
|
2019-11-20 16:42:17 +00:00
|
|
|
bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_lshrrev_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_ashrrev_i64;
|
2019-11-22 13:43:39 +00:00
|
|
|
unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
|
2019-11-20 16:42:17 +00:00
|
|
|
if (ctx.program->chip_class >= GFX10 && !is_shift64)
|
|
|
|
|
const_bus_limit = 2;
|
|
|
|
|
|
2019-11-22 13:43:39 +00:00
|
|
|
unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
|
|
|
|
|
if (num_sgprs == const_bus_limit && !is_literal_sgpr)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (literal_id && literal_uses < threshold &&
|
|
|
|
|
(current_literal.isUndefined() ||
|
|
|
|
|
(current_literal.size() == literal.size() &&
|
|
|
|
|
current_literal.constantValue() == literal.constantValue()))) {
|
|
|
|
|
/* mark the literal to be applied */
|
|
|
|
|
while (mask) {
|
|
|
|
|
unsigned i = u_bit_scan(&mask);
|
|
|
|
|
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void apply_literals(opt_ctx &ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
/* Cleanup dead instructions */
|
|
|
|
|
if (!instr)
|
|
|
|
|
return;
|
|
|
|
|
|
2019-11-22 13:43:39 +00:00
|
|
|
/* apply literals on MAD */
|
2020-05-15 14:03:15 +01:00
|
|
|
if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
|
2020-06-01 11:27:53 +01:00
|
|
|
mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
|
2020-04-01 18:09:43 +02:00
|
|
|
if (info->check_literal &&
|
|
|
|
|
(ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {
|
2019-11-22 13:43:39 +00:00
|
|
|
aco_ptr<Instruction> new_mad;
|
2020-05-15 14:03:15 +01:00
|
|
|
|
|
|
|
|
aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
|
|
|
|
|
if (instr->opcode == aco_opcode::v_fma_f32)
|
|
|
|
|
new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
|
2020-05-14 21:09:36 +01:00
|
|
|
else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16)
|
|
|
|
|
new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
|
|
|
|
|
else if (instr->opcode == aco_opcode::v_fma_f16)
|
|
|
|
|
new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
|
2020-05-15 14:03:15 +01:00
|
|
|
|
|
|
|
|
new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
|
2019-11-22 15:18:38 +00:00
|
|
|
if (info->literal_idx == 2) { /* add literal -> madak */
|
|
|
|
|
new_mad->operands[0] = instr->operands[0];
|
|
|
|
|
new_mad->operands[1] = instr->operands[1];
|
|
|
|
|
} else { /* mul literal -> madmk */
|
|
|
|
|
new_mad->operands[0] = instr->operands[1 - info->literal_idx];
|
|
|
|
|
new_mad->operands[1] = instr->operands[2];
|
2019-11-22 13:43:39 +00:00
|
|
|
}
|
2019-11-22 15:18:38 +00:00
|
|
|
new_mad->operands[2] = Operand(ctx.info[instr->operands[info->literal_idx].tempId()].val);
|
|
|
|
|
new_mad->definitions[0] = instr->definitions[0];
|
|
|
|
|
ctx.instructions.emplace_back(std::move(new_mad));
|
|
|
|
|
return;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-22 15:18:38 +00:00
|
|
|
/* apply literals on other SALU/VALU */
|
|
|
|
|
if (instr->isSALU() || instr->isVALU()) {
|
2019-11-22 13:43:39 +00:00
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
Operand op = instr->operands[i];
|
2020-05-15 16:28:03 +01:00
|
|
|
unsigned bits = get_operand_size(instr, i);
|
|
|
|
|
if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
|
2019-11-22 13:43:39 +00:00
|
|
|
Operand literal(ctx.info[op.tempId()].val);
|
|
|
|
|
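/* VALU encodings only accept a literal in the first source, so later operands need the VOP3 encoding */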
if (instr->isVALU() && i > 0)
|
|
|
|
|
to_VOP3(ctx, instr);
|
|
|
|
|
instr->operands[i] = literal;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ctx.instructions.emplace_back(std::move(instr));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void optimize(Program* program)
|
|
|
|
|
{
|
|
|
|
|
opt_ctx ctx;
|
|
|
|
|
ctx.program = program;
|
|
|
|
|
std::vector<ssa_info> info(program->peekAllocationId());
|
|
|
|
|
ctx.info = info.data();
|
|
|
|
|
|
|
|
|
|
/* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
|
|
|
|
|
for (Block& block : program->blocks) {
|
|
|
|
|
for (aco_ptr<Instruction>& instr : block.instructions)
|
2019-11-09 20:51:45 +00:00
|
|
|
label_instruction(ctx, block, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-05-22 12:52:05 +02:00
|
|
|
ctx.uses = dead_code_analysis(program);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
|
|
|
|
|
for (Block& block : program->blocks) {
|
|
|
|
|
for (aco_ptr<Instruction>& instr : block.instructions)
|
2019-11-09 20:51:45 +00:00
|
|
|
combine_instruction(ctx, block, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
|
|
|
|
|
for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) {
|
|
|
|
|
Block* block = &(*it);
|
|
|
|
|
for (std::vector<aco_ptr<Instruction>>::reverse_iterator it = block->instructions.rbegin(); it != block->instructions.rend(); ++it)
|
|
|
|
|
select_instruction(ctx, *it);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 4. Add literals to instructions */
|
|
|
|
|
for (Block& block : program->blocks) {
|
|
|
|
|
ctx.instructions.clear();
|
|
|
|
|
for (aco_ptr<Instruction>& instr : block.instructions)
|
|
|
|
|
apply_literals(ctx, instr);
|
|
|
|
|
block.instructions.swap(ctx.instructions);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|