2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2018 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
2020-06-30 15:33:18 +01:00
|
|
|
#include "aco_builder.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
#include "aco_ir.h"
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
#include "util/half_float.h"
|
2020-08-04 10:58:11 -07:00
|
|
|
#include "util/memstream.h"
|
2021-06-09 15:40:03 +02:00
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
|
#include <array>
|
|
|
|
|
#include <vector>
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
namespace aco {
|
|
|
|
|
|
2020-08-14 10:42:27 +02:00
|
|
|
#ifndef NDEBUG
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
perfwarn(Program* program, bool cond, const char* msg, Instruction* instr)
|
2020-08-14 10:42:27 +02:00
|
|
|
{
|
|
|
|
|
if (cond) {
|
2021-06-09 10:14:54 +02:00
|
|
|
char* out;
|
2020-08-14 10:42:27 +02:00
|
|
|
size_t outsize;
|
2020-08-04 10:58:11 -07:00
|
|
|
struct u_memstream mem;
|
|
|
|
|
u_memstream_open(&mem, &out, &outsize);
|
2021-06-09 10:14:54 +02:00
|
|
|
FILE* const memf = u_memstream_get(&mem);
|
2020-08-14 10:42:27 +02:00
|
|
|
|
|
|
|
|
fprintf(memf, "%s: ", msg);
|
2022-07-21 15:17:24 +01:00
|
|
|
aco_print_instr(program->gfx_level, instr, memf);
|
2020-08-04 10:58:11 -07:00
|
|
|
u_memstream_close(&mem);
|
2020-08-14 10:42:27 +02:00
|
|
|
|
|
|
|
|
aco_perfwarn(program, out);
|
|
|
|
|
free(out);
|
|
|
|
|
|
|
|
|
|
if (debug_flags & DEBUG_PERFWARN)
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/**
|
|
|
|
|
* The optimizer works in 4 phases:
|
|
|
|
|
* (1) The first pass collects information for each ssa-def,
|
|
|
|
|
* propagates reg->reg operands of the same type, inline constants
|
|
|
|
|
* and neg/abs input modifiers.
|
|
|
|
|
* (2) The second pass combines instructions like mad, omod, clamp and
|
|
|
|
|
* propagates sgpr's on VALU instructions.
|
|
|
|
|
* This pass depends on information collected in the first pass.
|
|
|
|
|
* (3) The third pass goes backwards, and selects instructions,
|
|
|
|
|
* i.e. decides if a mad instruction is profitable and eliminates dead code.
|
|
|
|
|
* (4) The fourth pass cleans up the sequence: literals get applied and dead
|
|
|
|
|
* instructions are removed from the sequence.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
struct mad_info {
|
|
|
|
|
aco_ptr<Instruction> add_instr;
|
|
|
|
|
uint32_t mul_temp_id;
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
uint16_t literal_mask;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-11-22 15:18:38 +00:00
|
|
|
mad_info(aco_ptr<Instruction> instr, uint32_t id)
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
: add_instr(std::move(instr)), mul_temp_id(id), literal_mask(0)
|
2021-06-09 10:14:54 +02:00
|
|
|
{}
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Bit flags stored in ssa_info::label. Every enumerator is a distinct single
 * bit; bits 11, 13 and 25 are currently unassigned. All shifts are done on a
 * 64-bit one so that no value is ever produced by shifting a signed int. */
enum Label {
   label_vec = 1ull << 0,
   label_constant_32bit = 1ull << 1,
   /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
    * 32-bit operations but this shouldn't cause any issues because we don't
    * look through any conversions */
   label_abs = 1ull << 2,
   label_neg = 1ull << 3,
   label_mul = 1ull << 4,
   label_temp = 1ull << 5,
   label_literal = 1ull << 6,
   label_mad = 1ull << 7,
   label_omod2 = 1ull << 8,
   label_omod4 = 1ull << 9,
   label_omod5 = 1ull << 10,
   label_clamp = 1ull << 12,
   label_undefined = 1ull << 14,
   label_vcc = 1ull << 15,
   label_b2f = 1ull << 16,
   label_add_sub = 1ull << 17,
   label_bitwise = 1ull << 18,
   label_minmax = 1ull << 19,
   label_vopc = 1ull << 20,
   label_uniform_bool = 1ull << 21,
   label_constant_64bit = 1ull << 22,
   label_uniform_bitwise = 1ull << 23,
   label_scc_invert = 1ull << 24,
   label_scc_needed = 1ull << 26,
   label_b2i = 1ull << 27,
   label_fcanonicalize = 1ull << 28,
   label_constant_16bit = 1ull << 29,
   label_usedef = 1ull << 30, /* generic label */
   label_vop3p = 1ull << 31,  /* unsigned shift, so bit 31 is not sign-extended */
   label_canonicalized = 1ull << 32,
   label_extract = 1ull << 33,
   label_insert = 1ull << 34,
   label_dpp16 = 1ull << 35,
   label_dpp8 = 1ull << 36,
   label_f2f32 = 1ull << 37,
   label_f2f16 = 1ull << 38,
   label_split = 1ull << 39,
};
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
static constexpr uint64_t instr_usedef_labels =
|
|
|
|
|
label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise |
|
2021-11-29 00:12:04 +09:00
|
|
|
label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 |
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
label_dpp8 | label_f2f32;
|
2021-06-09 10:14:54 +02:00
|
|
|
static constexpr uint64_t instr_mod_labels =
|
2022-01-17 16:52:10 +00:00
|
|
|
label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16;
|
2020-08-12 15:58:32 +01:00
|
|
|
|
2022-03-16 15:14:29 +01:00
|
|
|
static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels | label_split;
|
2021-06-09 10:14:54 +02:00
|
|
|
static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f |
|
|
|
|
|
label_uniform_bool | label_scc_invert | label_b2i |
|
|
|
|
|
label_fcanonicalize;
|
|
|
|
|
static constexpr uint32_t val_labels =
|
|
|
|
|
label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 13:52:55 +01:00
|
|
|
static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect");
|
|
|
|
|
static_assert((instr_labels & val_labels) == 0, "labels cannot intersect");
|
|
|
|
|
static_assert((temp_labels & val_labels) == 0, "labels cannot intersect");
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
struct ssa_info {
|
2020-06-01 11:27:53 +01:00
|
|
|
uint64_t label;
|
2019-09-17 13:22:17 +02:00
|
|
|
union {
|
2020-06-01 11:27:53 +01:00
|
|
|
uint32_t val;
|
2019-09-17 13:22:17 +02:00
|
|
|
Temp temp;
|
|
|
|
|
Instruction* instr;
|
|
|
|
|
};
|
|
|
|
|
|
2020-04-06 14:08:39 +01:00
|
|
|
/* Starts with no labels set; the union payload is left uninitialized. */
ssa_info() : label{0} {}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* Sets `new_label` on this value after dropping every existing label that
 * cannot coexist with it. `instr`, `temp` and `val` share one union, so a
 * label of one payload kind invalidates the labels of the other two kinds.
 *
 * Labels outside instr_labels/temp_labels/val_labels are never cleared here.
 * The previous implementation built the mask for value labels in 32 bits
 * (`~val_labels | const_labels`), which zero-extended and wiped all label
 * bits >= 32, including label_canonicalized; it also carried an `else if`
 * branch that could never run because it tested the same bit set again.
 */
void add_label(Label new_label)
{
   /* Since all the instr_usedef_labels use instr for the same thing
    * (indicating the defining instruction), there is usually no need to
    * clear any other instr labels. */
   if (new_label & instr_usedef_labels)
      label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */

   if (new_label & instr_mod_labels)
      label &= ~(instr_labels | temp_labels | val_labels); /* instr, temp and val alias */

   if (new_label & temp_labels)
      label &= ~(temp_labels | instr_labels | val_labels); /* instr, temp and val alias */

   /* Value labels may accumulate (literal + 16/32/64-bit constant), so only
    * the labels of the other payload kinds are removed. */
   if (new_label & val_labels)
      label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */

   label |= new_label;
}
|
|
|
|
|
|
|
|
|
|
/* Adds label_vec and stores `vec` as this value's instruction. */
void set_vec(Instruction* vec)
{
   add_label(label_vec);
   this->instr = vec;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_vec is set. */
bool is_vec()
{
   return (label & label_vec) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
/* Labels this value as the constant `constant`.
 *
 * label_literal is always added and `val` receives the low 32 bits of
 * `constant`. The 16-, 32- and 64-bit constant labels are added on top when
 * the respective Operand helpers report that the value does not need a
 * literal at that size. If the 64-bit label ends up set, `val` is replaced by
 * the 64-bit operand's constantValue(); when that differs from `constant`,
 * the literal/16-bit/32-bit labels are removed again.
 */
void set_constant(amd_gfx_level gfx_level, uint64_t constant)
{
   Operand half_op = Operand::c16(constant);
   Operand word_op = Operand::get_const(gfx_level, constant, 4);

   add_label(label_literal);
   val = constant;

   /* check that no upper bits are lost in case of packed 16bit constants */
   const bool usable_as_16bit = gfx_level >= GFX8 && !half_op.isLiteral() &&
                                half_op.constantValue16(true) == ((constant >> 16) & 0xffff);
   if (usable_as_16bit)
      add_label(label_constant_16bit);

   if (!word_op.isLiteral())
      add_label(label_constant_32bit);

   if (Operand::is_constant_representable(constant, 8))
      add_label(label_constant_64bit);

   if (!(label & label_constant_64bit))
      return;

   val = Operand::c64(constant).constantValue();
   if (val != constant)
      label &= ~(label_literal | label_constant_16bit | label_constant_32bit);
}
|
|
|
|
|
|
|
|
|
|
bool is_constant(unsigned bits)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2020-05-15 16:28:03 +01:00
|
|
|
switch (bits) {
|
2021-06-09 10:14:54 +02:00
|
|
|
case 8: return label & label_literal;
|
|
|
|
|
case 16: return label & label_constant_16bit;
|
|
|
|
|
case 32: return label & label_constant_32bit;
|
|
|
|
|
case 64: return label & label_constant_64bit;
|
2020-05-15 16:28:03 +01:00
|
|
|
}
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-05-15 16:28:03 +01:00
|
|
|
bool is_literal(unsigned bits)
|
2019-11-13 11:14:51 +01:00
|
|
|
{
|
2020-05-15 16:28:03 +01:00
|
|
|
bool is_lit = label & label_literal;
|
|
|
|
|
switch (bits) {
|
2021-06-09 10:14:54 +02:00
|
|
|
case 8: return false;
|
|
|
|
|
case 16: return is_lit && ~(label & label_constant_16bit);
|
|
|
|
|
case 32: return is_lit && ~(label & label_constant_32bit);
|
|
|
|
|
case 64: return false;
|
2020-05-15 16:28:03 +01:00
|
|
|
}
|
|
|
|
|
return false;
|
2019-11-13 11:14:51 +01:00
|
|
|
}
|
|
|
|
|
|
2020-05-15 16:28:03 +01:00
|
|
|
bool is_constant_or_literal(unsigned bits)
|
2019-11-13 11:14:51 +01:00
|
|
|
{
|
2020-05-15 16:28:03 +01:00
|
|
|
if (bits == 64)
|
|
|
|
|
return label & label_constant_64bit;
|
|
|
|
|
else
|
|
|
|
|
return label & label_literal;
|
2019-11-13 11:14:51 +01:00
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* Adds label_abs and stores `abs_temp` in the temp payload. */
void set_abs(Temp abs_temp)
{
   add_label(label_abs);
   this->temp = abs_temp;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_abs is set. */
bool is_abs()
{
   return (label & label_abs) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* Adds label_neg and stores `neg_temp` in the temp payload. */
void set_neg(Temp neg_temp)
{
   add_label(label_neg);
   this->temp = neg_temp;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_neg is set. */
bool is_neg()
{
   return (label & label_neg) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* Adds label_abs and label_neg together and stores `neg_abs_temp` in the
 * temp payload. */
void set_neg_abs(Temp neg_abs_temp)
{
   const uint32_t both =
      static_cast<uint32_t>(label_abs) | static_cast<uint32_t>(label_neg);
   add_label(static_cast<Label>(both));
   this->temp = neg_abs_temp;
}
|
|
|
|
|
|
|
|
|
|
/* Adds label_mul and stores `mul` as this value's instruction. */
void set_mul(Instruction* mul)
{
   add_label(label_mul);
   this->instr = mul;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_mul is set. */
bool is_mul()
{
   return (label & label_mul) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* Adds label_temp and stores `tmp` in the temp payload. */
void set_temp(Temp tmp)
{
   add_label(label_temp);
   this->temp = tmp;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_temp is set. */
bool is_temp()
{
   return (label & label_temp) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* Adds label_mad, writes `mad_info_idx` into mad->pass_flags and stores `mad`
 * as this value's instruction. */
void set_mad(Instruction* mad, uint32_t mad_info_idx)
{
   add_label(label_mad);
   mad->pass_flags = mad_info_idx;
   this->instr = mad;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_mad is set. */
bool is_mad()
{
   return (label & label_mad) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
/* Adds label_omod2 and stores `mul` as this value's instruction. */
void set_omod2(Instruction* mul)
{
   add_label(label_omod2);
   this->instr = mul;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_omod2 is set. */
bool is_omod2()
{
   return (label & label_omod2) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
/* Adds label_omod4 and stores `mul` as this value's instruction. */
void set_omod4(Instruction* mul)
{
   add_label(label_omod4);
   this->instr = mul;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_omod4 is set. */
bool is_omod4()
{
   return (label & label_omod4) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
/* Adds label_omod5 and stores `mul` as this value's instruction. */
void set_omod5(Instruction* mul)
{
   add_label(label_omod5);
   this->instr = mul;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_omod5 is set. */
bool is_omod5()
{
   return (label & label_omod5) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_clamp and stores `med3` as this value's instruction. */
void set_clamp(Instruction* med3)
{
   add_label(label_clamp);
   this->instr = med3;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_clamp is set. */
bool is_clamp()
{
   return (label & label_clamp) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-01-17 16:52:10 +00:00
|
|
|
/* Adds label_f2f16 and stores `conv` as this value's instruction. */
void set_f2f16(Instruction* conv)
{
   add_label(label_f2f16);
   this->instr = conv;
}
|
|
|
|
|
|
|
|
|
|
/* True when label_f2f16 is set. */
bool is_f2f16()
{
   return (label & label_f2f16) != 0;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_undefined; no payload is stored. */
void set_undefined()
{
   add_label(label_undefined);
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_undefined is set. */
bool is_undefined()
{
   return (label & label_undefined) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-11-03 14:40:05 +01:00
|
|
|
/* Adds label_vcc and stores `vcc_val` in the temp payload. */
void set_vcc(Temp vcc_val)
{
   add_label(label_vcc);
   this->temp = vcc_val;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_vcc is set. */
bool is_vcc()
{
   return (label & label_vcc) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-11-03 14:40:05 +01:00
|
|
|
/* Adds label_b2f and stores `b2f_val` in the temp payload. */
void set_b2f(Temp b2f_val)
{
   add_label(label_b2f);
   this->temp = b2f_val;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_b2f is set. */
bool is_b2f()
{
   return (label & label_b2f) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_add_sub and stores `add_sub_instr` as this value's instruction. */
void set_add_sub(Instruction* add_sub_instr)
{
   add_label(label_add_sub);
   this->instr = add_sub_instr;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_add_sub is set. */
bool is_add_sub()
{
   return (label & label_add_sub) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_bitwise and stores `bitwise_instr` as this value's instruction. */
void set_bitwise(Instruction* bitwise_instr)
{
   add_label(label_bitwise);
   this->instr = bitwise_instr;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_bitwise is set. */
bool is_bitwise()
{
   return (label & label_bitwise) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_uniform_bitwise; no payload is written by this setter. */
void set_uniform_bitwise()
{
   add_label(label_uniform_bitwise);
}
|
2020-01-03 10:30:04 +01:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_uniform_bitwise is set. */
bool is_uniform_bitwise()
{
   return (label & label_uniform_bitwise) != 0;
}
|
2020-01-03 10:30:04 +01:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_minmax and stores `minmax_instr` as this value's instruction. */
void set_minmax(Instruction* minmax_instr)
{
   add_label(label_minmax);
   this->instr = minmax_instr;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_minmax is set. */
bool is_minmax()
{
   return (label & label_minmax) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_vopc and stores `vopc_instr` as this value's instruction. */
void set_vopc(Instruction* vopc_instr)
{
   add_label(label_vopc);
   this->instr = vopc_instr;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_vopc is set. */
bool is_vopc()
{
   return (label & label_vopc) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_scc_needed; no payload is stored. */
void set_scc_needed()
{
   add_label(label_scc_needed);
}
|
2020-01-16 19:32:31 +01:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* True when label_scc_needed is set. */
bool is_scc_needed()
{
   return (label & label_scc_needed) != 0;
}
|
2020-01-16 19:32:31 +01:00
|
|
|
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
/* Adds label_scc_invert to this entry and stores scc_inv in `temp`. */
void set_scc_invert(Temp scc_inv)
{
   add_label(label_scc_invert);
   temp = scc_inv;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_scc_invert is set in `label`. */
bool is_scc_invert()
{
   return (label & label_scc_invert) != 0;
}
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
|
2019-11-05 11:41:00 +01:00
|
|
|
/* Adds label_uniform_bool to this entry and stores uniform_bool in `temp`. */
void set_uniform_bool(Temp uniform_bool)
{
   add_label(label_uniform_bool);
   temp = uniform_bool;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_uniform_bool is set in `label`. */
bool is_uniform_bool()
{
   return (label & label_uniform_bool) != 0;
}
|
2019-11-05 11:41:00 +01:00
|
|
|
|
2020-11-03 14:40:05 +01:00
|
|
|
/* Adds label_b2i to this entry and stores b2i_val in `temp`. */
void set_b2i(Temp b2i_val)
{
   add_label(label_b2i);
   temp = b2i_val;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_b2i is set in `label`. */
bool is_b2i()
{
   return (label & label_b2i) != 0;
}
|
2020-04-02 17:41:36 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_usedef to this entry and stores label_instr in `instr`. */
void set_usedef(Instruction* label_instr)
{
   add_label(label_usedef);
   instr = label_instr;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_usedef is set in `label`. */
bool is_usedef()
{
   return (label & label_usedef) != 0;
}
|
2020-09-03 12:02:55 +01:00
|
|
|
|
|
|
|
|
/* Adds label_vop3p to this entry and stores vop3p_instr in `instr`. */
void set_vop3p(Instruction* vop3p_instr)
{
   add_label(label_vop3p);
   instr = vop3p_instr;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_vop3p is set in `label`. */
bool is_vop3p()
{
   return (label & label_vop3p) != 0;
}
|
2020-06-17 15:02:30 +01:00
|
|
|
|
|
|
|
|
/* Adds label_fcanonicalize to this entry and stores tmp in `temp`. */
void set_fcanonicalize(Temp tmp)
{
   add_label(label_fcanonicalize);
   temp = tmp;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_fcanonicalize is set in `label`. */
bool is_fcanonicalize()
{
   return (label & label_fcanonicalize) != 0;
}
|
2020-06-17 15:02:30 +01:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_canonicalized to this entry; no payload is stored. */
void set_canonicalized()
{
   add_label(label_canonicalized);
}
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_canonicalized is set in `label`. */
bool is_canonicalized()
{
   return (label & label_canonicalized) != 0;
}
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
/* Adds label_f2f32 to this entry and stores cvt in `instr`. */
void set_f2f32(Instruction* cvt)
{
   add_label(label_f2f32);
   instr = cvt;
}
|
|
|
|
|
|
|
|
|
|
/* Reports whether label_f2f32 is set in `label`. */
bool is_f2f32()
{
   return (label & label_f2f32) != 0;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_extract to this entry and stores extract in `instr`. */
void set_extract(Instruction* extract)
{
   add_label(label_extract);
   instr = extract;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_extract is set in `label`. */
bool is_extract()
{
   return (label & label_extract) != 0;
}
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Adds label_insert to this entry and stores insert in `instr`. */
void set_insert(Instruction* insert)
{
   add_label(label_insert);
   instr = insert;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Reports whether label_insert is set in `label`. */
bool is_insert()
{
   return (label & label_insert) != 0;
}
|
2020-06-30 15:33:18 +01:00
|
|
|
|
2021-11-29 00:12:04 +09:00
|
|
|
/* Adds label_dpp16 to this entry and stores mov in `instr`. */
void set_dpp16(Instruction* mov)
{
   add_label(label_dpp16);
   instr = mov;
}
|
|
|
|
|
|
2021-11-29 00:12:04 +09:00
|
|
|
/* Adds label_dpp8 to this entry and stores mov in `instr`. */
void set_dpp8(Instruction* mov)
{
   add_label(label_dpp8);
   instr = mov;
}
|
|
|
|
|
|
|
|
|
|
/* Reports whether either label_dpp16 or label_dpp8 is set in `label`. */
bool is_dpp()
{
   return (label & (label_dpp16 | label_dpp8)) != 0;
}
|
|
|
|
|
/* Reports whether label_dpp16 is set in `label`. */
bool is_dpp16()
{
   return (label & label_dpp16) != 0;
}
|
|
|
|
|
/* Reports whether label_dpp8 is set in `label`. */
bool is_dpp8()
{
   return (label & label_dpp8) != 0;
}
|
2022-03-16 15:14:29 +01:00
|
|
|
|
|
|
|
|
/* Adds label_split to this entry and stores split in `instr`. */
void set_split(Instruction* split)
{
   add_label(label_split);
   instr = split;
}
|
|
|
|
|
|
|
|
|
|
/* Reports whether label_split is set in `label`. */
bool is_split()
{
   return (label & label_split) != 0;
}
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct opt_ctx {
|
|
|
|
|
Program* program;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
float_mode fp_mode;
|
2019-09-17 13:22:17 +02:00
|
|
|
std::vector<aco_ptr<Instruction>> instructions;
|
|
|
|
|
ssa_info* info;
|
2021-06-09 10:14:54 +02:00
|
|
|
std::pair<uint32_t, Temp> last_literal;
|
2019-09-17 13:22:17 +02:00
|
|
|
std::vector<mad_info> mad_infos;
|
|
|
|
|
std::vector<uint16_t> uses;
|
|
|
|
|
};
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-11-22 15:00:04 +00:00
|
|
|
if (instr->isVOP3())
|
|
|
|
|
return true;
|
|
|
|
|
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isVOP3P())
|
2020-09-04 12:35:54 +01:00
|
|
|
return false;
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->gfx_level < GFX10)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->isDPP() || instr->isSDWA())
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
|
2019-11-07 18:02:33 +01:00
|
|
|
instr->opcode != aco_opcode::v_readlane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_readfirstlane_b32;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsigned index)
|
2020-12-31 11:01:08 +00:00
|
|
|
{
|
|
|
|
|
if (instr->definitions.empty())
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
const bool vgpr =
|
|
|
|
|
instr->opcode == aco_opcode::p_as_uniform ||
|
|
|
|
|
std::all_of(instr->definitions.begin(), instr->definitions.end(),
|
|
|
|
|
[](const Definition& def) { return def.regClass().type() == RegType::vgpr; });
|
2020-12-31 11:01:08 +00:00
|
|
|
|
|
|
|
|
/* don't propagate VGPRs into SGPR instructions */
|
|
|
|
|
if (temp.type() == RegType::vgpr && !vgpr)
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool can_accept_sgpr =
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.program->gfx_level >= GFX9 ||
|
2021-06-09 10:14:54 +02:00
|
|
|
std::none_of(instr->definitions.begin(), instr->definitions.end(),
|
|
|
|
|
[](const Definition& def) { return def.regClass().is_subdword(); });
|
2020-12-31 11:01:08 +00:00
|
|
|
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::p_phi:
|
|
|
|
|
case aco_opcode::p_linear_phi:
|
|
|
|
|
case aco_opcode::p_parallelcopy:
|
|
|
|
|
case aco_opcode::p_create_vector:
|
|
|
|
|
if (temp.bytes() != instr->operands[index].bytes())
|
|
|
|
|
return false;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::p_extract_vector:
|
2021-10-05 13:09:02 +01:00
|
|
|
case aco_opcode::p_extract:
|
2020-12-31 11:01:08 +00:00
|
|
|
if (temp.type() == RegType::sgpr && !can_accept_sgpr)
|
|
|
|
|
return false;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::p_split_vector: {
|
|
|
|
|
if (temp.type() == RegType::sgpr && !can_accept_sgpr)
|
|
|
|
|
return false;
|
|
|
|
|
/* don't increase the vector size */
|
|
|
|
|
if (temp.bytes() > instr->operands[index].bytes())
|
|
|
|
|
return false;
|
|
|
|
|
/* We can decrease the vector size as smaller temporaries are only
|
|
|
|
|
* propagated by p_as_uniform instructions.
|
|
|
|
|
* If this propagation leads to invalid IR or hits the assertion below,
|
|
|
|
|
* it means that some undefined bytes within a dword are begin accessed
|
|
|
|
|
* and a bug in instruction_selection is likely. */
|
|
|
|
|
int decrease = instr->operands[index].bytes() - temp.bytes();
|
|
|
|
|
while (decrease > 0) {
|
|
|
|
|
decrease -= instr->definitions.back().bytes();
|
|
|
|
|
instr->definitions.pop_back();
|
|
|
|
|
}
|
|
|
|
|
assert(decrease == 0);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::p_as_uniform:
|
|
|
|
|
if (temp.regClass() == instr->definitions[0].regClass())
|
|
|
|
|
instr->opcode = aco_opcode::p_parallelcopy;
|
|
|
|
|
break;
|
2021-06-09 10:14:54 +02:00
|
|
|
default: return false;
|
2020-12-31 11:01:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
instr->operands[index].setTemp(temp);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-30 10:30:45 +01:00
|
|
|
/* This expects the DPP modifier to be removed. */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-24 13:32:56 +01:00
|
|
|
{
|
2022-05-12 02:50:17 -04:00
|
|
|
if (instr->isSDWA() && ctx.program->gfx_level < GFX9)
|
2019-12-05 14:12:39 +00:00
|
|
|
return false;
|
2019-09-24 13:32:56 +01:00
|
|
|
return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_readlane_b32 &&
|
2019-11-07 18:02:33 +01:00
|
|
|
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32 &&
|
2021-05-28 21:56:50 +02:00
|
|
|
instr->opcode != aco_opcode::v_writelane_b32_e64 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_permlane16_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_permlanex16_b32;
|
2019-09-24 13:32:56 +01:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
2022-09-08 11:24:27 +02:00
|
|
|
to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned add_operands = 0)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (instr->isVOP3())
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
aco_ptr<Instruction> tmp = std::move(instr);
|
|
|
|
|
Format format = asVOP3(tmp->format);
|
2022-09-08 11:24:27 +02:00
|
|
|
instr.reset(create_instruction<VOP3_instruction>(
|
|
|
|
|
tmp->opcode, format, tmp->operands.size() + add_operands, tmp->definitions.size()));
|
2019-09-17 13:22:17 +02:00
|
|
|
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
|
|
|
|
|
for (unsigned i = 0; i < instr->definitions.size(); i++) {
|
|
|
|
|
instr->definitions[i] = tmp->definitions[i];
|
|
|
|
|
if (instr->definitions[i].isTemp()) {
|
|
|
|
|
ssa_info& info = ctx.info[instr->definitions[i].tempId()];
|
2020-08-12 15:58:32 +01:00
|
|
|
if (info.label & instr_usedef_labels && info.instr == tmp.get())
|
2019-09-17 13:22:17 +02:00
|
|
|
info.instr = instr.get();
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-08-12 15:58:32 +01:00
|
|
|
/* we don't need to update any instr_mod_labels because they either haven't
|
|
|
|
|
* been applied yet or this instruction isn't dead and so they've been ignored */
|
2022-01-28 14:49:50 +00:00
|
|
|
|
|
|
|
|
instr->pass_flags = tmp->pass_flags;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
is_operand_vgpr(Operand op)
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
{
|
|
|
|
|
return op.isTemp() && op.getTemp().type() == RegType::vgpr;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
to_SDWA(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
{
|
2022-05-12 02:50:17 -04:00
|
|
|
aco_ptr<Instruction> tmp = convert_to_SDWA(ctx.program->gfx_level, instr);
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (!tmp)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < instr->definitions.size(); i++) {
|
|
|
|
|
ssa_info& info = ctx.info[instr->definitions[i].tempId()];
|
|
|
|
|
if (info.label & instr_labels && info.instr == tmp.get())
|
|
|
|
|
info.instr = instr.get();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* only covers special cases */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
alu_can_accept_constant(aco_opcode opcode, unsigned operand)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2020-01-16 16:54:35 +01:00
|
|
|
switch (opcode) {
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::v_interp_p2_f32:
|
|
|
|
|
case aco_opcode::v_mac_f32:
|
|
|
|
|
case aco_opcode::v_writelane_b32:
|
2019-11-07 18:02:33 +01:00
|
|
|
case aco_opcode::v_writelane_b32_e64:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::v_cndmask_b32: return operand != 2;
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::s_addk_i32:
|
|
|
|
|
case aco_opcode::s_mulk_i32:
|
|
|
|
|
case aco_opcode::p_wqm:
|
|
|
|
|
case aco_opcode::p_extract_vector:
|
|
|
|
|
case aco_opcode::p_split_vector:
|
2019-09-24 13:32:56 +01:00
|
|
|
case aco_opcode::v_readlane_b32:
|
2019-11-07 18:02:33 +01:00
|
|
|
case aco_opcode::v_readlane_b32_e64:
|
2019-09-24 13:32:56 +01:00
|
|
|
case aco_opcode::v_readfirstlane_b32:
|
2020-08-12 14:35:15 +01:00
|
|
|
case aco_opcode::p_extract:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::p_insert: return operand != 0;
|
2022-12-13 09:39:30 +01:00
|
|
|
case aco_opcode::p_bpermute_gfx6:
|
|
|
|
|
case aco_opcode::p_bpermute_gfx10w64:
|
2022-12-13 10:11:04 +01:00
|
|
|
case aco_opcode::p_bpermute_gfx11w64:
|
2022-11-16 15:18:54 +01:00
|
|
|
case aco_opcode::p_interp_gfx11:
|
|
|
|
|
case aco_opcode::p_dual_src_export_gfx11: return false;
|
2021-06-09 10:14:54 +02:00
|
|
|
default: return true;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
|
2019-11-12 15:53:15 +00:00
|
|
|
{
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->opcode == aco_opcode::v_readlane_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_readlane_b32_e64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_writelane_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_writelane_b32_e64)
|
2019-11-12 15:53:15 +00:00
|
|
|
return operand != 1;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->opcode == aco_opcode::v_permlane16_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_permlanex16_b32)
|
2021-05-28 21:56:50 +02:00
|
|
|
return operand == 0;
|
2019-11-12 15:53:15 +00:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-22 14:50:41 +00:00
|
|
|
/* check constant bus and literal limitations */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands)
|
2019-11-22 14:50:41 +00:00
|
|
|
{
|
2022-05-12 02:50:17 -04:00
|
|
|
int limit = ctx.program->gfx_level >= GFX10 ? 2 : 1;
|
2019-11-20 16:42:17 +00:00
|
|
|
Operand literal32(s1);
|
|
|
|
|
Operand literal64(s2);
|
2019-11-22 14:50:41 +00:00
|
|
|
unsigned num_sgprs = 0;
|
|
|
|
|
unsigned sgpr[] = {0, 0};
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_operands; i++) {
|
|
|
|
|
Operand op = operands[i];
|
|
|
|
|
|
|
|
|
|
if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
|
|
|
|
|
/* two reads of the same SGPR count as 1 to the limit */
|
|
|
|
|
if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
|
|
|
|
|
if (num_sgprs < 2)
|
|
|
|
|
sgpr[num_sgprs++] = op.tempId();
|
|
|
|
|
limit--;
|
|
|
|
|
if (limit < 0)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
} else if (op.isLiteral()) {
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.program->gfx_level < GFX10)
|
2019-11-20 16:42:17 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
|
|
|
|
|
return false;
|
|
|
|
|
if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Any number of 32-bit literals counts as only 1 to the limit. Same
|
|
|
|
|
* (but separately) for 64-bit literals. */
|
|
|
|
|
if (op.size() == 1 && literal32.isUndefined()) {
|
|
|
|
|
limit--;
|
|
|
|
|
literal32 = op;
|
|
|
|
|
} else if (op.size() == 2 && literal64.isUndefined()) {
|
|
|
|
|
limit--;
|
|
|
|
|
literal64 = op;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (limit < 0)
|
|
|
|
|
return false;
|
2019-11-22 14:50:41 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset,
|
|
|
|
|
bool prevent_overflow)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
Operand op = instr->operands[op_index];
|
|
|
|
|
|
|
|
|
|
if (!op.isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
Temp tmp = op.getTemp();
|
|
|
|
|
if (!ctx.info[tmp.id()].is_add_sub())
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* add_instr = ctx.info[tmp.id()].instr;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-05-19 15:34:04 +01:00
|
|
|
unsigned mask = 0x3;
|
|
|
|
|
bool is_sub = false;
|
2019-09-17 13:22:17 +02:00
|
|
|
switch (add_instr->opcode) {
|
|
|
|
|
case aco_opcode::v_add_u32:
|
|
|
|
|
case aco_opcode::v_add_co_u32:
|
2020-02-21 12:02:06 +00:00
|
|
|
case aco_opcode::v_add_co_u32_e64:
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::s_add_i32:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::s_add_u32: break;
|
2022-05-19 15:34:04 +01:00
|
|
|
case aco_opcode::v_sub_u32:
|
|
|
|
|
case aco_opcode::v_sub_i32:
|
|
|
|
|
case aco_opcode::v_sub_co_u32:
|
|
|
|
|
case aco_opcode::v_sub_co_u32_e64:
|
|
|
|
|
case aco_opcode::s_sub_u32:
|
|
|
|
|
case aco_opcode::s_sub_i32:
|
|
|
|
|
mask = 0x2;
|
|
|
|
|
is_sub = true;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::v_subrev_u32:
|
|
|
|
|
case aco_opcode::v_subrev_co_u32:
|
|
|
|
|
case aco_opcode::v_subrev_co_u32_e64:
|
|
|
|
|
mask = 0x1;
|
|
|
|
|
is_sub = true;
|
|
|
|
|
break;
|
2021-06-09 10:14:54 +02:00
|
|
|
default: return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-10-15 17:25:57 +01:00
|
|
|
if (prevent_overflow && !add_instr->definitions[0].isNUW())
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-10-29 13:59:59 +00:00
|
|
|
if (add_instr->usesModifiers())
|
|
|
|
|
return false;
|
|
|
|
|
|
2022-05-19 15:34:04 +01:00
|
|
|
u_foreach_bit (i, mask) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (add_instr->operands[i].isConstant()) {
|
2022-05-19 15:34:04 +01:00
|
|
|
*offset = add_instr->operands[i].constantValue() * (uint32_t)(is_sub ? -1 : 1);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else if (add_instr->operands[i].isTemp() &&
|
2020-05-15 16:28:03 +01:00
|
|
|
ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {
|
2022-05-19 15:34:04 +01:00
|
|
|
*offset = ctx.info[add_instr->operands[i].tempId()].val * (uint32_t)(is_sub ? -1 : 1);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (!add_instr->operands[!i].isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
uint32_t offset2 = 0;
|
2019-10-15 17:00:55 +01:00
|
|
|
if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
*offset += offset2;
|
|
|
|
|
} else {
|
|
|
|
|
*base = add_instr->operands[!i].getTemp();
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
aco: skip &-4 before SMEM
The hardware ignores the low 2 bits. I'm not sure if they are ignored
before or after the address is calculated, but this optimization should be
cautious enough.
fossil-db (Sienna Cichlid):
Totals from 259 (0.19% of 134572) affected shaders:
SpillSGPRs: 1381 -> 1382 (+0.07%)
SpillVGPRs: 1783 -> 1782 (-0.06%); split: -0.67%, +0.62%
CodeSize: 1598612 -> 1596084 (-0.16%); split: -0.30%, +0.14%
Scratch: 180224 -> 179200 (-0.57%); split: -1.14%, +0.57%
Instrs: 284885 -> 284268 (-0.22%); split: -0.34%, +0.12%
Latency: 6585634 -> 6603388 (+0.27%); split: -0.48%, +0.75%
InvThroughput: 2638983 -> 2648474 (+0.36%); split: -0.58%, +0.94%
VClause: 6797 -> 6820 (+0.34%); split: -0.15%, +0.49%
SClause: 6569 -> 6574 (+0.08%); split: -1.11%, +1.19%
Copies: 50561 -> 50586 (+0.05%); split: -0.61%, +0.66%
Branches: 10058 -> 10062 (+0.04%); split: -0.01%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13755>
2021-11-11 10:54:56 +00:00
|
|
|
void
|
|
|
|
|
skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem)
|
|
|
|
|
{
|
|
|
|
|
bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
|
|
|
|
|
if (soe && !smem->operands[1].isConstant())
|
|
|
|
|
return;
|
|
|
|
|
/* We don't need to check the constant offset because the address seems to be calculated with
|
|
|
|
|
* (offset&-4 + const_offset&-4), not (offset+const_offset)&-4.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
Operand& op = smem->operands[soe ? smem->operands.size() - 1 : 1];
|
|
|
|
|
if (!op.isTemp() || !ctx.info[op.tempId()].is_bitwise())
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Instruction* bitwise_instr = ctx.info[op.tempId()].instr;
|
|
|
|
|
if (bitwise_instr->opcode != aco_opcode::s_and_b32)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (bitwise_instr->operands[0].constantEquals(-4) &&
|
|
|
|
|
bitwise_instr->operands[1].isOfType(op.regClass().type()))
|
|
|
|
|
op.setTemp(bitwise_instr->operands[1].getTemp());
|
|
|
|
|
else if (bitwise_instr->operands[1].constantEquals(-4) &&
|
|
|
|
|
bitwise_instr->operands[0].isOfType(op.regClass().type()))
|
|
|
|
|
op.setTemp(bitwise_instr->operands[0].getTemp());
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-14 19:51:50 +00:00
|
|
|
void
|
|
|
|
|
smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
/* skip &-4 before offset additions: load((a + 16) & -4, 0) */
|
|
|
|
|
if (!instr->operands.empty())
|
|
|
|
|
skip_smem_offset_align(ctx, &instr->smem());
|
|
|
|
|
|
|
|
|
|
/* propagate constants and combine additions */
|
|
|
|
|
if (!instr->operands.empty() && instr->operands[1].isTemp()) {
|
|
|
|
|
SMEM_instruction& smem = instr->smem();
|
|
|
|
|
ssa_info info = ctx.info[instr->operands[1].tempId()];
|
|
|
|
|
|
|
|
|
|
Temp base;
|
|
|
|
|
uint32_t offset;
|
|
|
|
|
bool prevent_overflow = smem.operands[0].size() > 2 || smem.prevent_overflow;
|
|
|
|
|
if (info.is_constant_or_literal(32) &&
|
2022-05-12 02:50:17 -04:00
|
|
|
((ctx.program->gfx_level == GFX6 && info.val <= 0x3FF) ||
|
|
|
|
|
(ctx.program->gfx_level == GFX7 && info.val <= 0xFFFFFFFF) ||
|
|
|
|
|
(ctx.program->gfx_level >= GFX8 && info.val <= 0xFFFFF))) {
|
2021-12-14 19:51:50 +00:00
|
|
|
instr->operands[1] = Operand::c32(info.val);
|
|
|
|
|
} else if (parse_base_offset(ctx, instr.get(), 1, &base, &offset, prevent_overflow) &&
|
2022-05-12 02:50:17 -04:00
|
|
|
base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->gfx_level >= GFX9 &&
|
2021-12-14 19:51:50 +00:00
|
|
|
offset % 4u == 0) {
|
|
|
|
|
bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4);
|
|
|
|
|
if (soe) {
|
|
|
|
|
if (ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) &&
|
|
|
|
|
ctx.info[smem.operands.back().tempId()].val == 0) {
|
|
|
|
|
smem.operands[1] = Operand::c32(offset);
|
|
|
|
|
smem.operands.back() = Operand(base);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
SMEM_instruction* new_instr = create_instruction<SMEM_instruction>(
|
|
|
|
|
smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size());
|
|
|
|
|
new_instr->operands[0] = smem.operands[0];
|
|
|
|
|
new_instr->operands[1] = Operand::c32(offset);
|
|
|
|
|
if (smem.definitions.empty())
|
|
|
|
|
new_instr->operands[2] = smem.operands[2];
|
|
|
|
|
new_instr->operands.back() = Operand(base);
|
|
|
|
|
if (!smem.definitions.empty())
|
|
|
|
|
new_instr->definitions[0] = smem.definitions[0];
|
|
|
|
|
new_instr->sync = smem.sync;
|
|
|
|
|
new_instr->glc = smem.glc;
|
|
|
|
|
new_instr->dlc = smem.dlc;
|
|
|
|
|
new_instr->nv = smem.nv;
|
|
|
|
|
new_instr->disable_wqm = smem.disable_wqm;
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* skip &-4 after offset additions: load(a & -4, 16) */
|
|
|
|
|
if (!instr->operands.empty())
|
|
|
|
|
skip_smem_offset_align(ctx, &instr->smem());
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
unsigned
|
|
|
|
|
get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
|
2020-05-15 15:12:33 +01:00
|
|
|
{
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isPseudo())
|
2020-05-15 15:12:33 +01:00
|
|
|
return instr->operands[index].bytes() * 8u;
|
2021-06-09 10:14:54 +02:00
|
|
|
else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_mad_i64_i32)
|
2020-05-15 15:12:33 +01:00
|
|
|
return index == 2 ? 64 : 32;
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fma_mixlo_f16)
|
|
|
|
|
return instr->vop3p().opsel_hi & (1u << index) ? 16 : 32;
|
2020-05-15 15:12:33 +01:00
|
|
|
else if (instr->isVALU() || instr->isSALU())
|
|
|
|
|
return instr_info.operand_size[(int)instr->opcode];
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Builds a constant operand of `bits` width from the value stored in `info`.
 * 64-bit requests go through Operand::c32_or_c64(); every other width uses
 * Operand::get_const() for the program's gfx level.
 */
Operand
get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits)
{
   if (bits != 64) {
      const unsigned bytes = bits / 8u;
      return Operand::get_const(ctx.program->gfx_level, info.val, bytes);
   }

   return Operand::c32_or_c64(info.val, true);
}
|
|
|
|
|
|
2021-12-13 19:58:46 +01:00
|
|
|
void
|
|
|
|
|
propagate_constants_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info, unsigned i)
|
|
|
|
|
{
|
|
|
|
|
if (!info.is_constant_or_literal(32))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
assert(instr->operands[i].isTemp());
|
|
|
|
|
unsigned bits = get_operand_size(instr, i);
|
|
|
|
|
if (info.is_constant(bits)) {
|
|
|
|
|
instr->operands[i] = get_constant_op(ctx, info, bits);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-02 13:19:45 +01:00
|
|
|
/* The accumulation operand of dot product instructions ignores opsel. */
|
|
|
|
|
bool cannot_use_opsel =
|
|
|
|
|
(instr->opcode == aco_opcode::v_dot4_i32_i8 || instr->opcode == aco_opcode::v_dot2_i32_i16 ||
|
2022-10-17 11:12:59 +02:00
|
|
|
instr->opcode == aco_opcode::v_dot4_i32_iu8 || instr->opcode == aco_opcode::v_dot4_u32_u8 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_dot2_u32_u16) &&
|
2022-05-02 13:19:45 +01:00
|
|
|
i == 2;
|
|
|
|
|
if (cannot_use_opsel)
|
|
|
|
|
return;
|
|
|
|
|
|
2021-12-13 19:58:46 +01:00
|
|
|
/* try to fold inline constants */
|
|
|
|
|
VOP3P_instruction* vop3p = &instr->vop3p();
|
|
|
|
|
bool opsel_lo = (vop3p->opsel_lo >> i) & 1;
|
|
|
|
|
bool opsel_hi = (vop3p->opsel_hi >> i) & 1;
|
|
|
|
|
|
2022-05-02 14:07:03 +01:00
|
|
|
Operand const_op[2];
|
|
|
|
|
bool const_opsel[2] = {false, false};
|
|
|
|
|
for (unsigned j = 0; j < 2; j++) {
|
|
|
|
|
if ((unsigned)opsel_lo != j && (unsigned)opsel_hi != j)
|
|
|
|
|
continue; /* this half is unused */
|
|
|
|
|
|
|
|
|
|
uint16_t val = info.val >> (j ? 16 : 0);
|
|
|
|
|
Operand op = Operand::get_const(ctx.program->gfx_level, val, bits / 8u);
|
|
|
|
|
if (bits == 32 && op.isLiteral()) /* try sign extension */
|
|
|
|
|
op = Operand::get_const(ctx.program->gfx_level, val | 0xffff0000, 4);
|
|
|
|
|
if (bits == 32 && op.isLiteral()) { /* try shifting left */
|
|
|
|
|
op = Operand::get_const(ctx.program->gfx_level, val << 16, 4);
|
|
|
|
|
const_opsel[j] = true;
|
|
|
|
|
}
|
|
|
|
|
if (op.isLiteral())
|
|
|
|
|
return;
|
|
|
|
|
const_op[j] = op;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Operand const_lo = const_op[0];
|
|
|
|
|
Operand const_hi = const_op[1];
|
|
|
|
|
bool const_lo_opsel = const_opsel[0];
|
|
|
|
|
bool const_hi_opsel = const_opsel[1];
|
2021-12-13 19:58:46 +01:00
|
|
|
|
|
|
|
|
if (opsel_lo == opsel_hi) {
|
|
|
|
|
/* use the single 16bit value */
|
|
|
|
|
instr->operands[i] = opsel_lo ? const_hi : const_lo;
|
|
|
|
|
|
2022-05-02 14:07:03 +01:00
|
|
|
/* opsel must point the same for both halves */
|
|
|
|
|
opsel_lo = opsel_lo ? const_hi_opsel : const_lo_opsel;
|
|
|
|
|
opsel_hi = opsel_lo;
|
2021-12-13 19:58:46 +01:00
|
|
|
} else if (const_lo == const_hi) {
|
|
|
|
|
/* both constants are the same */
|
|
|
|
|
instr->operands[i] = const_lo;
|
|
|
|
|
|
2022-05-02 14:07:03 +01:00
|
|
|
/* opsel must point the same for both halves */
|
|
|
|
|
opsel_lo = const_lo_opsel;
|
|
|
|
|
opsel_hi = const_lo_opsel;
|
|
|
|
|
} else if (const_lo.constantValue16(const_lo_opsel) ==
|
|
|
|
|
const_hi.constantValue16(!const_hi_opsel)) {
|
2021-12-13 19:58:46 +01:00
|
|
|
instr->operands[i] = const_hi;
|
|
|
|
|
|
|
|
|
|
/* redirect opsel selection */
|
2022-05-02 14:07:03 +01:00
|
|
|
opsel_lo = opsel_lo ? const_hi_opsel : !const_hi_opsel;
|
|
|
|
|
opsel_hi = opsel_hi ? const_hi_opsel : !const_hi_opsel;
|
|
|
|
|
} else if (const_hi.constantValue16(const_hi_opsel) ==
|
|
|
|
|
const_lo.constantValue16(!const_lo_opsel)) {
|
|
|
|
|
instr->operands[i] = const_lo;
|
|
|
|
|
|
|
|
|
|
/* redirect opsel selection */
|
|
|
|
|
opsel_lo = opsel_lo ? !const_lo_opsel : const_lo_opsel;
|
|
|
|
|
opsel_hi = opsel_hi ? !const_lo_opsel : const_lo_opsel;
|
2021-12-13 19:58:46 +01:00
|
|
|
} else if (bits == 16 && const_lo.constantValue() == (const_hi.constantValue() ^ (1 << 15))) {
|
2022-05-02 14:07:03 +01:00
|
|
|
assert(const_lo_opsel == false && const_hi_opsel == false);
|
|
|
|
|
|
2021-12-13 19:58:46 +01:00
|
|
|
/* const_lo == -const_hi */
|
|
|
|
|
if (!instr_info.can_use_input_modifiers[(int)instr->opcode])
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
instr->operands[i] = Operand::c16(const_lo.constantValue() & 0x7FFF);
|
|
|
|
|
bool neg_lo = const_lo.constantValue() & (1 << 15);
|
|
|
|
|
vop3p->neg_lo[i] ^= opsel_lo ^ neg_lo;
|
|
|
|
|
vop3p->neg_hi[i] ^= opsel_hi ^ neg_lo;
|
|
|
|
|
|
|
|
|
|
/* opsel must point to lo for both operands */
|
2022-05-02 14:07:03 +01:00
|
|
|
opsel_lo = false;
|
|
|
|
|
opsel_hi = false;
|
2021-12-13 19:58:46 +01:00
|
|
|
}
|
2022-05-02 14:07:03 +01:00
|
|
|
|
|
|
|
|
vop3p->opsel_lo = opsel_lo ? (vop3p->opsel_lo | (1 << i)) : (vop3p->opsel_lo & ~(1 << i));
|
|
|
|
|
vop3p->opsel_hi = opsel_hi ? (vop3p->opsel_hi | (1 << i)) : (vop3p->opsel_hi & ~(1 << i));
|
2021-12-13 19:58:46 +01:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
fixed_to_exec(Operand op)
|
2020-01-28 12:04:48 +00:00
|
|
|
{
|
|
|
|
|
return op.isFixed() && op.physReg() == exec;
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
/* Describes the sub-dword selection performed by `instr` as a SubdwordSel.
 *
 * Handles p_extract, p_insert at index 0, p_extract_vector with a result of at
 * most 2 bytes, and p_split_vector (taken as the upper 16 bits of a dword).
 * Any other instruction yields a default-constructed SubdwordSel.
 */
SubdwordSel
parse_extract(Instruction* instr)
{
   switch (instr->opcode) {
   case aco_opcode::p_extract: {
      /* operands: [1] = index, [2] = size in bits, [3] = 1 for sign extension */
      const unsigned bytes = instr->operands[2].constantValue() / 8;
      const unsigned byte_offset = instr->operands[1].constantValue() * bytes;
      const bool sign_extend = instr->operands[3].constantEquals(1);
      return SubdwordSel(bytes, byte_offset, sign_extend);
   }
   case aco_opcode::p_insert: {
      /* only an insert at index 0 is treated as an extract */
      if (!instr->operands[1].constantEquals(0))
         break;
      if (instr->operands[2].constantEquals(8))
         return SubdwordSel::ubyte;
      return SubdwordSel::uword;
   }
   case aco_opcode::p_extract_vector: {
      const unsigned bytes = instr->definitions[0].bytes();
      const unsigned byte_offset = instr->operands[1].constantValue() * bytes;
      if (bytes > 2)
         break;
      return SubdwordSel(bytes, byte_offset, false);
   }
   case aco_opcode::p_split_vector: {
      assert(instr->operands[0].bytes() == 4 && instr->definitions[1].bytes() == 2);
      return SubdwordSel(2, 2, false);
   }
   default: break;
   }

   return SubdwordSel();
}
|
|
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
SubdwordSel
|
2021-06-09 10:14:54 +02:00
|
|
|
parse_insert(Instruction* instr)
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
{
|
|
|
|
|
if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) &&
|
|
|
|
|
instr->operands[1].constantEquals(0)) {
|
2021-08-30 17:58:36 +02:00
|
|
|
return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::p_insert) {
|
2021-08-30 17:58:36 +02:00
|
|
|
unsigned size = instr->operands[2].constantValue() / 8;
|
|
|
|
|
unsigned offset = instr->operands[1].constantValue() * size;
|
|
|
|
|
return SubdwordSel(size, offset, false);
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
} else {
|
2021-08-30 17:58:36 +02:00
|
|
|
return SubdwordSel();
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
{
|
|
|
|
|
Temp tmp = info.instr->operands[0].getTemp();
|
2021-08-30 17:58:36 +02:00
|
|
|
SubdwordSel sel = parse_extract(info.instr);
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
if (!sel) {
|
|
|
|
|
return false;
|
|
|
|
|
} else if (sel.size() == 4) {
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
return true;
|
2021-08-30 17:58:36 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) {
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
return true;
|
2022-10-28 16:32:12 +02:00
|
|
|
} else if (idx < 2 && can_use_SDWA(ctx.program->gfx_level, instr, true) &&
|
2022-05-12 02:50:17 -04:00
|
|
|
(tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
|
2021-08-30 17:58:36 +02:00
|
|
|
if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
return false;
|
|
|
|
|
return true;
|
2021-08-30 17:58:36 +02:00
|
|
|
} else if (instr->isVOP3() && sel.size() == 2 &&
|
2022-05-12 02:50:17 -04:00
|
|
|
can_use_opsel(ctx.program->gfx_level, instr->opcode, idx) &&
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
!(instr->vop3().opsel & (1 << idx))) {
|
|
|
|
|
return true;
|
2021-10-05 13:09:02 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::p_extract) {
|
|
|
|
|
SubdwordSel instrSel = parse_extract(instr.get());
|
|
|
|
|
|
|
|
|
|
/* the outer offset must be within extracted range */
|
|
|
|
|
if (instrSel.offset() >= sel.size())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* don't remove the sign-extension when increasing the size further */
|
|
|
|
|
if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return true;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
}
|
2021-10-05 13:09:02 +01:00
|
|
|
|
|
|
|
|
return false;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Combine a p_extract (or p_insert, in some cases) instruction with instr.
|
|
|
|
|
* instr(p_extract(...)) -> instr()
|
|
|
|
|
*/
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
{
|
|
|
|
|
Temp tmp = info.instr->operands[0].getTemp();
|
2021-08-30 17:58:36 +02:00
|
|
|
SubdwordSel sel = parse_extract(info.instr);
|
|
|
|
|
assert(sel);
|
|
|
|
|
|
2021-09-30 14:32:07 +02:00
|
|
|
instr->operands[idx].set16bit(false);
|
|
|
|
|
instr->operands[idx].set24bit(false);
|
|
|
|
|
|
|
|
|
|
ctx.info[tmp.id()].label &= ~label_insert;
|
|
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
if (sel.size() == 4) {
|
|
|
|
|
/* full dword selection */
|
|
|
|
|
} else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) {
|
|
|
|
|
switch (sel.offset()) {
|
|
|
|
|
case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
|
|
|
|
|
case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
|
|
|
|
|
case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
|
|
|
|
|
case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
}
|
2021-09-30 14:32:07 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
|
|
|
|
|
sel.offset() == 0 &&
|
|
|
|
|
((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
|
|
|
|
|
(sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
|
|
|
|
|
/* The undesirable upper bits are already shifted out. */
|
|
|
|
|
return;
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
|
|
|
|
|
(tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
to_SDWA(ctx, instr);
|
|
|
|
|
static_cast<SDWA_instruction*>(instr.get())->sel[idx] = sel;
|
|
|
|
|
} else if (instr->isVOP3()) {
|
2021-08-30 17:58:36 +02:00
|
|
|
if (sel.offset())
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
instr->vop3().opsel |= 1 << idx;
|
2021-10-05 13:09:02 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::p_extract) {
|
|
|
|
|
SubdwordSel instrSel = parse_extract(instr.get());
|
|
|
|
|
|
|
|
|
|
unsigned size = std::min(sel.size(), instrSel.size());
|
|
|
|
|
unsigned offset = sel.offset() + instrSel.offset();
|
|
|
|
|
unsigned sign_extend =
|
|
|
|
|
instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size());
|
|
|
|
|
|
|
|
|
|
instr->operands[1] = Operand::c32(offset / size);
|
|
|
|
|
instr->operands[2] = Operand::c32(size * 8u);
|
|
|
|
|
instr->operands[3] = Operand::c32(sign_extend);
|
|
|
|
|
return;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
}
|
|
|
|
|
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
/* Output modifier, label_vopc and label_f2f32 seem to be the only one worth keeping at the
|
|
|
|
|
* moment
|
|
|
|
|
*/
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
for (Definition& def : instr->definitions)
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
ctx.info[def.tempId()].label &= (label_vopc | label_f2f32 | instr_mod_labels);
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
check_sdwa_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
{
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
Operand op = instr->operands[i];
|
|
|
|
|
if (!op.isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
ssa_info& info = ctx.info[op.tempId()];
|
|
|
|
|
if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
|
|
|
|
|
op.getTemp().type() == RegType::sgpr)) {
|
|
|
|
|
if (!can_apply_extract(ctx, instr, i, info))
|
|
|
|
|
info.label &= ~label_extract;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op)
|
2020-06-17 15:02:30 +01:00
|
|
|
{
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.program->gfx_level <= GFX8) {
|
2021-02-05 10:35:03 +00:00
|
|
|
switch (op) {
|
|
|
|
|
case aco_opcode::v_min_f32:
|
|
|
|
|
case aco_opcode::v_max_f32:
|
|
|
|
|
case aco_opcode::v_med3_f32:
|
|
|
|
|
case aco_opcode::v_min3_f32:
|
|
|
|
|
case aco_opcode::v_max3_f32:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::v_max_f16: return false;
|
|
|
|
|
default: break;
|
2021-02-05 10:35:03 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return op != aco_opcode::v_cndmask_b32;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp tmp)
|
2021-02-05 10:35:03 +00:00
|
|
|
{
|
2021-06-09 10:14:54 +02:00
|
|
|
float_mode* fp = &ctx.fp_mode;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
if (ctx.info[tmp.id()].is_canonicalized() ||
|
|
|
|
|
(tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
aco_opcode op = instr->opcode;
|
2021-02-05 10:35:03 +00:00
|
|
|
return instr_info.can_use_input_modifiers[(int)op] && does_fp_op_flush_denorms(ctx, op);
|
2020-06-17 15:02:30 +01:00
|
|
|
}
|
|
|
|
|
|
2022-03-30 18:01:45 +02:00
|
|
|
bool
|
|
|
|
|
can_eliminate_and_exec(opt_ctx& ctx, Temp tmp, unsigned pass_flags)
|
|
|
|
|
{
|
|
|
|
|
if (ctx.info[tmp.id()].is_vopc()) {
|
|
|
|
|
Instruction* vopc_instr = ctx.info[tmp.id()].instr;
|
|
|
|
|
/* Remove superfluous s_and when the VOPC instruction uses the same exec and thus
|
|
|
|
|
* already produces the same result */
|
|
|
|
|
return vopc_instr->pass_flags == pass_flags;
|
|
|
|
|
}
|
|
|
|
|
if (ctx.info[tmp.id()].is_bitwise()) {
|
|
|
|
|
Instruction* instr = ctx.info[tmp.id()].instr;
|
2022-08-24 12:55:12 +02:00
|
|
|
if (instr->operands.size() != 2 || instr->pass_flags != pass_flags)
|
2022-03-30 18:01:45 +02:00
|
|
|
return false;
|
2022-08-24 12:55:12 +02:00
|
|
|
if (!(instr->operands[0].isTemp() && instr->operands[1].isTemp()))
|
|
|
|
|
return false;
|
2022-08-24 15:12:40 +02:00
|
|
|
if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) {
|
|
|
|
|
return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) ||
|
|
|
|
|
can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
|
|
|
|
|
} else {
|
|
|
|
|
return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) &&
|
|
|
|
|
can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
|
|
|
|
|
}
|
2022-03-30 18:01:45 +02:00
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
is_copy_label(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info)
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
{
|
2021-06-09 10:14:54 +02:00
|
|
|
return info.is_temp() ||
|
|
|
|
|
(info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp));
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
is_op_canonicalized(opt_ctx& ctx, Operand op)
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
{
|
2021-06-09 10:14:54 +02:00
|
|
|
float_mode* fp = &ctx.fp_mode;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) ||
|
|
|
|
|
(op.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (op.isConstant() || (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(32))) {
|
|
|
|
|
uint32_t val = op.isTemp() ? ctx.info[op.tempId()].val : op.constantValue();
|
|
|
|
|
if (op.bytes() == 2)
|
|
|
|
|
return (val & 0x7fff) == 0 || (val & 0x7fff) > 0x3ff;
|
|
|
|
|
else if (op.bytes() == 4)
|
|
|
|
|
return (val & 0x7fffffff) == 0 || (val & 0x7fffffff) > 0x7fffff;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-19 15:19:12 +01:00
|
|
|
bool
|
2022-12-01 15:05:49 +00:00
|
|
|
is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int64_t offset0, int64_t offset1)
|
2022-05-19 15:19:12 +01:00
|
|
|
{
|
|
|
|
|
bool negative_unaligned_scratch_offset_bug = ctx.program->gfx_level == GFX10;
|
|
|
|
|
int32_t min = ctx.program->dev.scratch_global_offset_min;
|
|
|
|
|
int32_t max = ctx.program->dev.scratch_global_offset_max;
|
|
|
|
|
|
2022-12-01 15:05:49 +00:00
|
|
|
int64_t offset = offset0 + offset1;
|
|
|
|
|
|
2022-05-19 15:19:12 +01:00
|
|
|
bool has_vgpr_offset = instr && !instr->operands[0].isUndefined();
|
|
|
|
|
if (negative_unaligned_scratch_offset_bug && has_vgpr_offset && offset < 0 && offset % 4)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return offset >= min && offset <= max;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
ASSERTED bool all_const = false;
|
|
|
|
|
for (Operand& op : instr->operands)
|
2021-06-09 10:14:54 +02:00
|
|
|
all_const =
|
|
|
|
|
all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));
|
2020-08-14 10:42:27 +02:00
|
|
|
perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());
|
2020-10-15 15:18:40 +01:00
|
|
|
|
|
|
|
|
ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_mov_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_mov_b32;
|
2021-06-09 10:14:54 +02:00
|
|
|
perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead",
|
|
|
|
|
instr.get());
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-12-14 19:51:50 +00:00
|
|
|
if (instr->isSMEM())
|
|
|
|
|
smem_combine(ctx, instr);
|
aco: skip &-4 before SMEM
The hardware ignores the low 2 bits. I'm not sure if they are ignored
before or after the address is calculated, but this optimization should be
cautious enough.
fossil-db (Sienna Cichlid):
Totals from 259 (0.19% of 134572) affected shaders:
SpillSGPRs: 1381 -> 1382 (+0.07%)
SpillVGPRs: 1783 -> 1782 (-0.06%); split: -0.67%, +0.62%
CodeSize: 1598612 -> 1596084 (-0.16%); split: -0.30%, +0.14%
Scratch: 180224 -> 179200 (-0.57%); split: -1.14%, +0.57%
Instrs: 284885 -> 284268 (-0.22%); split: -0.34%, +0.12%
Latency: 6585634 -> 6603388 (+0.27%); split: -0.48%, +0.75%
InvThroughput: 2638983 -> 2648474 (+0.36%); split: -0.58%, +0.94%
VClause: 6797 -> 6820 (+0.34%); split: -0.15%, +0.49%
SClause: 6569 -> 6574 (+0.08%); split: -1.11%, +1.19%
Copies: 50561 -> 50586 (+0.05%); split: -0.61%, +0.66%
Branches: 10058 -> 10062 (+0.04%); split: -0.01%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13755>
2021-11-11 10:54:56 +00:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!instr->operands[i].isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
ssa_info info = ctx.info[instr->operands[i].tempId()];
|
|
|
|
|
/* propagate undef */
|
|
|
|
|
if (info.is_undefined() && is_phi(instr))
|
|
|
|
|
instr->operands[i] = Operand(instr->operands[i].regClass());
|
|
|
|
|
/* propagate reg->reg of same type */
|
2020-12-31 11:01:08 +00:00
|
|
|
while (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
|
|
|
|
|
info = ctx.info[info.temp.id()];
|
|
|
|
|
}
|
|
|
|
|
|
2020-12-31 11:01:08 +00:00
|
|
|
/* PSEUDO: propagate temporaries */
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isPseudo()) {
|
2020-12-31 11:01:08 +00:00
|
|
|
while (info.is_temp()) {
|
|
|
|
|
pseudo_propagate_temp(ctx, instr, info.temp, i);
|
2020-04-07 10:46:37 +01:00
|
|
|
info = ctx.info[info.temp.id()];
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-12-31 11:01:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* SALU / PSEUDO: propagate inline constants */
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isSALU() || instr->isPseudo()) {
|
2020-05-15 16:28:03 +01:00
|
|
|
unsigned bits = get_operand_size(instr, i);
|
2021-01-20 15:27:16 +00:00
|
|
|
if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) &&
|
2020-01-16 16:54:35 +01:00
|
|
|
!instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {
|
2020-05-15 16:28:03 +01:00
|
|
|
instr->operands[i] = get_constant_op(ctx, info, bits);
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* VALU: propagate neg, abs & inline constants */
|
|
|
|
|
else if (instr->isVALU()) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr &&
|
|
|
|
|
valu_can_accept_vgpr(instr, i)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[i].setTemp(info.temp);
|
|
|
|
|
info = ctx.info[info.temp.id()];
|
|
|
|
|
}
|
2020-09-18 00:00:38 +01:00
|
|
|
/* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
|
2021-06-09 10:14:54 +02:00
|
|
|
if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
|
|
|
|
|
instr->operands.size() == 1) {
|
2021-08-30 10:30:45 +01:00
|
|
|
instr->format = withoutDPP(instr->format);
|
2020-09-18 00:00:38 +01:00
|
|
|
instr->operands[i].setTemp(info.temp);
|
|
|
|
|
info = ctx.info[info.temp.id()];
|
|
|
|
|
}
|
2020-05-15 15:12:33 +01:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* for instructions other than v_cndmask_b32, the size of the instruction should match the
|
|
|
|
|
* operand size */
|
|
|
|
|
unsigned can_use_mod =
|
|
|
|
|
instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
|
2020-05-15 15:12:33 +01:00
|
|
|
can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode];
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
if (instr->isSDWA())
|
2021-08-30 17:58:36 +02:00
|
|
|
can_use_mod = can_use_mod && instr->sdwa().sel[i].size() == 4;
|
2019-12-05 14:12:39 +00:00
|
|
|
else
|
2021-11-29 00:12:04 +09:00
|
|
|
can_use_mod = can_use_mod && (instr->isDPP16() || can_use_VOP3(ctx, instr));
|
2019-12-05 14:12:39 +00:00
|
|
|
|
2022-01-31 18:01:45 +00:00
|
|
|
unsigned bits = get_operand_size(instr, i);
|
|
|
|
|
bool mod_bitsize_compat = instr->operands[i].bytes() * 8 == bits;
|
|
|
|
|
|
|
|
|
|
if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32 && mod_bitsize_compat) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
|
|
|
|
|
instr->operands[i].setTemp(info.temp);
|
2022-01-31 18:01:45 +00:00
|
|
|
} else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16 && mod_bitsize_compat) {
|
2020-05-15 15:12:33 +01:00
|
|
|
instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
|
|
|
|
|
instr->operands[i].setTemp(info.temp);
|
2022-01-31 18:01:45 +00:00
|
|
|
} else if (info.is_neg() && can_use_mod && mod_bitsize_compat &&
|
2021-06-09 10:14:54 +02:00
|
|
|
can_eliminate_fcanonicalize(ctx, instr, info.temp)) {
|
2019-12-05 14:12:39 +00:00
|
|
|
if (!instr->isDPP() && !instr->isSDWA())
|
2019-09-17 13:22:17 +02:00
|
|
|
to_VOP3(ctx, instr);
|
|
|
|
|
instr->operands[i].setTemp(info.temp);
|
2021-11-29 00:12:04 +09:00
|
|
|
if (instr->isDPP16() && !instr->dpp16().abs[i])
|
|
|
|
|
instr->dpp16().neg[i] = true;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
else if (instr->isSDWA() && !instr->sdwa().abs[i])
|
2021-01-21 16:13:34 +00:00
|
|
|
instr->sdwa().neg[i] = true;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
else if (instr->isVOP3() && !instr->vop3().abs[i])
|
2021-01-21 16:13:34 +00:00
|
|
|
instr->vop3().neg[i] = true;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
}
|
2022-01-31 18:01:45 +00:00
|
|
|
if (info.is_abs() && can_use_mod && mod_bitsize_compat &&
|
|
|
|
|
can_eliminate_fcanonicalize(ctx, instr, info.temp)) {
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
if (!instr->isDPP() && !instr->isSDWA())
|
|
|
|
|
to_VOP3(ctx, instr);
|
|
|
|
|
instr->operands[i] = Operand(info.temp);
|
2021-11-29 00:12:04 +09:00
|
|
|
if (instr->isDPP16())
|
|
|
|
|
instr->dpp16().abs[i] = true;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
else if (instr->isSDWA())
|
|
|
|
|
instr->sdwa().abs[i] = true;
|
|
|
|
|
else
|
|
|
|
|
instr->vop3().abs[i] = true;
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
}
|
2020-06-30 15:33:18 +01:00
|
|
|
|
2021-12-13 19:58:46 +01:00
|
|
|
if (instr->isVOP3P()) {
|
|
|
|
|
propagate_constants_vop3p(ctx, instr, info, i);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-07 20:21:37 +01:00
|
|
|
if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) &&
|
2022-05-12 02:50:17 -04:00
|
|
|
(!instr->isSDWA() || ctx.program->gfx_level >= GFX9)) {
|
2020-05-15 16:28:03 +01:00
|
|
|
Operand op = get_constant_op(ctx, info, bits);
|
2021-06-09 10:14:54 +02:00
|
|
|
perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
|
|
|
|
|
"v_cndmask_b32 with a constant selector", instr.get());
|
2021-12-13 19:58:46 +01:00
|
|
|
if (i == 0 || instr->isSDWA() || instr->opcode == aco_opcode::v_readlane_b32 ||
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->opcode == aco_opcode::v_writelane_b32) {
|
2021-08-30 10:30:45 +01:00
|
|
|
instr->format = withoutDPP(instr->format);
|
2019-11-13 11:14:51 +01:00
|
|
|
instr->operands[i] = op;
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
2021-07-14 17:22:02 +01:00
|
|
|
} else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[i] = instr->operands[0];
|
2019-11-13 11:14:51 +01:00
|
|
|
instr->operands[0] = op;
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
2019-11-20 16:42:17 +00:00
|
|
|
} else if (can_use_VOP3(ctx, instr)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
to_VOP3(ctx, instr);
|
2019-11-13 11:14:51 +01:00
|
|
|
instr->operands[i] = op;
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* MUBUF: propagate constants and combine additions */
|
2021-01-20 15:27:16 +00:00
|
|
|
else if (instr->isMUBUF()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
MUBUF_instruction& mubuf = instr->mubuf();
|
2019-09-17 13:22:17 +02:00
|
|
|
Temp base;
|
|
|
|
|
uint32_t offset;
|
|
|
|
|
while (info.is_temp())
|
|
|
|
|
info = ctx.info[info.temp.id()];
|
|
|
|
|
|
2019-10-15 17:00:55 +01:00
|
|
|
/* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
|
|
|
|
|
* overflow for scratch accesses works only on GFX9+ and saddr overflow
|
|
|
|
|
* never works. Since swizzling is the only thing that separates
|
|
|
|
|
* scratch accesses and other accesses, and swizzling changes how
|
|
|
|
|
* addressing works significantly, this probably applies to swizzled
|
|
|
|
|
* MUBUF accesses. */
|
2022-05-12 02:50:17 -04:00
|
|
|
bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->gfx_level < GFX9;
|
2019-10-15 17:00:55 +01:00
|
|
|
|
2022-04-20 17:21:11 +02:00
|
|
|
if (mubuf.offen && mubuf.idxen && i == 1 && info.is_vec() &&
|
|
|
|
|
info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
|
|
|
|
|
info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
|
|
|
|
|
mubuf.offset + info.instr->operands[1].constantValue() < 4096) {
|
|
|
|
|
instr->operands[1] = info.instr->operands[0];
|
|
|
|
|
mubuf.offset += info.instr->operands[1].constantValue();
|
|
|
|
|
mubuf.offen = false;
|
|
|
|
|
continue;
|
|
|
|
|
} else if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) &&
|
|
|
|
|
mubuf.offset + info.val < 4096) {
|
2021-01-21 16:13:34 +00:00
|
|
|
assert(!mubuf.idxen);
|
2020-01-16 16:54:35 +01:00
|
|
|
instr->operands[1] = Operand(v1);
|
2021-01-21 16:13:34 +00:00
|
|
|
mubuf.offset += info.val;
|
|
|
|
|
mubuf.offen = false;
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
2021-01-21 16:13:34 +00:00
|
|
|
} else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) {
|
2021-07-13 11:22:46 +02:00
|
|
|
instr->operands[2] = Operand::c32(0);
|
2021-01-21 16:13:34 +00:00
|
|
|
mubuf.offset += info.val;
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
2021-06-09 10:14:54 +02:00
|
|
|
} else if (mubuf.offen && i == 1 &&
|
|
|
|
|
parse_base_offset(ctx, instr.get(), i, &base, &offset,
|
|
|
|
|
vaddr_prevent_overflow) &&
|
2021-01-21 16:13:34 +00:00
|
|
|
base.regClass() == v1 && mubuf.offset + offset < 4096) {
|
|
|
|
|
assert(!mubuf.idxen);
|
2020-01-16 16:54:35 +01:00
|
|
|
instr->operands[1].setTemp(base);
|
2021-01-21 16:13:34 +00:00
|
|
|
mubuf.offset += offset;
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
2022-03-25 12:03:27 +01:00
|
|
|
} else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
|
2021-01-21 16:13:34 +00:00
|
|
|
base.regClass() == s1 && mubuf.offset + offset < 4096) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[i].setTemp(base);
|
2021-01-21 16:13:34 +00:00
|
|
|
mubuf.offset += offset;
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-19 15:19:12 +01:00
|
|
|
/* SCRATCH: propagate constants and combine additions */
|
|
|
|
|
else if (instr->isScratch()) {
|
|
|
|
|
FLAT_instruction& scratch = instr->scratch();
|
|
|
|
|
Temp base;
|
|
|
|
|
uint32_t offset;
|
|
|
|
|
while (info.is_temp())
|
|
|
|
|
info = ctx.info[info.temp.id()];
|
|
|
|
|
|
2022-12-01 15:05:49 +00:00
|
|
|
/* The hardware probably does: 'scratch_base + u2u64(saddr) + i2i64(offset)'. This means
|
|
|
|
|
* we can't combine the addition if the unsigned addition overflows and offset is
|
|
|
|
|
* positive. In theory, there are also issues if
|
|
|
|
|
* 'ilt(offset, 0) && ige(saddr, 0) && ilt(saddr + offset, 0)', but that just
|
|
|
|
|
* replaces an already out-of-bounds access with a larger one since 'saddr + offset'
|
|
|
|
|
* would be larger than INT32_MAX.
|
|
|
|
|
*/
|
|
|
|
|
if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
|
2022-05-19 15:19:12 +01:00
|
|
|
base.regClass() == instr->operands[i].regClass() &&
|
2022-12-01 15:05:49 +00:00
|
|
|
is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
|
|
|
|
|
instr->operands[i].setTemp(base);
|
|
|
|
|
scratch.offset += (int32_t)offset;
|
|
|
|
|
continue;
|
|
|
|
|
} else if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
|
|
|
|
|
base.regClass() == instr->operands[i].regClass() && (int32_t)offset < 0 &&
|
|
|
|
|
is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
|
2022-05-19 15:19:12 +01:00
|
|
|
instr->operands[i].setTemp(base);
|
|
|
|
|
scratch.offset += (int32_t)offset;
|
|
|
|
|
continue;
|
|
|
|
|
} else if (i <= 1 && info.is_constant_or_literal(32) &&
|
|
|
|
|
ctx.program->gfx_level >= GFX10_3 &&
|
2022-12-01 15:05:49 +00:00
|
|
|
is_scratch_offset_valid(ctx, NULL, scratch.offset, (int32_t)info.val)) {
|
2022-05-19 15:19:12 +01:00
|
|
|
/* GFX10.3+ can disable both SADDR and ADDR. */
|
|
|
|
|
instr->operands[i] = Operand(instr->operands[i].regClass());
|
|
|
|
|
scratch.offset += (int32_t)info.val;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* DS: combine additions */
|
2021-01-20 15:27:16 +00:00
|
|
|
else if (instr->isDS()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-01-21 16:13:34 +00:00
|
|
|
DS_instruction& ds = instr->ds();
|
2019-09-17 13:22:17 +02:00
|
|
|
Temp base;
|
|
|
|
|
uint32_t offset;
|
2022-05-12 02:50:17 -04:00
|
|
|
bool has_usable_ds_offset = ctx.program->gfx_level >= GFX7;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (has_usable_ds_offset && i == 0 &&
|
|
|
|
|
parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
|
2020-01-15 10:47:17 +01:00
|
|
|
base.regClass() == instr->operands[i].regClass() &&
|
|
|
|
|
instr->opcode != aco_opcode::ds_swizzle_b32) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->opcode == aco_opcode::ds_write2_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_read2_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_write2_b64 ||
|
2021-11-15 16:40:53 +00:00
|
|
|
instr->opcode == aco_opcode::ds_read2_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_write2st64_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_read2st64_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_write2st64_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_read2st64_b64) {
|
|
|
|
|
bool is64bit = instr->opcode == aco_opcode::ds_write2_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_read2_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_write2st64_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_read2st64_b64;
|
|
|
|
|
bool st64 = instr->opcode == aco_opcode::ds_write2st64_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_read2st64_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_write2st64_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::ds_read2st64_b64;
|
|
|
|
|
unsigned shifts = (is64bit ? 3 : 2) + (st64 ? 6 : 0);
|
|
|
|
|
unsigned mask = BITFIELD_MASK(shifts);
|
2021-06-09 10:14:54 +02:00
|
|
|
|
|
|
|
|
if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 &&
|
2021-01-21 16:13:34 +00:00
|
|
|
ds.offset1 + (offset >> shifts) <= 255) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[i].setTemp(base);
|
2021-01-21 16:13:34 +00:00
|
|
|
ds.offset0 += offset >> shifts;
|
|
|
|
|
ds.offset1 += offset >> shifts;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
} else {
|
2021-01-21 16:13:34 +00:00
|
|
|
if (ds.offset0 + offset <= 65535) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[i].setTemp(base);
|
2021-01-21 16:13:34 +00:00
|
|
|
ds.offset0 += offset;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-20 15:27:16 +00:00
|
|
|
else if (instr->isBranch()) {
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
|
|
|
|
|
/* Flip the branch instruction to get rid of the scc_invert instruction */
|
2021-06-09 10:14:54 +02:00
|
|
|
instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
|
|
|
|
|
: aco_opcode::p_cbranch_z;
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* if this instruction doesn't define anything, return */
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (instr->definitions.empty()) {
|
|
|
|
|
check_sdwa_extract(ctx, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
if (instr->isVALU() || instr->isVINTRP()) {
|
|
|
|
|
if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() ||
|
|
|
|
|
instr->opcode == aco_opcode::v_cndmask_b32) {
|
|
|
|
|
bool canonicalized = true;
|
|
|
|
|
if (!does_fp_op_flush_denorms(ctx, instr->opcode)) {
|
|
|
|
|
unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 2 : instr->operands.size();
|
|
|
|
|
for (unsigned i = 0; canonicalized && (i < ops); i++)
|
|
|
|
|
canonicalized = is_op_canonicalized(ctx, instr->operands[i]);
|
|
|
|
|
}
|
|
|
|
|
if (canonicalized)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->isVOPC()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
check_sdwa_extract(ctx, instr);
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if (instr->isVOP3P()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
|
|
|
|
|
return;
|
|
|
|
|
}
|
2020-09-03 12:02:55 +01:00
|
|
|
}
|
2020-06-19 16:09:48 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::p_create_vector: {
|
2020-05-06 17:24:38 +01:00
|
|
|
bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
|
|
|
|
|
instr->operands[0].regClass() == instr->definitions[0].regClass();
|
|
|
|
|
if (copy_prop) {
|
2020-04-21 17:37:44 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
2020-05-06 17:24:38 +01:00
|
|
|
break;
|
|
|
|
|
}
|
2020-04-21 17:37:44 +01:00
|
|
|
|
2020-09-18 11:52:35 +01:00
|
|
|
/* expand vector operands */
|
|
|
|
|
std::vector<Operand> ops;
|
2020-12-31 11:04:11 +00:00
|
|
|
unsigned offset = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (const Operand& op : instr->operands) {
|
2020-12-31 11:04:11 +00:00
|
|
|
/* ensure that any expanded operands are properly aligned */
|
|
|
|
|
bool aligned = offset % 4 == 0 || op.bytes() < 4;
|
|
|
|
|
offset += op.bytes();
|
|
|
|
|
if (aligned && op.isTemp() && ctx.info[op.tempId()].is_vec()) {
|
|
|
|
|
Instruction* vec = ctx.info[op.tempId()].instr;
|
|
|
|
|
for (const Operand& vec_op : vec->operands)
|
2020-09-18 11:52:35 +01:00
|
|
|
ops.emplace_back(vec_op);
|
|
|
|
|
} else {
|
|
|
|
|
ops.emplace_back(op);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2020-04-24 11:58:17 +01:00
|
|
|
|
2020-09-18 11:52:35 +01:00
|
|
|
/* combine expanded operands to new vector */
|
|
|
|
|
if (ops.size() != instr->operands.size()) {
|
|
|
|
|
assert(ops.size() > instr->operands.size());
|
|
|
|
|
Definition def = instr->definitions[0];
|
2021-06-09 10:14:54 +02:00
|
|
|
instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
|
|
|
|
|
Format::PSEUDO, ops.size(), 1));
|
2020-09-18 11:52:35 +01:00
|
|
|
for (unsigned i = 0; i < ops.size(); i++) {
|
|
|
|
|
if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() &&
|
2020-12-31 11:04:11 +00:00
|
|
|
ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass())
|
2020-09-18 11:52:35 +01:00
|
|
|
ops[i].setTemp(ctx.info[ops[i].tempId()].temp);
|
|
|
|
|
instr->operands[i] = ops[i];
|
|
|
|
|
}
|
|
|
|
|
instr->definitions[0] = def;
|
|
|
|
|
} else {
|
|
|
|
|
for (unsigned i = 0; i < ops.size(); i++) {
|
|
|
|
|
assert(instr->operands[i] == ops[i]);
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-05-06 17:24:38 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
|
2022-03-16 15:14:29 +01:00
|
|
|
|
|
|
|
|
if (instr->operands.size() == 2) {
|
|
|
|
|
/* check if this is created from split_vector */
|
|
|
|
|
if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_split()) {
|
|
|
|
|
Instruction* split = ctx.info[instr->operands[1].tempId()].instr;
|
|
|
|
|
if (instr->operands[0].isTemp() &&
|
|
|
|
|
instr->operands[0].getTemp() == split->definitions[0].getTemp())
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(split->operands[0].getTemp());
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::p_split_vector: {
|
2020-05-18 19:42:40 +01:00
|
|
|
ssa_info& info = ctx.info[instr->operands[0].tempId()];
|
|
|
|
|
|
|
|
|
|
if (info.is_constant_or_literal(32)) {
|
2021-11-25 07:36:10 +01:00
|
|
|
uint64_t val = info.val;
|
2020-05-18 19:42:40 +01:00
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.info[def.tempId()].set_constant(ctx.program->gfx_level, val & mask);
|
2020-05-18 19:42:40 +01:00
|
|
|
val >>= def.bytes() * 8u;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
} else if (!info.is_vec()) {
|
2022-03-16 15:14:29 +01:00
|
|
|
if (instr->definitions.size() == 2 && instr->operands[0].isTemp() &&
|
|
|
|
|
instr->definitions[0].bytes() == instr->definitions[1].bytes()) {
|
|
|
|
|
ctx.info[instr->definitions[1].tempId()].set_split(instr.get());
|
|
|
|
|
if (instr->operands[0].bytes() == 4) {
|
|
|
|
|
/* D16 subdword split */
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
aco/optimizer: apply extract from subdword p_split_vector
Totals from 1345 (1.00% of 134572) affected shaders: (GFX10.3)
VGPRs: 76752 -> 76744 (-0.01%); split: -0.02%, +0.01%
SpillSGPRs: 1459 -> 1460 (+0.07%)
SpillVGPRs: 1776 -> 1784 (+0.45%); split: -0.39%, +0.84%
CodeSize: 13310964 -> 13309420 (-0.01%); split: -0.06%, +0.05%
Scratch: 178176 -> 179200 (+0.57%)
Instrs: 2516874 -> 2516860 (-0.00%); split: -0.05%, +0.05%
Latency: 23228506 -> 23230338 (+0.01%); split: -0.14%, +0.15%
InvThroughput: 6002384 -> 6000158 (-0.04%); split: -0.24%, +0.21%
VClause: 41115 -> 41117 (+0.00%); split: -0.28%, +0.29%
SClause: 104639 -> 104664 (+0.02%); split: -0.07%, +0.09%
Copies: 185121 -> 184862 (-0.14%); split: -0.69%, +0.55%
Branches: 100740 -> 100735 (-0.00%); split: -0.01%, +0.00%
PreVGPRs: 70119 -> 69968 (-0.22%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13576>
2021-10-25 16:44:42 +02:00
|
|
|
ctx.info[instr->definitions[1].tempId()].set_extract(instr.get());
|
2022-03-16 15:14:29 +01:00
|
|
|
}
|
aco/optimizer: apply extract from subdword p_split_vector
Totals from 1345 (1.00% of 134572) affected shaders: (GFX10.3)
VGPRs: 76752 -> 76744 (-0.01%); split: -0.02%, +0.01%
SpillSGPRs: 1459 -> 1460 (+0.07%)
SpillVGPRs: 1776 -> 1784 (+0.45%); split: -0.39%, +0.84%
CodeSize: 13310964 -> 13309420 (-0.01%); split: -0.06%, +0.05%
Scratch: 178176 -> 179200 (+0.57%)
Instrs: 2516874 -> 2516860 (-0.00%); split: -0.05%, +0.05%
Latency: 23228506 -> 23230338 (+0.01%); split: -0.14%, +0.15%
InvThroughput: 6002384 -> 6000158 (-0.04%); split: -0.24%, +0.21%
VClause: 41115 -> 41117 (+0.00%); split: -0.28%, +0.29%
SClause: 104639 -> 104664 (+0.02%); split: -0.07%, +0.09%
Copies: 185121 -> 184862 (-0.14%); split: -0.69%, +0.55%
Branches: 100740 -> 100735 (-0.00%); split: -0.01%, +0.00%
PreVGPRs: 70119 -> 69968 (-0.22%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13576>
2021-10-25 16:44:42 +02:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
2020-05-18 19:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
|
2020-04-10 13:09:54 +01:00
|
|
|
unsigned split_offset = 0;
|
|
|
|
|
unsigned vec_offset = 0;
|
|
|
|
|
unsigned vec_index = 0;
|
2021-06-09 10:14:54 +02:00
|
|
|
for (unsigned i = 0; i < instr->definitions.size();
|
|
|
|
|
split_offset += instr->definitions[i++].bytes()) {
|
2020-04-10 13:09:54 +01:00
|
|
|
while (vec_offset < split_offset && vec_index < vec->operands.size())
|
|
|
|
|
vec_offset += vec->operands[vec_index++].bytes();
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
if (vec_offset != split_offset ||
|
|
|
|
|
vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
|
2020-04-10 13:09:54 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Operand vec_op = vec->operands[vec_index];
|
2019-09-17 13:22:17 +02:00
|
|
|
if (vec_op.isConstant()) {
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->gfx_level,
|
2021-06-09 10:14:54 +02:00
|
|
|
vec_op.constantValue64());
|
2020-04-16 20:18:23 +01:00
|
|
|
} else if (vec_op.isUndefined()) {
|
|
|
|
|
ctx.info[instr->definitions[i].tempId()].set_undefined();
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
assert(vec_op.isTemp());
|
|
|
|
|
ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::p_extract_vector: { /* mov */
|
2020-05-18 19:42:40 +01:00
|
|
|
ssa_info& info = ctx.info[instr->operands[0].tempId()];
|
|
|
|
|
const unsigned index = instr->operands[1].constantValue();
|
|
|
|
|
const unsigned dst_offset = index * instr->definitions[0].bytes();
|
|
|
|
|
|
2021-01-15 09:23:04 +01:00
|
|
|
if (info.is_vec()) {
|
|
|
|
|
/* check if we index directly into a vector element */
|
|
|
|
|
Instruction* vec = info.instr;
|
|
|
|
|
unsigned offset = 0;
|
|
|
|
|
|
|
|
|
|
for (const Operand& op : vec->operands) {
|
|
|
|
|
if (offset < dst_offset) {
|
|
|
|
|
offset += op.bytes();
|
|
|
|
|
continue;
|
|
|
|
|
} else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
instr->operands[0] = op;
|
2020-04-10 11:52:13 +01:00
|
|
|
break;
|
|
|
|
|
}
|
2021-01-15 09:23:04 +01:00
|
|
|
} else if (info.is_constant_or_literal(32)) {
|
|
|
|
|
/* propagate constants */
|
|
|
|
|
uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
|
|
|
|
|
uint32_t val = (info.val >> (dst_offset * 8u)) & mask;
|
2021-06-09 10:14:54 +02:00
|
|
|
instr->operands[0] =
|
2022-05-12 02:50:17 -04:00
|
|
|
Operand::get_const(ctx.program->gfx_level, val, instr->definitions[0].bytes());
|
2021-06-09 10:14:54 +02:00
|
|
|
;
|
2021-01-15 09:23:04 +01:00
|
|
|
}
|
2019-11-13 11:14:51 +01:00
|
|
|
|
2021-10-04 11:13:08 +01:00
|
|
|
if (instr->operands[0].bytes() != instr->definitions[0].bytes()) {
|
|
|
|
|
if (instr->operands[0].size() != 1)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (index == 0)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
|
|
|
|
else
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
|
2020-04-10 11:52:13 +01:00
|
|
|
break;
|
2021-10-04 11:13:08 +01:00
|
|
|
}
|
2021-01-15 09:23:04 +01:00
|
|
|
|
|
|
|
|
/* convert this extract into a copy instruction */
|
|
|
|
|
instr->opcode = aco_opcode::p_parallelcopy;
|
|
|
|
|
instr->operands.pop_back();
|
|
|
|
|
FALLTHROUGH;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-10-15 15:18:40 +01:00
|
|
|
case aco_opcode::p_parallelcopy: /* propagate */
|
2020-10-15 14:49:34 +01:00
|
|
|
if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
|
|
|
|
|
instr->operands[0].regClass() != instr->definitions[0].regClass()) {
|
|
|
|
|
/* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so
|
|
|
|
|
* duplicate the vector instead.
|
|
|
|
|
*/
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
|
2020-10-15 14:49:34 +01:00
|
|
|
aco_ptr<Instruction> old_copy = std::move(instr);
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
instr.reset(create_instruction<Pseudo_instruction>(
|
|
|
|
|
aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1));
|
2020-10-15 14:49:34 +01:00
|
|
|
instr->definitions[0] = old_copy->definitions[0];
|
|
|
|
|
std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin());
|
|
|
|
|
for (unsigned i = 0; i < vec->operands.size(); i++) {
|
|
|
|
|
Operand& op = instr->operands[i];
|
|
|
|
|
if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
|
|
|
|
|
ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
|
|
|
|
|
op.setTemp(ctx.info[op.tempId()].temp);
|
|
|
|
|
}
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-12-01 09:54:31 +00:00
|
|
|
FALLTHROUGH;
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::p_as_uniform:
|
|
|
|
|
if (instr->definitions[0].isFixed()) {
|
|
|
|
|
/* don't copy-propagate copies into fixed registers */
|
2019-10-29 13:59:59 +00:00
|
|
|
} else if (instr->usesModifiers()) {
|
|
|
|
|
// TODO
|
2019-09-17 13:22:17 +02:00
|
|
|
} else if (instr->operands[0].isConstant()) {
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_constant(
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.program->gfx_level, instr->operands[0].constantValue64());
|
2019-09-17 13:22:17 +02:00
|
|
|
} else if (instr->operands[0].isTemp()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
if (ctx.info[instr->operands[0].tempId()].is_canonicalized())
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
assert(instr->operands[0].isFixed());
|
|
|
|
|
}
|
|
|
|
|
break;
|
2020-06-30 15:33:18 +01:00
|
|
|
case aco_opcode::v_mov_b32:
|
2021-11-29 00:12:04 +09:00
|
|
|
if (instr->isDPP16()) {
|
2020-06-30 15:33:18 +01:00
|
|
|
/* anything else doesn't make sense in SSA */
|
2021-11-29 00:12:04 +09:00
|
|
|
assert(instr->dpp16().row_mask == 0xf && instr->dpp16().bank_mask == 0xf);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_dpp16(instr.get());
|
|
|
|
|
} else if (instr->isDPP8()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_dpp8(instr.get());
|
2020-06-30 15:33:18 +01:00
|
|
|
}
|
|
|
|
|
break;
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::p_is_helper:
|
|
|
|
|
if (!ctx.program->needs_wqm)
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u);
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
2021-06-30 19:20:49 +02:00
|
|
|
case aco_opcode::v_mul_f64: ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); break;
|
2020-05-15 15:12:33 +01:00
|
|
|
case aco_opcode::v_mul_f16:
|
2021-09-21 17:03:05 +01:00
|
|
|
case aco_opcode::v_mul_f32:
|
|
|
|
|
case aco_opcode::v_mul_legacy_f32: { /* omod */
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* TODO: try to move the negate/abs modifier to the consumer instead */
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
bool uses_mods = instr->usesModifiers();
|
2020-05-15 15:12:33 +01:00
|
|
|
bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
if (!instr->isDPP() && !instr->isSDWA() &&
|
2021-06-09 10:14:54 +02:00
|
|
|
(instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */
|
|
|
|
|
bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u);
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
VOP3_instruction* vop3 = instr->isVOP3() ? &instr->vop3() : NULL;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
bool abs = vop3 && vop3->abs[i];
|
|
|
|
|
bool neg = neg1 ^ (vop3 && vop3->neg[i]);
|
|
|
|
|
|
|
|
|
|
Temp other = instr->operands[i].getTemp();
|
|
|
|
|
if (abs && neg && other.type() == RegType::vgpr)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_neg_abs(other);
|
|
|
|
|
else if (abs && !neg && other.type() == RegType::vgpr)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_abs(other);
|
|
|
|
|
else if (!abs && neg && other.type() == RegType::vgpr)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_neg(other);
|
|
|
|
|
else if (!abs && !neg)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other);
|
|
|
|
|
} else if (uses_mods) {
|
|
|
|
|
continue;
|
2021-06-09 10:14:54 +02:00
|
|
|
} else if (instr->operands[!i].constantValue() ==
|
|
|
|
|
(fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
|
2021-06-09 10:14:54 +02:00
|
|
|
} else if (instr->operands[!i].constantValue() ==
|
|
|
|
|
(fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
|
2021-06-09 10:14:54 +02:00
|
|
|
} else if (instr->operands[!i].constantValue() ==
|
|
|
|
|
(fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
|
2020-06-17 14:57:57 +01:00
|
|
|
} else if (instr->operands[!i].constantValue() == 0u &&
|
2021-09-21 17:03:05 +01:00
|
|
|
(!(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64
|
|
|
|
|
: ctx.fp_mode.preserve_signed_zero_inf_nan32) ||
|
|
|
|
|
instr->opcode == aco_opcode::v_mul_legacy_f32)) { /* 0.0 */
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-11-02 15:34:25 +01:00
|
|
|
case aco_opcode::v_mul_lo_u16:
|
2021-07-01 18:48:09 +02:00
|
|
|
case aco_opcode::v_mul_lo_u16_e64:
|
2020-06-05 17:36:29 +01:00
|
|
|
case aco_opcode::v_mul_u32_u24:
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
|
|
|
|
|
break;
|
2020-05-15 15:12:33 +01:00
|
|
|
case aco_opcode::v_med3_f16:
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::v_med3_f32: { /* clamp */
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& vop3 = instr->vop3();
|
2021-06-09 10:14:54 +02:00
|
|
|
if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || vop3.neg[0] || vop3.neg[1] || vop3.neg[2] ||
|
2021-01-21 16:13:34 +00:00
|
|
|
vop3.omod != 0 || vop3.opsel != 0)
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
unsigned idx = 0;
|
|
|
|
|
bool found_zero = false, found_one = false;
|
2020-05-15 15:12:33 +01:00
|
|
|
bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
|
2021-06-09 10:14:54 +02:00
|
|
|
for (unsigned i = 0; i < 3; i++) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (instr->operands[i].constantEquals(0))
|
|
|
|
|
found_zero = true;
|
2020-05-15 15:12:33 +01:00
|
|
|
else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
|
2019-09-17 13:22:17 +02:00
|
|
|
found_one = true;
|
|
|
|
|
else
|
|
|
|
|
idx = i;
|
|
|
|
|
}
|
2020-08-12 15:58:32 +01:00
|
|
|
if (found_zero && found_one && instr->operands[idx].isTemp())
|
|
|
|
|
ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_cndmask_b32:
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF))
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
|
|
|
|
|
else if (instr->operands[0].constantEquals(0) &&
|
2020-04-02 17:41:36 +02:00
|
|
|
instr->operands[1].constantEquals(0x3f800000u))
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
|
2021-06-09 10:14:54 +02:00
|
|
|
else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1))
|
2020-04-02 17:41:36 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
|
2020-01-07 10:12:08 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
case aco_opcode::v_cmp_lg_u32:
|
|
|
|
|
if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
|
2021-06-09 10:14:54 +02:00
|
|
|
instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() &&
|
|
|
|
|
ctx.info[instr->operands[1].tempId()].is_vcc())
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(
|
|
|
|
|
ctx.info[instr->operands[1].tempId()].temp);
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
case aco_opcode::p_linear_phi: {
|
|
|
|
|
/* lower_bool_phis() can create phis like this */
|
|
|
|
|
bool all_same_temp = instr->operands[0].isTemp();
|
|
|
|
|
/* this check is needed when moving uniform loop counters out of a divergent loop */
|
|
|
|
|
if (all_same_temp)
|
|
|
|
|
all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
|
|
|
|
|
for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (!instr->operands[i].isTemp() ||
|
|
|
|
|
instr->operands[i].tempId() != instr->operands[0].tempId())
|
2019-09-17 13:22:17 +02:00
|
|
|
all_same_temp = false;
|
|
|
|
|
}
|
|
|
|
|
if (all_same_temp) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
|
|
|
|
} else {
|
|
|
|
|
bool all_undef = instr->operands[0].isUndefined();
|
|
|
|
|
for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
|
|
|
|
|
if (!instr->operands[i].isUndefined())
|
|
|
|
|
all_undef = false;
|
|
|
|
|
}
|
|
|
|
|
if (all_undef)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_undefined();
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_add_u32:
|
|
|
|
|
case aco_opcode::v_add_co_u32:
|
2020-02-21 12:02:06 +00:00
|
|
|
case aco_opcode::v_add_co_u32_e64:
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::s_add_i32:
|
|
|
|
|
case aco_opcode::s_add_u32:
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
case aco_opcode::v_subbrev_co_u32:
|
2022-05-19 15:34:04 +01:00
|
|
|
case aco_opcode::v_sub_u32:
|
|
|
|
|
case aco_opcode::v_sub_i32:
|
|
|
|
|
case aco_opcode::v_sub_co_u32:
|
|
|
|
|
case aco_opcode::v_sub_co_u32_e64:
|
|
|
|
|
case aco_opcode::s_sub_u32:
|
|
|
|
|
case aco_opcode::s_sub_i32:
|
|
|
|
|
case aco_opcode::v_subrev_u32:
|
|
|
|
|
case aco_opcode::v_subrev_co_u32:
|
|
|
|
|
case aco_opcode::v_subrev_co_u32_e64:
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
|
|
|
|
|
break;
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
case aco_opcode::s_not_b32:
|
|
|
|
|
case aco_opcode::s_not_b64:
|
|
|
|
|
if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.info[instr->definitions[1].tempId()].set_scc_invert(
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].temp);
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
} else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.info[instr->definitions[1].tempId()].set_scc_invert(
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
}
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
|
|
|
|
|
break;
|
2019-11-22 11:57:45 +01:00
|
|
|
case aco_opcode::s_and_b32:
|
2019-11-05 11:41:00 +01:00
|
|
|
case aco_opcode::s_and_b64:
|
2020-01-28 12:04:48 +00:00
|
|
|
if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
|
2020-01-03 10:30:04 +01:00
|
|
|
if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a
|
|
|
|
|
* uniform bool into divergent */
|
|
|
|
|
ctx.info[instr->definitions[1].tempId()].set_temp(
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].temp);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].temp);
|
2020-01-03 10:30:04 +01:00
|
|
|
break;
|
|
|
|
|
} else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction
|
|
|
|
|
* already produces the same SCC */
|
|
|
|
|
ctx.info[instr->definitions[1].tempId()].set_temp(
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
|
2020-01-03 10:30:04 +01:00
|
|
|
break;
|
2021-06-18 15:25:35 +02:00
|
|
|
} else if ((ctx.program->stage.num_sw_stages() > 1 ||
|
|
|
|
|
ctx.program->stage.hw == HWStage::NGG) &&
|
|
|
|
|
instr->pass_flags == 1) {
|
|
|
|
|
/* In case of merged shaders, pass_flags=1 means that all lanes are active (exec=-1), so
|
|
|
|
|
* s_and is unnecessary. */
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
|
|
|
|
break;
|
2020-01-03 10:30:04 +01:00
|
|
|
}
|
2019-11-05 11:41:00 +01:00
|
|
|
}
|
2020-12-01 09:54:31 +00:00
|
|
|
FALLTHROUGH;
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::s_or_b32:
|
|
|
|
|
case aco_opcode::s_or_b64:
|
|
|
|
|
case aco_opcode::s_xor_b32:
|
|
|
|
|
case aco_opcode::s_xor_b64:
|
2021-06-09 10:14:54 +02:00
|
|
|
if (std::all_of(instr->operands.begin(), instr->operands.end(),
|
|
|
|
|
[&ctx](const Operand& op)
|
|
|
|
|
{
|
|
|
|
|
return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() ||
|
|
|
|
|
ctx.info[op.tempId()].is_uniform_bitwise());
|
2020-01-03 10:30:04 +01:00
|
|
|
})) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
|
|
|
|
|
}
|
2022-03-30 18:01:45 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
|
|
|
|
|
break;
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::s_lshl_b32:
|
|
|
|
|
case aco_opcode::v_or_b32:
|
|
|
|
|
case aco_opcode::v_lshlrev_b32:
|
2020-11-11 18:42:35 +01:00
|
|
|
case aco_opcode::v_bcnt_u32_b32:
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
case aco_opcode::v_and_b32:
|
|
|
|
|
case aco_opcode::v_xor_b32:
|
2022-03-30 18:01:45 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
case aco_opcode::v_min_f32:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_min_u32:
|
|
|
|
|
case aco_opcode::v_min_i32:
|
|
|
|
|
case aco_opcode::v_min_u16:
|
|
|
|
|
case aco_opcode::v_min_i16:
|
2022-04-29 17:19:09 +01:00
|
|
|
case aco_opcode::v_min_u16_e64:
|
|
|
|
|
case aco_opcode::v_min_i16_e64:
|
2019-09-17 13:22:17 +02:00
|
|
|
case aco_opcode::v_max_f32:
|
|
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_max_u32:
|
|
|
|
|
case aco_opcode::v_max_i32:
|
|
|
|
|
case aco_opcode::v_max_u16:
|
|
|
|
|
case aco_opcode::v_max_i16:
|
2022-04-29 17:19:09 +01:00
|
|
|
case aco_opcode::v_max_u16_e64:
|
|
|
|
|
case aco_opcode::v_max_i16_e64:
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
|
|
|
|
|
break;
|
2019-11-05 11:41:00 +01:00
|
|
|
case aco_opcode::s_cselect_b64:
|
2019-11-22 11:57:45 +01:00
|
|
|
case aco_opcode::s_cselect_b32:
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) {
|
2019-11-05 11:41:00 +01:00
|
|
|
/* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
|
|
|
|
|
}
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
|
|
|
|
|
/* Flip the operands to get rid of the scc_invert instruction */
|
|
|
|
|
std::swap(instr->operands[0], instr->operands[1]);
|
|
|
|
|
instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::p_wqm:
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
|
aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
When possible, get rid of an s_not when all it does is invert the SCC,
and its successor s_cbranch / s_cselect can be inverted instead.
Also modify some parts of instruction_selection to take advantage of
this feature.
Example:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
s2: %3902 = s_cselect_b64 -1, 0, %3900:scc
s2: %407, s1: %3903:scc = s_not_b64 %3902
s2: %3906, s1: %3905:scc = s_and_b64 %407, %0:exec
p_cbranch_z %3905:scc
Can now be optimized to:
s2: %3900, s1: %3899:scc = s_andn2_b64 %0:exec, %406
p_cbranch_nz %3900:scc
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
2019-11-19 13:29:54 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
|
|
|
|
|
}
|
2019-11-05 11:41:00 +01:00
|
|
|
break;
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
case aco_opcode::s_mul_i32:
|
|
|
|
|
/* Testing every uint32_t shows that 0x3f800000*n is never a denormal.
|
|
|
|
|
* This pattern is created from a uniform nir_op_b2f. */
|
|
|
|
|
if (instr->operands[0].constantEquals(0x3f800000u))
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
|
|
|
|
|
break;
|
2020-08-12 14:35:15 +01:00
|
|
|
case aco_opcode::p_extract: {
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (instr->definitions[0].bytes() == 4) {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
|
2021-08-30 17:58:36 +02:00
|
|
|
if (instr->operands[0].regClass() == v1 && parse_insert(instr.get()))
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
|
|
|
|
|
}
|
2020-08-12 14:35:15 +01:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::p_insert: {
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (instr->operands[0].bytes() == 4) {
|
|
|
|
|
if (instr->operands[0].regClass() == v1)
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
|
2021-08-30 17:58:36 +02:00
|
|
|
if (parse_extract(instr.get()))
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
|
2020-08-12 14:35:15 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
}
|
2020-08-12 14:35:15 +01:00
|
|
|
break;
|
|
|
|
|
}
|
2021-08-27 15:45:59 +02:00
|
|
|
case aco_opcode::ds_read_u8:
|
|
|
|
|
case aco_opcode::ds_read_u8_d16:
|
|
|
|
|
case aco_opcode::ds_read_u16:
|
|
|
|
|
case aco_opcode::ds_read_u16_d16: {
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
|
|
|
|
|
break;
|
|
|
|
|
}
|
2022-01-17 16:52:10 +00:00
|
|
|
case aco_opcode::v_cvt_f16_f32: {
|
|
|
|
|
if (instr->operands[0].isTemp())
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].set_f2f16(instr.get());
|
|
|
|
|
break;
|
|
|
|
|
}
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
case aco_opcode::v_cvt_f32_f16: {
|
|
|
|
|
if (instr->operands[0].isTemp())
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_f2f32(instr.get());
|
|
|
|
|
break;
|
|
|
|
|
}
|
2021-06-09 10:14:54 +02:00
|
|
|
default: break;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
|
|
|
|
|
/* Don't remove label_extract if we can't apply the extract to
|
|
|
|
|
* neg/abs instructions because we'll likely combine it into another valu. */
|
|
|
|
|
if (!(ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)))
|
|
|
|
|
check_sdwa_extract(ctx, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
unsigned
|
|
|
|
|
original_temp_id(opt_ctx& ctx, Temp tmp)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (ctx.info[tmp.id()].is_temp())
|
|
|
|
|
return ctx.info[tmp.id()].temp.id();
|
|
|
|
|
else
|
|
|
|
|
return tmp.id();
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
2022-09-28 13:45:25 +02:00
|
|
|
decrease_op_uses_if_dead(opt_ctx& ctx, Instruction* instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2022-09-28 13:45:25 +02:00
|
|
|
if (is_dead(ctx.uses, instr)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
for (const Operand& op : instr->operands) {
|
|
|
|
|
if (op.isTemp())
|
|
|
|
|
ctx.uses[op.tempId()]--;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-09-28 13:45:25 +02:00
|
|
|
void
|
|
|
|
|
decrease_uses(opt_ctx& ctx, Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
ctx.uses[instr->definitions[0].tempId()]--;
|
|
|
|
|
decrease_op_uses_if_dead(ctx, instr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Return `op` unchanged, registering one additional use in ctx.uses when the
 * operand is a temporary. Non-temporary operands are passed through without
 * touching any counter. */
Operand
copy_operand(opt_ctx& ctx, Operand op)
{
   if (op.isTemp()) {
      ++ctx.uses[op.tempId()];
   }

   return op;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Return the instruction recorded in ctx.info for the temporary `op`, or
 * nullptr if it cannot be followed.
 *
 * nullptr is returned when:
 *  - `op` is not a temporary,
 *  - its ctx.info entry carries none of the instr_usedef_labels,
 *  - the temporary has more than one use and `ignore_uses` is false,
 *  - the recorded instruction has exactly two definitions and the second one
 *    is a temporary that still has uses, or
 *  - any operand of the recorded instruction is fixed to exec. */
Instruction*
follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)
{
   if (!op.isTemp())
      return nullptr;

   const unsigned id = op.tempId();

   if (!(ctx.info[id].label & instr_usedef_labels))
      return nullptr;

   if (ctx.uses[id] > 1 && !ignore_uses)
      return nullptr;

   Instruction* def_instr = ctx.info[id].instr;

   if (def_instr->definitions.size() == 2) {
      /* The followed temporary is expected to be the first definition. */
      assert(def_instr->definitions[0].isTemp() && def_instr->definitions[0].tempId() == id);

      const auto& second_def = def_instr->definitions[1];
      if (second_def.isTemp() && ctx.uses[second_def.tempId()])
         return nullptr;
   }

   for (Operand& src : def_instr->operands) {
      if (fixed_to_exec(src))
         return nullptr;
   }

   return def_instr;
}
|
|
|
|
|
|
|
|
|
|
/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
|
|
|
|
|
* s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-12-03 13:37:49 +00:00
|
|
|
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
bool neg[2] = {false, false};
|
|
|
|
|
bool abs[2] = {false, false};
|
2020-01-08 11:49:11 +01:00
|
|
|
uint8_t opsel = 0;
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* op_instr[2];
|
2019-09-17 13:22:17 +02:00
|
|
|
Temp op[2];
|
|
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
unsigned bitsize = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
op_instr[i] = follow_operand(ctx, instr->operands[i], true);
|
|
|
|
|
if (!op_instr[i])
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
|
2020-05-15 20:26:39 +01:00
|
|
|
unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
|
|
|
|
|
return false;
|
|
|
|
|
if (bitsize && op_bitsize != bitsize)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (op_instr[i]->isVOP3()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& vop3 = op_instr[i]->vop3();
|
2021-06-09 10:14:54 +02:00
|
|
|
if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||
|
|
|
|
|
vop3.opsel == 2)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
2021-01-21 16:13:34 +00:00
|
|
|
neg[i] = vop3.neg[0];
|
|
|
|
|
abs[i] = vop3.abs[0];
|
|
|
|
|
opsel |= (vop3.opsel & 1) << i;
|
2019-12-05 14:12:39 +00:00
|
|
|
} else if (op_instr[i]->isSDWA()) {
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Temp op0 = op_instr[i]->operands[0].getTemp();
|
|
|
|
|
Temp op1 = op_instr[i]->operands[1].getTemp();
|
|
|
|
|
if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
op[i] = op1;
|
2020-05-15 20:26:39 +01:00
|
|
|
bitsize = op_bitsize;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2019-11-20 16:31:43 +00:00
|
|
|
if (op[1].type() == RegType::sgpr)
|
|
|
|
|
std::swap(op[0], op[1]);
|
2019-11-20 16:42:17 +00:00
|
|
|
unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
|
2022-05-12 02:50:17 -04:00
|
|
|
if (num_sgprs > (ctx.program->gfx_level >= GFX10 ? 2 : 1))
|
2019-11-20 16:31:43 +00:00
|
|
|
return false;
|
|
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
aco_opcode new_op = aco_opcode::num_opcodes;
|
|
|
|
|
switch (bitsize) {
|
2021-06-09 10:14:54 +02:00
|
|
|
case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break;
|
|
|
|
|
case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break;
|
|
|
|
|
case 64: new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;
|
2020-05-15 20:26:39 +01:00
|
|
|
}
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* new_instr;
|
2019-11-20 16:42:17 +00:00
|
|
|
if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {
|
2021-06-09 10:14:54 +02:00
|
|
|
VOP3_instruction* vop3 =
|
|
|
|
|
create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
vop3->neg[i] = neg[i];
|
|
|
|
|
vop3->abs[i] = abs[i];
|
|
|
|
|
}
|
2020-01-08 11:49:11 +01:00
|
|
|
vop3->opsel = opsel;
|
2021-06-09 10:14:54 +02:00
|
|
|
new_instr = static_cast<Instruction*>(vop3);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
|
|
|
|
|
}
|
2022-09-28 13:45:25 +02:00
|
|
|
new_instr->operands[0] = copy_operand(ctx, Operand(op[0]));
|
|
|
|
|
new_instr->operands[1] = copy_operand(ctx, Operand(op[1]));
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
|
2022-09-28 13:45:25 +02:00
|
|
|
decrease_uses(ctx, op_instr[0]);
|
|
|
|
|
decrease_uses(ctx, op_instr[1]);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
2020-06-19 16:09:48 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
|
|
|
|
|
* s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-12-03 13:37:49 +00:00
|
|
|
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
|
|
|
|
|
aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
|
|
|
|
|
Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!nan_test || !cmp)
|
|
|
|
|
return false;
|
2019-12-05 14:12:39 +00:00
|
|
|
if (nan_test->isSDWA() || cmp->isSDWA())
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-05-15 20:26:39 +01:00
|
|
|
if (get_f32_cmp(cmp->opcode) == expected_nan_test)
|
2019-09-17 13:22:17 +02:00
|
|
|
std::swap(nan_test, cmp);
|
2020-05-15 20:26:39 +01:00
|
|
|
else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
2022-08-15 17:01:06 +01:00
|
|
|
if (!is_fp_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
|
|
|
|
|
unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
|
|
|
|
|
unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
|
|
|
|
|
unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
|
|
|
|
|
if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)
|
|
|
|
|
return false;
|
|
|
|
|
if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* new_instr;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (cmp->isVOP3()) {
|
2021-06-09 10:14:54 +02:00
|
|
|
VOP3_instruction* new_vop3 =
|
|
|
|
|
create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& cmp_vop3 = cmp->vop3();
|
|
|
|
|
memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
|
|
|
|
|
memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
|
|
|
|
|
new_vop3->clamp = cmp_vop3.clamp;
|
|
|
|
|
new_vop3->omod = cmp_vop3.omod;
|
|
|
|
|
new_vop3->opsel = cmp_vop3.opsel;
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr = new_vop3;
|
|
|
|
|
} else {
|
|
|
|
|
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
|
|
|
|
|
}
|
2022-09-28 13:45:25 +02:00
|
|
|
new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
|
|
|
|
|
new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
|
2022-09-28 13:45:25 +02:00
|
|
|
decrease_uses(ctx, nan_test);
|
|
|
|
|
decrease_uses(ctx, cmp);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
2020-06-19 16:09:48 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value)
|
2020-10-07 11:40:45 +01:00
|
|
|
{
|
|
|
|
|
if (op.isConstant()) {
|
|
|
|
|
*value = op.constantValue64();
|
|
|
|
|
return true;
|
|
|
|
|
} else if (op.isTemp()) {
|
|
|
|
|
unsigned id = original_temp_id(ctx, op.getTemp());
|
|
|
|
|
if (!ctx.info[id].is_constant_or_literal(bit_size))
|
|
|
|
|
return false;
|
|
|
|
|
*value = get_constant_op(ctx, ctx.info[id], bit_size).constantValue64();
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Returns whether the low bit_size bits of value encode a floating-point NaN,
 * i.e. all exponent bits are set and the mantissa is non-zero.
 *
 * bit_size 16 and 32 use a 5/10 and 8/23 exponent/mantissa split respectively;
 * every other bit_size is treated as a 64-bit value with an 11/52 split.
 */
bool
is_constant_nan(uint64_t value, unsigned bit_size)
{
   unsigned exponent_bits;
   unsigned mantissa_bits;
   switch (bit_size) {
   case 16:
      exponent_bits = 5;
      mantissa_bits = 10;
      break;
   case 32:
      exponent_bits = 8;
      mantissa_bits = 23;
      break;
   default:
      exponent_bits = 11;
      mantissa_bits = 52;
      break;
   }

   const uint64_t exponent_mask = (uint64_t{1} << exponent_bits) - 1;
   const uint64_t mantissa_mask = (uint64_t{1} << mantissa_bits) - 1;

   const bool exponent_all_ones = ((value >> mantissa_bits) & exponent_mask) == exponent_mask;
   const bool mantissa_nonzero = (value & mantissa_mask) != 0;
   return exponent_all_ones && mantissa_nonzero;
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
|
|
|
|
|
* s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-12-03 13:37:49 +00:00
|
|
|
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
|
|
|
|
|
Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA())
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
2019-12-05 14:12:39 +00:00
|
|
|
if (nan_test->isSDWA() || cmp->isSDWA())
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
|
2020-05-15 20:26:39 +01:00
|
|
|
if (get_f32_cmp(cmp->opcode) == expected_nan_test)
|
2019-09-17 13:22:17 +02:00
|
|
|
std::swap(nan_test, cmp);
|
2020-05-15 20:26:39 +01:00
|
|
|
else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
2020-10-07 11:40:45 +01:00
|
|
|
unsigned bit_size = get_cmp_bitsize(cmp->opcode);
|
2022-08-15 17:01:06 +01:00
|
|
|
if (!is_fp_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
|
|
|
|
|
unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
|
|
|
|
|
if (prop_nan0 != prop_nan1)
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-12-11 16:57:11 +00:00
|
|
|
if (nan_test->isVOP3()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& vop3 = nan_test->vop3();
|
2021-06-09 10:14:54 +02:00
|
|
|
if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||
|
|
|
|
|
vop3.opsel == 2)
|
2019-12-11 16:57:11 +00:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
int constant_operand = -1;
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (cmp->operands[i].isTemp() &&
|
|
|
|
|
original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) {
|
2019-09-17 13:22:17 +02:00
|
|
|
constant_operand = !i;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (constant_operand == -1)
|
|
|
|
|
return false;
|
|
|
|
|
|
2020-10-07 11:40:45 +01:00
|
|
|
uint64_t constant_value;
|
|
|
|
|
if (!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value))
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
2020-10-07 11:40:45 +01:00
|
|
|
if (is_constant_nan(constant_value, bit_size))
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
2019-12-03 13:37:49 +00:00
|
|
|
aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* new_instr;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (cmp->isVOP3()) {
|
2021-06-09 10:14:54 +02:00
|
|
|
VOP3_instruction* new_vop3 =
|
|
|
|
|
create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& cmp_vop3 = cmp->vop3();
|
|
|
|
|
memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
|
|
|
|
|
memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
|
|
|
|
|
new_vop3->clamp = cmp_vop3.clamp;
|
|
|
|
|
new_vop3->omod = cmp_vop3.omod;
|
|
|
|
|
new_vop3->opsel = cmp_vop3.opsel;
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr = new_vop3;
|
|
|
|
|
} else {
|
|
|
|
|
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
|
|
|
|
|
}
|
2022-09-28 13:45:25 +02:00
|
|
|
new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
|
|
|
|
|
new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
|
2022-09-28 13:45:25 +02:00
|
|
|
decrease_uses(ctx, nan_test);
|
|
|
|
|
decrease_uses(ctx, cmp);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
2020-06-19 16:09:48 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-20 22:55:45 +02:00
|
|
|
/* s_not(cmp(a, b)) -> get_inverse(cmp)(a, b) */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-12-16 15:35:14 +00:00
|
|
|
if (ctx.uses[instr->definitions[1].tempId()])
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
2022-08-24 19:13:52 +02:00
|
|
|
if (!instr->operands[0].isTemp() || ctx.uses[instr->operands[0].tempId()] != 1)
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-08-20 22:55:45 +02:00
|
|
|
Instruction* cmp = follow_operand(ctx, instr->operands[0]);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!cmp)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
aco_opcode new_opcode = get_inverse(cmp->opcode);
|
2020-05-19 13:26:21 +01:00
|
|
|
if (new_opcode == aco_opcode::num_opcodes)
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
2022-08-24 19:13:52 +02:00
|
|
|
/* Invert compare instruction and assign this instruction's definition */
|
|
|
|
|
cmp->opcode = new_opcode;
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()] = ctx.info[cmp->definitions[0].tempId()];
|
|
|
|
|
std::swap(instr->definitions[0], cmp->definitions[0]);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-08-24 19:13:52 +02:00
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
2019-09-17 13:22:17 +02:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* op1(op2(1, 2), 0) if swap = false
|
|
|
|
|
* op1(0, op2(1, 2)) if swap = true */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap,
|
|
|
|
|
const char* shuffle_str, Operand operands[3], bool neg[3], bool abs[3],
|
|
|
|
|
uint8_t* opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg,
|
|
|
|
|
bool* inbetween_abs, bool* inbetween_opsel, bool* precise)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
/* checks */
|
|
|
|
|
if (op1_instr->opcode != op1)
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!op2_instr || op2_instr->opcode != op2)
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
VOP3_instruction* op1_vop3 = op1_instr->isVOP3() ? &op1_instr->vop3() : NULL;
|
|
|
|
|
VOP3_instruction* op2_vop3 = op2_instr->isVOP3() ? &op2_instr->vop3() : NULL;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
if (op1_instr->isSDWA() || op2_instr->isSDWA())
|
|
|
|
|
return false;
|
2021-07-19 14:26:42 +01:00
|
|
|
if (op1_instr->isDPP() || op2_instr->isDPP())
|
|
|
|
|
return false;
|
2019-12-05 14:12:39 +00:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* don't support inbetween clamp/omod */
|
|
|
|
|
if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* get operands and modifiers and check inbetween modifiers */
|
|
|
|
|
*op1_clamp = op1_vop3 ? op1_vop3->clamp : false;
|
|
|
|
|
*op1_omod = op1_vop3 ? op1_vop3->omod : 0u;
|
|
|
|
|
|
|
|
|
|
if (inbetween_neg)
|
|
|
|
|
*inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;
|
|
|
|
|
else if (op1_vop3 && op1_vop3->neg[swap])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (inbetween_abs)
|
|
|
|
|
*inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;
|
|
|
|
|
else if (op1_vop3 && op1_vop3->abs[swap])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (inbetween_opsel)
|
2020-11-26 22:08:42 -08:00
|
|
|
*inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << (unsigned)swap) : false;
|
|
|
|
|
else if (op1_vop3 && op1_vop3->opsel & (1 << (unsigned)swap))
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
*precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise();
|
2020-10-07 11:09:16 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
int shuffle[3];
|
|
|
|
|
shuffle[shuffle_str[0] - '0'] = 0;
|
|
|
|
|
shuffle[shuffle_str[1] - '0'] = 1;
|
|
|
|
|
shuffle[shuffle_str[2] - '0'] = 2;
|
|
|
|
|
|
|
|
|
|
operands[shuffle[0]] = op1_instr->operands[!swap];
|
|
|
|
|
neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false;
|
|
|
|
|
abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;
|
2020-11-26 22:08:42 -08:00
|
|
|
if (op1_vop3 && (op1_vop3->opsel & (1 << (unsigned)!swap)))
|
2020-01-08 11:49:11 +01:00
|
|
|
*opsel |= 1 << shuffle[0];
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
operands[shuffle[i + 1]] = op2_instr->operands[i];
|
|
|
|
|
neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;
|
|
|
|
|
abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;
|
2020-01-08 11:49:11 +01:00
|
|
|
if (op2_vop3 && op2_vop3->opsel & (1 << i))
|
|
|
|
|
*opsel |= 1 << shuffle[i + 1];
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* check operands */
|
2019-11-22 14:50:41 +00:00
|
|
|
if (!check_vop3_operands(ctx, 3, operands))
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
|
|
|
|
|
Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, bool clamp,
|
|
|
|
|
unsigned omod)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2021-06-09 10:14:54 +02:00
|
|
|
VOP3_instruction* new_instr = create_instruction<VOP3_instruction>(opcode, Format::VOP3, 3, 1);
|
2019-09-17 13:22:17 +02:00
|
|
|
memcpy(new_instr->abs, abs, sizeof(bool[3]));
|
|
|
|
|
memcpy(new_instr->neg, neg, sizeof(bool[3]));
|
|
|
|
|
new_instr->clamp = clamp;
|
|
|
|
|
new_instr->omod = omod;
|
2020-01-08 11:49:11 +01:00
|
|
|
new_instr->opsel = opsel;
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr->operands[0] = operands[0];
|
|
|
|
|
new_instr->operands[1] = operands[1];
|
|
|
|
|
new_instr->operands[2] = operands[2];
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
|
|
|
|
instr.reset(new_instr);
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op,
|
|
|
|
|
const char* shuffle, uint8_t ops)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
for (unsigned swap = 0; swap < 2; swap++) {
|
|
|
|
|
if (!((1 << swap) & ops))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Operand operands[3];
|
2020-10-07 11:09:16 +01:00
|
|
|
bool neg[3], abs[3], clamp, precise;
|
2020-01-08 11:49:11 +01:00
|
|
|
uint8_t opsel = 0, omod = 0;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg,
|
|
|
|
|
abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[instr->operands[swap].tempId()]--;
|
|
|
|
|
create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-12 14:35:15 +01:00
|
|
|
/* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2020-08-12 14:35:15 +01:00
|
|
|
{
|
|
|
|
|
bool is_or = instr->opcode == aco_opcode::v_or_b32;
|
|
|
|
|
aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,
|
|
|
|
|
"120", 1 | 2))
|
2020-08-12 14:35:15 +01:00
|
|
|
return true;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,
|
|
|
|
|
"120", 1 | 2))
|
2020-08-12 14:35:15 +01:00
|
|
|
return true;
|
|
|
|
|
if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))
|
|
|
|
|
return true;
|
|
|
|
|
if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
|
|
|
|
|
return true;
|
|
|
|
|
|
2021-07-19 14:26:42 +01:00
|
|
|
if (instr->isSDWA() || instr->isDPP())
|
2020-08-12 14:35:15 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
|
|
|
|
|
* v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
|
|
|
|
|
* v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
|
|
|
|
|
* v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b)
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* extins = follow_operand(ctx, instr->operands[i]);
|
2020-08-12 14:35:15 +01:00
|
|
|
if (!extins)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
aco_opcode op;
|
|
|
|
|
Operand operands[3];
|
|
|
|
|
|
|
|
|
|
if (extins->opcode == aco_opcode::p_insert &&
|
|
|
|
|
(extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {
|
|
|
|
|
op = new_op_lshl;
|
2021-06-09 10:14:54 +02:00
|
|
|
operands[1] =
|
2021-07-13 11:22:46 +02:00
|
|
|
Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());
|
2021-06-09 10:14:54 +02:00
|
|
|
} else if (is_or &&
|
|
|
|
|
(extins->opcode == aco_opcode::p_insert ||
|
|
|
|
|
(extins->opcode == aco_opcode::p_extract &&
|
|
|
|
|
extins->operands[3].constantEquals(0))) &&
|
2020-08-12 14:35:15 +01:00
|
|
|
extins->operands[1].constantEquals(0)) {
|
|
|
|
|
op = aco_opcode::v_and_or_b32;
|
2021-07-13 11:22:46 +02:00
|
|
|
operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu);
|
2020-08-12 14:35:15 +01:00
|
|
|
} else {
|
2021-06-09 10:14:54 +02:00
|
|
|
continue;
|
2020-08-12 14:35:15 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
operands[0] = extins->operands[0];
|
|
|
|
|
operands[2] = instr->operands[!i];
|
|
|
|
|
|
|
|
|
|
if (!check_vop3_operands(ctx, 3, operands))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
bool neg[3] = {}, abs[3] = {};
|
|
|
|
|
uint8_t opsel = 0, omod = 0;
|
|
|
|
|
bool clamp = false;
|
|
|
|
|
if (instr->isVOP3())
|
|
|
|
|
clamp = instr->vop3().clamp;
|
|
|
|
|
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode op3src,
|
|
|
|
|
aco_opcode minmax)
|
2019-11-22 20:32:11 +00:00
|
|
|
{
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
/* TODO: this can handle SDWA min/max instructions by using opsel */
|
2019-11-22 20:32:11 +00:00
|
|
|
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
/* min(min(a, b), c) -> min3(a, b, c)
|
|
|
|
|
* max(max(a, b), c) -> max3(a, b, c)
|
|
|
|
|
* gfx11: min(-min(a, b), c) -> maxmin(-a, -b, c)
|
|
|
|
|
* gfx11: max(-max(a, b), c) -> minmax(-a, -b, c)
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned swap = 0; swap < 2; swap++) {
|
|
|
|
|
Operand operands[3];
|
|
|
|
|
bool neg[3], abs[3], clamp, precise;
|
|
|
|
|
uint8_t opsel = 0, omod = 0;
|
|
|
|
|
bool inbetween_neg;
|
|
|
|
|
if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, "120", operands,
|
|
|
|
|
neg, abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL,
|
|
|
|
|
&precise) &&
|
|
|
|
|
(!inbetween_neg ||
|
|
|
|
|
(minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
|
|
|
|
|
ctx.uses[instr->operands[swap].tempId()]--;
|
|
|
|
|
if (inbetween_neg) {
|
|
|
|
|
neg[0] = !neg[0];
|
|
|
|
|
neg[1] = !neg[1];
|
|
|
|
|
create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
} else {
|
|
|
|
|
create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* min(-max(a, b), c) -> min3(-a, -b, c)
|
|
|
|
|
* max(-min(a, b), c) -> max3(-a, -b, c)
|
|
|
|
|
* gfx11: min(max(a, b), c) -> maxmin(a, b, c)
|
|
|
|
|
* gfx11: max(min(a, b), c) -> minmax(a, b, c)
|
|
|
|
|
*/
|
2019-11-22 20:32:11 +00:00
|
|
|
for (unsigned swap = 0; swap < 2; swap++) {
|
|
|
|
|
Operand operands[3];
|
2020-10-07 11:09:16 +01:00
|
|
|
bool neg[3], abs[3], clamp, precise;
|
2019-11-22 20:32:11 +00:00
|
|
|
uint8_t opsel = 0, omod = 0;
|
|
|
|
|
bool inbetween_neg;
|
2022-11-16 18:10:38 +00:00
|
|
|
if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "120", operands, neg,
|
2021-06-09 10:14:54 +02:00
|
|
|
abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
(inbetween_neg ||
|
|
|
|
|
(minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
|
2019-11-22 20:32:11 +00:00
|
|
|
ctx.uses[instr->operands[swap].tempId()]--;
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
if (inbetween_neg) {
|
|
|
|
|
neg[0] = !neg[0];
|
|
|
|
|
neg[1] = !neg[1];
|
|
|
|
|
create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
} else {
|
|
|
|
|
create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
}
|
2019-11-22 20:32:11 +00:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
|
|
|
|
|
* s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
|
|
|
|
|
* s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
|
|
|
|
|
* s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
|
|
|
|
|
* s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
|
|
|
|
|
* s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
/* checks */
|
|
|
|
|
if (!instr->operands[0].isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* op2_instr = follow_operand(ctx, instr->operands[0]);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!op2_instr)
|
|
|
|
|
return false;
|
|
|
|
|
switch (op2_instr->opcode) {
|
|
|
|
|
case aco_opcode::s_and_b32:
|
|
|
|
|
case aco_opcode::s_or_b32:
|
|
|
|
|
case aco_opcode::s_xor_b32:
|
|
|
|
|
case aco_opcode::s_and_b64:
|
|
|
|
|
case aco_opcode::s_or_b64:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::s_xor_b64: break;
|
|
|
|
|
default: return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* create instruction */
|
|
|
|
|
std::swap(instr->definitions[0], op2_instr->definitions[0]);
|
2020-01-28 12:32:09 +01:00
|
|
|
std::swap(instr->definitions[1], op2_instr->definitions[1]);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
|
|
|
|
ctx.info[op2_instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
|
|
|
|
switch (op2_instr->opcode) {
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;
|
|
|
|
|
case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;
|
|
|
|
|
case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;
|
|
|
|
|
case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;
|
|
|
|
|
case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;
|
|
|
|
|
case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;
|
|
|
|
|
default: break;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
|
|
|
|
|
* s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
|
|
|
|
|
* s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
|
|
|
|
|
* s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2020-02-05 11:19:06 +01:00
|
|
|
if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
|
|
|
|
|
return false;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* op2_instr = follow_operand(ctx, instr->operands[i]);
|
|
|
|
|
if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 &&
|
|
|
|
|
op2_instr->opcode != aco_opcode::s_not_b64))
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
2022-09-28 14:37:19 +02:00
|
|
|
if (ctx.uses[op2_instr->definitions[1].tempId()])
|
2020-01-28 12:04:48 +00:00
|
|
|
continue;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-11-22 14:34:24 +00:00
|
|
|
if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
|
|
|
|
|
instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
|
|
|
|
|
continue;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
instr->operands[0] = instr->operands[!i];
|
|
|
|
|
instr->operands[1] = op2_instr->operands[0];
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
|
|
|
|
switch (instr->opcode) {
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break;
|
|
|
|
|
case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break;
|
|
|
|
|
case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break;
|
|
|
|
|
case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break;
|
|
|
|
|
default: break;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2020-01-28 12:05:26 +00:00
|
|
|
if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
|
2019-09-17 13:22:17 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true);
|
2020-01-28 12:05:26 +00:00
|
|
|
if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
|
|
|
|
|
ctx.uses[op2_instr->definitions[1].tempId()])
|
2020-01-28 12:04:48 +00:00
|
|
|
continue;
|
2022-09-28 14:37:19 +02:00
|
|
|
if (!op2_instr->operands[1].isConstant())
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
uint32_t shift = op2_instr->operands[1].constantValue();
|
|
|
|
|
if (shift < 1 || shift > 4)
|
|
|
|
|
continue;
|
|
|
|
|
|
2019-11-22 14:34:24 +00:00
|
|
|
if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
|
|
|
|
|
instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
|
|
|
|
|
continue;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[1] = instr->operands[!i];
|
2022-09-28 13:45:25 +02:00
|
|
|
instr->operands[0] = copy_operand(ctx, op2_instr->operands[0]);
|
|
|
|
|
decrease_uses(ctx, op2_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
instr->opcode = std::array<aco_opcode, 4>{
|
|
|
|
|
aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32,
|
|
|
|
|
aco_opcode::s_lshl4_add_u32}[shift - 1];
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2022-09-23 19:00:36 +02:00
|
|
|
/* s_abs_i32(s_sub_[iu]32(a, b)) -> s_absdiff_i32(a, b)
|
|
|
|
|
* s_abs_i32(s_add_[iu]32(a, #b)) -> s_absdiff_i32(a, -b)
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
combine_sabsdiff(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
if (!instr->operands[0].isTemp() || !ctx.info[instr->operands[0].tempId()].is_add_sub())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
Instruction* op_instr = follow_operand(ctx, instr->operands[0], false);
|
|
|
|
|
if (!op_instr)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (op_instr->opcode == aco_opcode::s_add_i32 || op_instr->opcode == aco_opcode::s_add_u32) {
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
uint64_t constant;
|
|
|
|
|
if (op_instr->operands[!i].isLiteral() ||
|
|
|
|
|
!is_operand_constant(ctx, op_instr->operands[i], 32, &constant))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (op_instr->operands[i].isTemp())
|
|
|
|
|
ctx.uses[op_instr->operands[i].tempId()]--;
|
|
|
|
|
op_instr->operands[0] = op_instr->operands[!i];
|
|
|
|
|
op_instr->operands[1] = Operand::c32(-int32_t(constant));
|
|
|
|
|
goto use_absdiff;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
use_absdiff:
|
|
|
|
|
op_instr->opcode = aco_opcode::s_absdiff_i32;
|
|
|
|
|
std::swap(instr->definitions[0], op_instr->definitions[0]);
|
|
|
|
|
std::swap(instr->definitions[1], op_instr->definitions[1]);
|
|
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
aco: Combine bit test to s_bitcmp.
Foz-DB Navi21:
Totals from 6396 (4.74% of 134913) affected shaders:
VGPRs: 483280 -> 483152 (-0.03%); split: -0.03%, +0.01%
SpillSGPRs: 8119 -> 7941 (-2.19%)
CodeSize: 63377880 -> 63268556 (-0.17%); split: -0.20%, +0.03%
MaxWaves: 86778 -> 86810 (+0.04%)
Instrs: 11745621 -> 11725857 (-0.17%); split: -0.20%, +0.03%
Latency: 162400148 -> 162282230 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 29179429 -> 29133173 (-0.16%); split: -0.16%, +0.00%
VClause: 208032 -> 208100 (+0.03%); split: -0.01%, +0.05%
SClause: 431390 -> 430849 (-0.13%); split: -0.24%, +0.11%
Copies: 896222 -> 893285 (-0.33%); split: -0.62%, +0.30%
Branches: 349806 -> 348770 (-0.30%); split: -0.90%, +0.60%
PreSGPRs: 618908 -> 613773 (-0.83%); split: -0.83%, +0.00%
PreVGPRs: 482901 -> 482893 (-0.00%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18870>
2022-09-24 18:49:10 +02:00
|
|
|
/* s_cmp_{lg,eq}(s_and(a, s_lshl(1, b)), 0) -> s_bitcmp[10](a, b)*/
|
|
|
|
|
bool
|
|
|
|
|
combine_s_bitcmp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
bool lg = false;
|
|
|
|
|
bool b64 = false;
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::s_cmp_lg_i32:
|
|
|
|
|
case aco_opcode::s_cmp_lg_u32: lg = true; break;
|
|
|
|
|
case aco_opcode::s_cmp_eq_i32:
|
|
|
|
|
case aco_opcode::s_cmp_eq_u32: break;
|
|
|
|
|
case aco_opcode::s_cmp_lg_u64: lg = true; FALLTHROUGH;
|
|
|
|
|
case aco_opcode::s_cmp_eq_u64: b64 = true; break;
|
|
|
|
|
default: return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aco_opcode s_and = b64 ? aco_opcode::s_and_b64 : aco_opcode::s_and_b32;
|
|
|
|
|
aco_opcode s_lshl = b64 ? aco_opcode::s_lshl_b64 : aco_opcode::s_lshl_b32;
|
|
|
|
|
|
|
|
|
|
for (unsigned cmp_idx = 0; cmp_idx < 2; cmp_idx++) {
|
|
|
|
|
Instruction* and_instr = follow_operand(ctx, instr->operands[cmp_idx], false);
|
|
|
|
|
if (!and_instr || and_instr->opcode != s_and)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
for (unsigned and_idx = 0; and_idx < 2; and_idx++) {
|
|
|
|
|
Instruction* lshl_instr = follow_operand(ctx, and_instr->operands[and_idx], true);
|
|
|
|
|
if (!lshl_instr || lshl_instr->opcode != s_lshl ||
|
|
|
|
|
!lshl_instr->operands[0].constantEquals(1) ||
|
|
|
|
|
(lshl_instr->operands[1].isLiteral() && and_instr->operands[!and_idx].isLiteral()))
|
aco: Combine constant bit test to s_bitcmp.
Foz-DB Navi21:
Totals from 73988 (54.84% of 134913) affected shaders:
VGPRs: 2959768 -> 2959752 (-0.00%)
SpillSGPRs: 10250 -> 10697 (+4.36%); split: -0.64%, +5.00%
SpillVGPRs: 2326 -> 2291 (-1.50%); split: -2.24%, +0.73%
CodeSize: 261339476 -> 261045912 (-0.11%); split: -0.12%, +0.00%
Scratch: 239616 -> 238592 (-0.43%)
Instrs: 49214044 -> 49188242 (-0.05%); split: -0.06%, +0.00%
Latency: 413214139 -> 413296229 (+0.02%); split: -0.03%, +0.05%
InvThroughput: 71741622 -> 71786300 (+0.06%); split: -0.07%, +0.13%
VClause: 856838 -> 856973 (+0.02%); split: -0.01%, +0.02%
SClause: 1504502 -> 1504567 (+0.00%); split: -0.01%, +0.02%
Copies: 4058433 -> 4060424 (+0.05%); split: -0.03%, +0.08%
Branches: 1502953 -> 1502945 (-0.00%); split: -0.00%, +0.00%
PreSGPRs: 3081927 -> 3081531 (-0.01%); split: -0.02%, +0.01%
PreVGPRs: 2513990 -> 2513992 (+0.00%)
The vast majority of instruction count regressions are caused by parallel-rdp.
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18870>
2022-10-18 15:48:21 +02:00
|
|
|
lshl_instr = nullptr;
|
|
|
|
|
|
|
|
|
|
uint64_t constant;
|
|
|
|
|
if (!lshl_instr &&
|
|
|
|
|
(!is_operand_constant(ctx, and_instr->operands[and_idx], b64 ? 64 : 32, &constant) ||
|
|
|
|
|
!util_is_power_of_two_or_zero64(constant) || constant == 0))
|
aco: Combine bit test to s_bitcmp.
Foz-DB Navi21:
Totals from 6396 (4.74% of 134913) affected shaders:
VGPRs: 483280 -> 483152 (-0.03%); split: -0.03%, +0.01%
SpillSGPRs: 8119 -> 7941 (-2.19%)
CodeSize: 63377880 -> 63268556 (-0.17%); split: -0.20%, +0.03%
MaxWaves: 86778 -> 86810 (+0.04%)
Instrs: 11745621 -> 11725857 (-0.17%); split: -0.20%, +0.03%
Latency: 162400148 -> 162282230 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 29179429 -> 29133173 (-0.16%); split: -0.16%, +0.00%
VClause: 208032 -> 208100 (+0.03%); split: -0.01%, +0.05%
SClause: 431390 -> 430849 (-0.13%); split: -0.24%, +0.11%
Copies: 896222 -> 893285 (-0.33%); split: -0.62%, +0.30%
Branches: 349806 -> 348770 (-0.30%); split: -0.90%, +0.60%
PreSGPRs: 618908 -> 613773 (-0.83%); split: -0.83%, +0.00%
PreVGPRs: 482901 -> 482893 (-0.00%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18870>
2022-09-24 18:49:10 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
bool test1 = false;
|
|
|
|
|
if (instr->operands[!cmp_idx].constantEquals(0)) {
|
|
|
|
|
test1 = lg;
|
aco: Combine constant bit test to s_bitcmp.
Foz-DB Navi21:
Totals from 73988 (54.84% of 134913) affected shaders:
VGPRs: 2959768 -> 2959752 (-0.00%)
SpillSGPRs: 10250 -> 10697 (+4.36%); split: -0.64%, +5.00%
SpillVGPRs: 2326 -> 2291 (-1.50%); split: -2.24%, +0.73%
CodeSize: 261339476 -> 261045912 (-0.11%); split: -0.12%, +0.00%
Scratch: 239616 -> 238592 (-0.43%)
Instrs: 49214044 -> 49188242 (-0.05%); split: -0.06%, +0.00%
Latency: 413214139 -> 413296229 (+0.02%); split: -0.03%, +0.05%
InvThroughput: 71741622 -> 71786300 (+0.06%); split: -0.07%, +0.13%
VClause: 856838 -> 856973 (+0.02%); split: -0.01%, +0.02%
SClause: 1504502 -> 1504567 (+0.00%); split: -0.01%, +0.02%
Copies: 4058433 -> 4060424 (+0.05%); split: -0.03%, +0.08%
Branches: 1502953 -> 1502945 (-0.00%); split: -0.00%, +0.00%
PreSGPRs: 3081927 -> 3081531 (-0.01%); split: -0.02%, +0.01%
PreVGPRs: 2513990 -> 2513992 (+0.00%)
The vast majority of instruction count regressions are caused by parallel-rdp.
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18870>
2022-10-18 15:48:21 +02:00
|
|
|
} else if (lshl_instr && instr->operands[!cmp_idx].isTemp() &&
|
aco: Combine bit test to s_bitcmp.
Foz-DB Navi21:
Totals from 6396 (4.74% of 134913) affected shaders:
VGPRs: 483280 -> 483152 (-0.03%); split: -0.03%, +0.01%
SpillSGPRs: 8119 -> 7941 (-2.19%)
CodeSize: 63377880 -> 63268556 (-0.17%); split: -0.20%, +0.03%
MaxWaves: 86778 -> 86810 (+0.04%)
Instrs: 11745621 -> 11725857 (-0.17%); split: -0.20%, +0.03%
Latency: 162400148 -> 162282230 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 29179429 -> 29133173 (-0.16%); split: -0.16%, +0.00%
VClause: 208032 -> 208100 (+0.03%); split: -0.01%, +0.05%
SClause: 431390 -> 430849 (-0.13%); split: -0.24%, +0.11%
Copies: 896222 -> 893285 (-0.33%); split: -0.62%, +0.30%
Branches: 349806 -> 348770 (-0.30%); split: -0.90%, +0.60%
PreSGPRs: 618908 -> 613773 (-0.83%); split: -0.83%, +0.00%
PreVGPRs: 482901 -> 482893 (-0.00%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18870>
2022-09-24 18:49:10 +02:00
|
|
|
instr->operands[!cmp_idx].tempId() == lshl_instr->definitions[0].tempId()) {
|
|
|
|
|
test1 = !lg;
|
|
|
|
|
ctx.uses[lshl_instr->definitions[0].tempId()]--;
|
|
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (test1 && b64)
|
|
|
|
|
instr->opcode = aco_opcode::s_bitcmp1_b64;
|
|
|
|
|
else if (!test1 && b64)
|
|
|
|
|
instr->opcode = aco_opcode::s_bitcmp0_b64;
|
|
|
|
|
else if (test1 && !b64)
|
|
|
|
|
instr->opcode = aco_opcode::s_bitcmp1_b32;
|
|
|
|
|
else
|
|
|
|
|
instr->opcode = aco_opcode::s_bitcmp0_b32;
|
|
|
|
|
|
|
|
|
|
instr->operands[0] = copy_operand(ctx, and_instr->operands[!and_idx]);
|
|
|
|
|
decrease_uses(ctx, and_instr);
|
aco: Combine constant bit test to s_bitcmp.
Foz-DB Navi21:
Totals from 73988 (54.84% of 134913) affected shaders:
VGPRs: 2959768 -> 2959752 (-0.00%)
SpillSGPRs: 10250 -> 10697 (+4.36%); split: -0.64%, +5.00%
SpillVGPRs: 2326 -> 2291 (-1.50%); split: -2.24%, +0.73%
CodeSize: 261339476 -> 261045912 (-0.11%); split: -0.12%, +0.00%
Scratch: 239616 -> 238592 (-0.43%)
Instrs: 49214044 -> 49188242 (-0.05%); split: -0.06%, +0.00%
Latency: 413214139 -> 413296229 (+0.02%); split: -0.03%, +0.05%
InvThroughput: 71741622 -> 71786300 (+0.06%); split: -0.07%, +0.13%
VClause: 856838 -> 856973 (+0.02%); split: -0.01%, +0.02%
SClause: 1504502 -> 1504567 (+0.00%); split: -0.01%, +0.02%
Copies: 4058433 -> 4060424 (+0.05%); split: -0.03%, +0.08%
Branches: 1502953 -> 1502945 (-0.00%); split: -0.00%, +0.00%
PreSGPRs: 3081927 -> 3081531 (-0.01%); split: -0.02%, +0.01%
PreVGPRs: 2513990 -> 2513992 (+0.00%)
The vast majority of instruction count regressions are caused by parallel-rdp.
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18870>
2022-10-18 15:48:21 +02:00
|
|
|
if (lshl_instr) {
|
|
|
|
|
instr->operands[1] = copy_operand(ctx, lshl_instr->operands[1]);
|
|
|
|
|
decrease_op_uses_if_dead(ctx, lshl_instr);
|
|
|
|
|
} else {
|
|
|
|
|
instr->operands[1] = Operand::c32(ffsll(constant) - 1);
|
|
|
|
|
}
|
aco: Combine bit test to s_bitcmp.
Foz-DB Navi21:
Totals from 6396 (4.74% of 134913) affected shaders:
VGPRs: 483280 -> 483152 (-0.03%); split: -0.03%, +0.01%
SpillSGPRs: 8119 -> 7941 (-2.19%)
CodeSize: 63377880 -> 63268556 (-0.17%); split: -0.20%, +0.03%
MaxWaves: 86778 -> 86810 (+0.04%)
Instrs: 11745621 -> 11725857 (-0.17%); split: -0.20%, +0.03%
Latency: 162400148 -> 162282230 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 29179429 -> 29133173 (-0.16%); split: -0.16%, +0.00%
VClause: 208032 -> 208100 (+0.03%); split: -0.01%, +0.05%
SClause: 431390 -> 430849 (-0.13%); split: -0.24%, +0.11%
Copies: 896222 -> 893285 (-0.33%); split: -0.62%, +0.30%
Branches: 349806 -> 348770 (-0.30%); split: -0.90%, +0.60%
PreSGPRs: 618908 -> 613773 (-0.83%); split: -0.83%, +0.00%
PreVGPRs: 482901 -> 482893 (-0.00%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18870>
2022-09-24 18:49:10 +02:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
|
2020-04-02 17:41:36 +02:00
|
|
|
{
|
|
|
|
|
if (instr->usesModifiers())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (!((1 << i) & ops))
|
|
|
|
|
continue;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() &&
|
2020-04-02 17:41:36 +02:00
|
|
|
ctx.uses[instr->operands[i].tempId()] == 1) {
|
|
|
|
|
|
|
|
|
|
aco_ptr<Instruction> new_instr;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->operands[!i].isTemp() &&
|
|
|
|
|
instr->operands[!i].getTemp().type() == RegType::vgpr) {
|
2020-04-02 17:41:36 +02:00
|
|
|
new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (ctx.program->gfx_level >= GFX10 ||
|
2020-04-02 17:41:36 +02:00
|
|
|
(instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
|
2021-06-09 10:14:54 +02:00
|
|
|
new_instr.reset(
|
|
|
|
|
create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
|
2020-04-02 17:41:36 +02:00
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
aco: fix combining add/sub to b2i if a new dest needs to be allocated
The uses vector needs to be expanded to avoid out of bounds access
and to make sure the number of uses is initialized to 0.
This fixes combining more v_and(a, v_subbrev_co_u32).
fossilds-db (Vega10):
Totals from 4574 (3.28% of 139517) affected shaders:
SGPRs: 291625 -> 292217 (+0.20%); split: -0.01%, +0.21%
VGPRs: 276368 -> 276188 (-0.07%); split: -0.07%, +0.01%
SpillSGPRs: 455 -> 533 (+17.14%)
SpillVGPRs: 76 -> 78 (+2.63%)
CodeSize: 23327500 -> 23304152 (-0.10%); split: -0.17%, +0.07%
MaxWaves: 22044 -> 22066 (+0.10%)
Instrs: 4583064 -> 4576301 (-0.15%); split: -0.15%, +0.01%
Cycles: 47925276 -> 47871968 (-0.11%); split: -0.13%, +0.01%
VMEM: 1599363 -> 1597473 (-0.12%); split: +0.08%, -0.19%
SMEM: 331461 -> 331126 (-0.10%); split: +0.08%, -0.18%
VClause: 80639 -> 80696 (+0.07%); split: -0.02%, +0.09%
SClause: 155992 -> 155993 (+0.00%); split: -0.02%, +0.02%
Copies: 333482 -> 333318 (-0.05%); split: -0.12%, +0.07%
Branches: 70967 -> 70968 (+0.00%)
PreSGPRs: 187078 -> 187711 (+0.34%); split: -0.01%, +0.35%
PreVGPRs: 244918 -> 244785 (-0.05%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7513>
2020-11-09 19:42:22 +01:00
|
|
|
if (instr->definitions.size() == 2) {
|
|
|
|
|
new_instr->definitions[1] = instr->definitions[1];
|
|
|
|
|
} else {
|
|
|
|
|
new_instr->definitions[1] =
|
|
|
|
|
Definition(ctx.program->allocateTmp(ctx.program->lane_mask));
|
|
|
|
|
/* Make sure the uses vector is large enough and the number of
|
|
|
|
|
* uses properly initialized to 0.
|
|
|
|
|
*/
|
|
|
|
|
ctx.uses.push_back(0);
|
|
|
|
|
}
|
2021-07-13 11:22:46 +02:00
|
|
|
new_instr->operands[0] = Operand::zero();
|
2020-04-02 17:41:36 +02:00
|
|
|
new_instr->operands[1] = instr->operands[!i];
|
|
|
|
|
new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
|
|
|
|
|
instr = std::move(new_instr);
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
|
2020-04-02 17:41:36 +02:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2020-11-11 18:42:35 +01:00
|
|
|
{
|
|
|
|
|
if (instr->usesModifiers())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
|
|
|
|
|
if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
|
2021-09-28 17:11:28 +01:00
|
|
|
!op_instr->usesModifiers() && op_instr->operands[0].isTemp() &&
|
2020-11-11 18:42:35 +01:00
|
|
|
op_instr->operands[0].getTemp().type() == RegType::vgpr &&
|
|
|
|
|
op_instr->operands[1].constantEquals(0)) {
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<Instruction> new_instr{
|
|
|
|
|
create_instruction<VOP3_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};
|
2020-11-11 18:42:35 +01:00
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
new_instr->operands[0] = op_instr->operands[0];
|
|
|
|
|
new_instr->operands[1] = instr->operands[!i];
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
instr = std::move(new_instr);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
aco_opcode* med3, aco_opcode* minmax, bool* some_gfx9_only)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
switch (op) {
|
2021-06-09 10:14:54 +02:00
|
|
|
#define MINMAX(type, gfx9) \
|
|
|
|
|
case aco_opcode::v_min_##type: \
|
|
|
|
|
case aco_opcode::v_max_##type: \
|
|
|
|
|
*min = aco_opcode::v_min_##type; \
|
|
|
|
|
*max = aco_opcode::v_max_##type; \
|
|
|
|
|
*med3 = aco_opcode::v_med3_##type; \
|
|
|
|
|
*min3 = aco_opcode::v_min3_##type; \
|
|
|
|
|
*max3 = aco_opcode::v_max3_##type; \
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
*minmax = op == *min ? aco_opcode::v_maxmin_##type : aco_opcode::v_minmax_##type; \
|
|
|
|
|
*some_gfx9_only = gfx9; \
|
|
|
|
|
return true;
|
|
|
|
|
#define MINMAX_INT16(type, gfx9) \
|
|
|
|
|
case aco_opcode::v_min_##type: \
|
|
|
|
|
case aco_opcode::v_max_##type: \
|
|
|
|
|
*min = aco_opcode::v_min_##type; \
|
|
|
|
|
*max = aco_opcode::v_max_##type; \
|
|
|
|
|
*med3 = aco_opcode::v_med3_##type; \
|
|
|
|
|
*min3 = aco_opcode::v_min3_##type; \
|
|
|
|
|
*max3 = aco_opcode::v_max3_##type; \
|
|
|
|
|
*minmax = aco_opcode::num_opcodes; \
|
2021-06-09 10:14:54 +02:00
|
|
|
*some_gfx9_only = gfx9; \
|
2019-09-17 13:22:17 +02:00
|
|
|
return true;
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
#define MINMAX_INT16_E64(type, gfx9) \
|
2022-04-29 17:19:09 +01:00
|
|
|
case aco_opcode::v_min_##type##_e64: \
|
|
|
|
|
case aco_opcode::v_max_##type##_e64: \
|
|
|
|
|
*min = aco_opcode::v_min_##type##_e64; \
|
|
|
|
|
*max = aco_opcode::v_max_##type##_e64; \
|
|
|
|
|
*med3 = aco_opcode::v_med3_##type; \
|
|
|
|
|
*min3 = aco_opcode::v_min3_##type; \
|
|
|
|
|
*max3 = aco_opcode::v_max3_##type; \
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
*minmax = aco_opcode::num_opcodes; \
|
2022-04-29 17:19:09 +01:00
|
|
|
*some_gfx9_only = gfx9; \
|
|
|
|
|
return true;
|
2021-06-09 10:14:54 +02:00
|
|
|
MINMAX(f32, false)
|
|
|
|
|
MINMAX(u32, false)
|
|
|
|
|
MINMAX(i32, false)
|
|
|
|
|
MINMAX(f16, true)
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
MINMAX_INT16(u16, true)
|
|
|
|
|
MINMAX_INT16(i16, true)
|
|
|
|
|
MINMAX_INT16_E64(u16, true)
|
|
|
|
|
MINMAX_INT16_E64(i16, true)
|
|
|
|
|
#undef MINMAX_INT16_E64
|
|
|
|
|
#undef MINMAX_INT16
|
2021-06-09 10:14:54 +02:00
|
|
|
#undef MINMAX
|
|
|
|
|
default: return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* when ub > lb:
|
|
|
|
|
* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
|
|
|
|
|
* v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode min, aco_opcode max,
|
|
|
|
|
aco_opcode med)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-11-22 17:50:29 +00:00
|
|
|
/* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
|
|
|
|
|
* FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
|
|
|
|
|
* minVal > maxVal, which means we can always select it to a v_med3_f32 */
|
2019-09-17 13:22:17 +02:00
|
|
|
aco_opcode other_op;
|
|
|
|
|
if (instr->opcode == min)
|
|
|
|
|
other_op = max;
|
|
|
|
|
else if (instr->opcode == max)
|
|
|
|
|
other_op = min;
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned swap = 0; swap < 2; swap++) {
|
|
|
|
|
Operand operands[3];
|
2020-10-07 11:09:16 +01:00
|
|
|
bool neg[3], abs[3], clamp, precise;
|
2020-01-08 11:49:11 +01:00
|
|
|
uint8_t opsel = 0, omod = 0;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg,
|
|
|
|
|
abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
|
2020-10-07 11:09:16 +01:00
|
|
|
/* max(min(src, upper), lower) returns upper if src is NaN, but
|
|
|
|
|
* med3(src, lower, upper) returns lower.
|
|
|
|
|
*/
|
2022-04-29 17:23:20 +01:00
|
|
|
if (precise && instr->opcode != min &&
|
|
|
|
|
(min == aco_opcode::v_min_f16 || min == aco_opcode::v_min_f32))
|
2020-10-07 11:09:16 +01:00
|
|
|
continue;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
int const0_idx = -1, const1_idx = -1;
|
|
|
|
|
uint32_t const0 = 0, const1 = 0;
|
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
|
|
|
uint32_t val;
|
2022-04-29 16:45:17 +01:00
|
|
|
bool hi16 = opsel & (1 << i);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (operands[i].isConstant()) {
|
2022-04-29 16:45:17 +01:00
|
|
|
val = hi16 ? operands[i].constantValue16(true) : operands[i].constantValue();
|
2021-06-09 10:14:54 +02:00
|
|
|
} else if (operands[i].isTemp() &&
|
|
|
|
|
ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
|
2022-04-29 16:45:17 +01:00
|
|
|
val = ctx.info[operands[i].tempId()].val >> (hi16 ? 16 : 0);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (const0_idx >= 0) {
|
|
|
|
|
const1_idx = i;
|
|
|
|
|
const1 = val;
|
|
|
|
|
} else {
|
|
|
|
|
const0_idx = i;
|
|
|
|
|
const0 = val;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (const0_idx < 0 || const1_idx < 0)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
int lower_idx = const0_idx;
|
|
|
|
|
switch (min) {
|
|
|
|
|
case aco_opcode::v_min_f32:
|
|
|
|
|
case aco_opcode::v_min_f16: {
|
|
|
|
|
float const0_f, const1_f;
|
|
|
|
|
if (min == aco_opcode::v_min_f32) {
|
|
|
|
|
memcpy(&const0_f, &const0, 4);
|
|
|
|
|
memcpy(&const1_f, &const1, 4);
|
|
|
|
|
} else {
|
|
|
|
|
const0_f = _mesa_half_to_float(const0);
|
|
|
|
|
const1_f = _mesa_half_to_float(const1);
|
|
|
|
|
}
|
2021-06-09 10:14:54 +02:00
|
|
|
if (abs[const0_idx])
|
|
|
|
|
const0_f = fabsf(const0_f);
|
|
|
|
|
if (abs[const1_idx])
|
|
|
|
|
const1_f = fabsf(const1_f);
|
|
|
|
|
if (neg[const0_idx])
|
|
|
|
|
const0_f = -const0_f;
|
|
|
|
|
if (neg[const1_idx])
|
|
|
|
|
const1_f = -const1_f;
|
2019-09-17 13:22:17 +02:00
|
|
|
lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_min_u32: {
|
|
|
|
|
lower_idx = const0 < const1 ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2022-04-29 17:19:09 +01:00
|
|
|
case aco_opcode::v_min_u16:
|
|
|
|
|
case aco_opcode::v_min_u16_e64: {
|
2019-09-17 13:22:17 +02:00
|
|
|
lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_min_i32: {
|
2021-06-09 10:14:54 +02:00
|
|
|
int32_t const0_i =
|
|
|
|
|
const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
|
|
|
|
|
int32_t const1_i =
|
|
|
|
|
const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
|
2019-09-17 13:22:17 +02:00
|
|
|
lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2022-04-29 17:19:09 +01:00
|
|
|
case aco_opcode::v_min_i16:
|
|
|
|
|
case aco_opcode::v_min_i16_e64: {
|
2019-09-17 13:22:17 +02:00
|
|
|
int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
|
|
|
|
|
int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
|
|
|
|
|
lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2021-06-09 10:14:54 +02:00
|
|
|
default: break;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;
|
|
|
|
|
|
|
|
|
|
if (instr->opcode == min) {
|
|
|
|
|
if (upper_idx != 0 || lower_idx == 0)
|
|
|
|
|
return false;
|
|
|
|
|
} else {
|
|
|
|
|
if (upper_idx == 0 || lower_idx != 0)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ctx.uses[instr->operands[swap].tempId()]--;
|
|
|
|
|
create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-11-20 16:42:17 +00:00
|
|
|
bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_lshrrev_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_ashrrev_i64;
|
|
|
|
|
|
2019-11-22 14:55:25 +00:00
|
|
|
/* find candidates and create the set of sgprs already read */
|
|
|
|
|
unsigned sgpr_ids[2] = {0, 0};
|
|
|
|
|
uint32_t operand_mask = 0;
|
|
|
|
|
bool has_literal = false;
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
if (instr->operands[i].isLiteral())
|
|
|
|
|
has_literal = true;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!instr->operands[i].isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
if (instr->operands[i].getTemp().type() == RegType::sgpr) {
|
2019-11-22 14:55:25 +00:00
|
|
|
if (instr->operands[i].tempId() != sgpr_ids[0])
|
|
|
|
|
sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
ssa_info& info = ctx.info[instr->operands[i].tempId()];
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::sgpr)
|
2019-11-22 14:55:25 +00:00
|
|
|
operand_mask |= 1u << i;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (info.is_extract() && info.instr->operands[0].getTemp().type() == RegType::sgpr)
|
|
|
|
|
operand_mask |= 1u << i;
|
2019-11-22 14:55:25 +00:00
|
|
|
}
|
|
|
|
|
unsigned max_sgprs = 1;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.program->gfx_level >= GFX10 && !is_shift64)
|
2019-11-20 16:42:17 +00:00
|
|
|
max_sgprs = 2;
|
2019-11-22 14:55:25 +00:00
|
|
|
if (has_literal)
|
|
|
|
|
max_sgprs--;
|
|
|
|
|
|
|
|
|
|
unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
|
|
|
|
|
|
|
|
|
|
/* keep on applying sgprs until there is nothing left to be done */
|
|
|
|
|
while (operand_mask) {
|
|
|
|
|
uint32_t sgpr_idx = 0;
|
|
|
|
|
uint32_t sgpr_info_id = 0;
|
|
|
|
|
uint32_t mask = operand_mask;
|
|
|
|
|
/* choose a sgpr */
|
|
|
|
|
while (mask) {
|
|
|
|
|
unsigned i = u_bit_scan(&mask);
|
2019-09-17 13:22:17 +02:00
|
|
|
uint16_t uses = ctx.uses[instr->operands[i].tempId()];
|
|
|
|
|
if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
|
|
|
|
|
sgpr_idx = i;
|
|
|
|
|
sgpr_info_id = instr->operands[i].tempId();
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-11-22 14:55:25 +00:00
|
|
|
operand_mask &= ~(1u << sgpr_idx);
|
|
|
|
|
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
ssa_info& info = ctx.info[sgpr_info_id];
|
|
|
|
|
|
2019-11-22 14:55:25 +00:00
|
|
|
/* Applying two sgprs require making it VOP3, so don't do it unless it's
|
|
|
|
|
* definitively beneficial.
|
|
|
|
|
* TODO: this is too conservative because later the use count could be reduced to 1 */
|
2021-06-09 10:14:54 +02:00
|
|
|
if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() &&
|
|
|
|
|
!instr->isSDWA() && instr->format != Format::VOP3P)
|
2019-11-22 14:55:25 +00:00
|
|
|
break;
|
|
|
|
|
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
Temp sgpr = info.is_extract() ? info.instr->operands[0].getTemp() : info.temp;
|
2019-11-22 14:55:25 +00:00
|
|
|
bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
|
|
|
|
|
if (new_sgpr && num_sgprs >= max_sgprs)
|
|
|
|
|
continue;
|
|
|
|
|
|
2021-08-30 10:30:45 +01:00
|
|
|
if (sgpr_idx == 0)
|
|
|
|
|
instr->format = withoutDPP(instr->format);
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
|
|
|
|
|
info.is_extract()) {
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
/* can_apply_extract() checks SGPR encoding restrictions */
|
|
|
|
|
if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info))
|
|
|
|
|
apply_extract(ctx, instr, sgpr_idx, info);
|
|
|
|
|
else if (info.is_extract())
|
|
|
|
|
continue;
|
2019-11-22 14:55:25 +00:00
|
|
|
instr->operands[sgpr_idx] = Operand(sgpr);
|
2021-07-14 17:22:02 +01:00
|
|
|
} else if (can_swap_operands(instr, &instr->opcode)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[sgpr_idx] = instr->operands[0];
|
2019-11-22 14:55:25 +00:00
|
|
|
instr->operands[0] = Operand(sgpr);
|
|
|
|
|
/* swap bits using a 4-entry LUT */
|
|
|
|
|
uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
|
|
|
|
|
operand_mask = (operand_mask & ~0x3) | swapped;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
} else if (can_use_VOP3(ctx, instr) && !info.is_extract()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
to_VOP3(ctx, instr);
|
2019-11-22 14:55:25 +00:00
|
|
|
instr->operands[sgpr_idx] = Operand(sgpr);
|
|
|
|
|
} else {
|
|
|
|
|
continue;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-01-17 11:35:20 +00:00
|
|
|
if (new_sgpr)
|
|
|
|
|
sgpr_ids[num_sgprs++] = sgpr.id();
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[sgpr_info_id]--;
|
2019-11-22 14:55:25 +00:00
|
|
|
ctx.uses[sgpr.id()]++;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
|
|
|
|
|
/* TODO: handle when it's a VGPR */
|
|
|
|
|
if ((ctx.info[sgpr.id()].label & (label_extract | label_temp)) &&
|
|
|
|
|
ctx.info[sgpr.id()].temp.type() == RegType::sgpr)
|
|
|
|
|
operand_mask |= 1u << sgpr_idx;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
template <typename T>
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
apply_omod_clamp_helper(opt_ctx& ctx, T* instr, ssa_info& def_info)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-12-05 14:12:39 +00:00
|
|
|
if (!def_info.is_clamp() && (instr->clamp || instr->omod))
|
2020-08-12 15:58:32 +01:00
|
|
|
return false;
|
2019-12-17 14:55:24 +00:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
if (def_info.is_omod2())
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->omod = 1;
|
2020-08-12 15:58:32 +01:00
|
|
|
else if (def_info.is_omod4())
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->omod = 2;
|
2020-08-12 15:58:32 +01:00
|
|
|
else if (def_info.is_omod5())
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->omod = 3;
|
2020-08-12 15:58:32 +01:00
|
|
|
else if (def_info.is_clamp())
|
2019-12-05 14:12:39 +00:00
|
|
|
instr->clamp = true;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
return true;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2020-08-12 15:58:32 +01:00
|
|
|
{
|
|
|
|
|
if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
|
|
|
|
|
!instr_info.can_use_output_modifiers[(int)instr->opcode])
|
|
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
bool can_vop3 = can_use_VOP3(ctx, instr);
|
2022-01-17 17:54:47 +00:00
|
|
|
bool is_mad_mix =
|
|
|
|
|
instr->opcode == aco_opcode::v_fma_mix_f32 || instr->opcode == aco_opcode::v_fma_mixlo_f16;
|
|
|
|
|
if (!instr->isSDWA() && !is_mad_mix && !can_vop3)
|
2020-08-12 15:58:32 +01:00
|
|
|
return false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-01-17 17:54:47 +00:00
|
|
|
/* omod flushes -0 to +0 and has no effect if denormals are enabled. SDWA omod is GFX9+. */
|
2022-05-12 02:50:17 -04:00
|
|
|
bool can_use_omod = (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P();
|
2020-11-13 15:10:58 +00:00
|
|
|
if (instr->definitions[0].bytes() == 4)
|
2021-06-09 10:14:54 +02:00
|
|
|
can_use_omod =
|
|
|
|
|
can_use_omod && ctx.fp_mode.denorm32 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan32;
|
2020-11-13 15:10:58 +00:00
|
|
|
else
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 &&
|
|
|
|
|
!ctx.fp_mode.preserve_signed_zero_inf_nan16_64;
|
2019-12-05 14:12:39 +00:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
|
|
|
|
|
if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
|
|
|
|
|
return false;
|
|
|
|
|
/* if the omod/clamp instruction is dead, then the single user of this
|
|
|
|
|
* instruction is a different instruction */
|
|
|
|
|
if (!ctx.uses[def_info.instr->definitions[0].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
2022-01-31 18:12:59 +00:00
|
|
|
if (def_info.instr->definitions[0].bytes() != instr->definitions[0].bytes())
|
|
|
|
|
return false;
|
|
|
|
|
|
2020-08-12 15:58:32 +01:00
|
|
|
/* MADs/FMAs are created later, so we don't have to update the original add */
|
|
|
|
|
assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
if (instr->isSDWA()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
if (!apply_omod_clamp_helper(ctx, &instr->sdwa(), def_info))
|
2019-12-05 14:12:39 +00:00
|
|
|
return false;
|
2022-01-17 17:54:47 +00:00
|
|
|
} else if (instr->isVOP3P()) {
|
|
|
|
|
assert(def_info.is_clamp());
|
|
|
|
|
instr->vop3p().clamp = true;
|
2019-12-05 14:12:39 +00:00
|
|
|
} else {
|
|
|
|
|
to_VOP3(ctx, instr);
|
2021-01-21 16:13:34 +00:00
|
|
|
if (!apply_omod_clamp_helper(ctx, &instr->vop3(), def_info))
|
2019-12-05 14:12:39 +00:00
|
|
|
return false;
|
|
|
|
|
}
|
2020-08-12 15:58:32 +01:00
|
|
|
|
2021-01-26 14:24:48 +00:00
|
|
|
instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
|
2022-01-17 16:52:10 +00:00
|
|
|
ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
ctx.uses[def_info.instr->definitions[0].tempId()]--;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Combine an p_insert (or p_extract, in some cases) instruction with instr.
|
|
|
|
|
* p_insert(instr(...)) -> instr_insert().
|
|
|
|
|
*/
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
{
|
|
|
|
|
if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
|
|
|
|
|
if (!def_info.is_insert())
|
|
|
|
|
return false;
|
|
|
|
|
/* if the insert instruction is dead, then the single user of this
|
|
|
|
|
* instruction is a different instruction */
|
|
|
|
|
if (!ctx.uses[def_info.instr->definitions[0].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* MADs/FMAs are created later, so we don't have to update the original add */
|
|
|
|
|
assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
|
|
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
SubdwordSel sel = parse_insert(def_info.instr);
|
|
|
|
|
assert(sel);
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
|
2022-09-08 11:24:27 +02:00
|
|
|
if (instr->opcode == aco_opcode::v_cvt_u32_f32 && instr->format == Format::VOP1 &&
|
|
|
|
|
!sel.sign_extend() && sel.size() == 1) {
|
|
|
|
|
to_VOP3(ctx, instr, 2);
|
|
|
|
|
instr->format = Format::VOP3;
|
|
|
|
|
instr->opcode = aco_opcode::v_cvt_pk_u8_f32;
|
|
|
|
|
instr->operands[1] = Operand::c32(sel.offset());
|
|
|
|
|
instr->operands[2] = Operand::zero();
|
|
|
|
|
} else {
|
|
|
|
|
if (!can_use_SDWA(ctx.program->gfx_level, instr, true))
|
|
|
|
|
return false;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
|
2022-09-08 11:24:27 +02:00
|
|
|
to_SDWA(ctx, instr);
|
|
|
|
|
if (instr->sdwa().dst_sel.size() != 4)
|
|
|
|
|
return false;
|
|
|
|
|
static_cast<SDWA_instruction*>(instr.get())->dst_sel = sel;
|
|
|
|
|
}
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
|
|
|
|
|
instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
2020-08-12 15:58:32 +01:00
|
|
|
ctx.uses[def_info.instr->definitions[0].tempId()]--;
|
|
|
|
|
|
|
|
|
|
return true;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-08-27 15:45:59 +02:00
|
|
|
/* Remove superfluous extract after ds_read like so:
|
|
|
|
|
* p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN()
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
|
|
|
|
|
{
|
|
|
|
|
/* Check if p_extract has a usedef operand and is the only user. */
|
|
|
|
|
if (!ctx.info[extract->operands[0].tempId()].is_usedef() ||
|
|
|
|
|
ctx.uses[extract->operands[0].tempId()] > 1)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Check if the usedef is a DS instruction. */
|
|
|
|
|
Instruction* ds = ctx.info[extract->operands[0].tempId()].instr;
|
|
|
|
|
if (ds->format != Format::DS)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned extract_idx = extract->operands[1].constantValue();
|
|
|
|
|
unsigned bits_extracted = extract->operands[2].constantValue();
|
|
|
|
|
unsigned sign_ext = extract->operands[3].constantValue();
|
|
|
|
|
unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
|
|
|
|
|
|
|
|
|
|
/* TODO: These are doable, but probably don't occour too often. */
|
|
|
|
|
if (extract_idx || sign_ext || dst_bitsize != 32)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned bits_loaded = 0;
|
|
|
|
|
if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16)
|
|
|
|
|
bits_loaded = 8;
|
|
|
|
|
else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16)
|
|
|
|
|
bits_loaded = 16;
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Shrink the DS load if the extracted bit size is smaller. */
|
|
|
|
|
bits_loaded = MIN2(bits_loaded, bits_extracted);
|
|
|
|
|
|
|
|
|
|
/* Change the DS opcode so it writes the full register. */
|
|
|
|
|
if (bits_loaded == 8)
|
|
|
|
|
ds->opcode = aco_opcode::ds_read_u8;
|
|
|
|
|
else if (bits_loaded == 16)
|
|
|
|
|
ds->opcode = aco_opcode::ds_read_u16;
|
|
|
|
|
else
|
|
|
|
|
unreachable("Forgot to add DS opcode above.");
|
|
|
|
|
|
|
|
|
|
/* The DS now produces the exact same thing as the extract, remove the extract. */
|
|
|
|
|
std::swap(ds->definitions[0], extract->definitions[0]);
|
|
|
|
|
ctx.uses[extract->definitions[0].tempId()] = 0;
|
|
|
|
|
ctx.info[ds->definitions[0].tempId()].label = 0;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
/* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
{
|
|
|
|
|
if (instr->usesModifiers())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
|
|
|
|
|
if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&
|
|
|
|
|
op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) &&
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
!op_instr->usesModifiers()) {
|
|
|
|
|
|
|
|
|
|
aco_ptr<Instruction> new_instr;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->operands[!i].isTemp() &&
|
|
|
|
|
instr->operands[!i].getTemp().type() == RegType::vgpr) {
|
|
|
|
|
new_instr.reset(
|
|
|
|
|
create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (ctx.program->gfx_level >= GFX10 ||
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
(instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
|
2021-06-09 10:14:54 +02:00
|
|
|
new_instr.reset(create_instruction<VOP3_instruction>(aco_opcode::v_cndmask_b32,
|
|
|
|
|
asVOP3(Format::VOP2), 3, 1));
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-13 11:22:46 +02:00
|
|
|
new_instr->operands[0] = Operand::zero();
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
new_instr->operands[1] = instr->operands[!i];
|
2022-09-28 13:45:25 +02:00
|
|
|
new_instr->operands[2] = copy_operand(ctx, op_instr->operands[2]);
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
instr = std::move(new_instr);
|
2022-09-28 13:45:25 +02:00
|
|
|
decrease_uses(ctx, op_instr);
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-18 13:07:57 +01:00
|
|
|
/* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
|
2021-09-09 08:38:41 +02:00
|
|
|
* v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c)
|
|
|
|
|
* v_sub(c, s_lshl(a, b)) -> v_mad_i32_i24(a, -(1<<b), c)
|
|
|
|
|
* v_sub(c, v_lshlrev(a, b)) -> v_mad_i32_i24(b, -(1<<a), c)
|
|
|
|
|
*/
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
2021-09-09 08:38:41 +02:00
|
|
|
combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool is_sub)
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
{
|
|
|
|
|
if (instr->usesModifiers())
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-09-09 08:38:41 +02:00
|
|
|
/* Substractions: start at operand 1 to avoid mixup such as
|
|
|
|
|
* turning v_sub(v_lshlrev(a, b), c) into v_mad_i32_i24(b, -(1<<a), c)
|
|
|
|
|
*/
|
|
|
|
|
unsigned start_op_idx = is_sub ? 1 : 0;
|
|
|
|
|
|
|
|
|
|
/* Don't allow 24-bit operands on subtraction because
|
|
|
|
|
* v_mad_i32_i24 applies a sign extension.
|
|
|
|
|
*/
|
|
|
|
|
bool allow_24bit = !is_sub;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = start_op_idx; i < 2; i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
if (!op_instr)
|
|
|
|
|
continue;
|
|
|
|
|
|
2020-11-18 13:07:57 +01:00
|
|
|
if (op_instr->opcode != aco_opcode::s_lshl_b32 &&
|
|
|
|
|
op_instr->opcode != aco_opcode::v_lshlrev_b32)
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
continue;
|
|
|
|
|
|
2020-11-18 13:07:57 +01:00
|
|
|
int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;
|
2021-09-09 08:38:41 +02:00
|
|
|
|
2020-11-18 13:07:57 +01:00
|
|
|
if (op_instr->operands[shift_op_idx].isConstant() &&
|
2021-09-09 08:38:41 +02:00
|
|
|
((allow_24bit && op_instr->operands[!shift_op_idx].is24bit()) ||
|
2020-11-18 13:07:57 +01:00
|
|
|
op_instr->operands[!shift_op_idx].is16bit())) {
|
2021-09-09 08:38:41 +02:00
|
|
|
uint32_t multiplier = 1 << (op_instr->operands[shift_op_idx].constantValue() % 32u);
|
|
|
|
|
if (is_sub)
|
|
|
|
|
multiplier = -multiplier;
|
|
|
|
|
if (is_sub ? (multiplier < 0xff800000) : (multiplier > 0xffffff))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Operand ops[3] = {
|
|
|
|
|
op_instr->operands[!shift_op_idx],
|
|
|
|
|
Operand::c32(multiplier),
|
|
|
|
|
instr->operands[!i],
|
|
|
|
|
};
|
|
|
|
|
if (!check_vop3_operands(ctx, 3, ops))
|
|
|
|
|
return false;
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
|
2021-09-09 08:38:41 +02:00
|
|
|
aco_opcode mad_op = is_sub ? aco_opcode::v_mad_i32_i24 : aco_opcode::v_mad_u32_u24;
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<VOP3_instruction> new_instr{
|
2021-09-09 08:38:41 +02:00
|
|
|
create_instruction<VOP3_instruction>(mad_op, Format::VOP3, 3, 1)};
|
|
|
|
|
for (unsigned op_idx = 0; op_idx < 3; ++op_idx)
|
|
|
|
|
new_instr->operands[op_idx] = ops[op_idx];
|
aco: optimize v_add+s_lshl to v_mad_u32_u24 on GFX6-8
This optimizes v_add(c, s_lshl(a, b)) to v_mad_u32_u24(a, 1<<b, c)
if 'b' is a constant (less than or equal to 6 to avoid creating
literals) and 'a' known to be a 16-bit or a 24-bit value.
On GFX9+, this is already optimized to v_lshl_add_u32.
fossils-db (Polaris10):
Totals from 1916 (1.36% of 140385) affected shaders:
SGPRs: 88322 -> 87780 (-0.61%); split: -0.66%, +0.05%
CodeSize: 7852668 -> 7851800 (-0.01%); split: -0.01%, +0.00%
Instrs: 1533965 -> 1530459 (-0.23%); split: -0.23%, +0.00%
Cycles: 57001852 -> 56983244 (-0.03%); split: -0.03%, +0.00%
VMEM: 372561 -> 371733 (-0.22%); split: +0.03%, -0.25%
SMEM: 108859 -> 103711 (-4.73%); split: +0.23%, -4.96%
VClause: 37231 -> 37204 (-0.07%)
SClause: 58116 -> 58086 (-0.05%); split: -0.06%, +0.01%
Copies: 199953 -> 199931 (-0.01%); split: -0.03%, +0.02%
Branches: 63478 -> 63477 (-0.00%)
PreSGPRs: 61818 -> 61816 (-0.00%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7673>
2020-11-16 18:01:32 +01:00
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
|
|
|
|
instr = std::move(new_instr);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi)
|
2021-01-07 15:07:09 +01:00
|
|
|
{
|
|
|
|
|
/* propagate swizzles which apply to a result down to the instruction's operands:
|
|
|
|
|
* result = a.xy + b.xx -> result.yx = a.yx + b.xx */
|
|
|
|
|
assert((opsel_lo & 1) == opsel_lo);
|
|
|
|
|
assert((opsel_hi & 1) == opsel_hi);
|
|
|
|
|
uint8_t tmp_lo = instr->opsel_lo;
|
|
|
|
|
uint8_t tmp_hi = instr->opsel_hi;
|
2021-06-09 10:14:54 +02:00
|
|
|
bool neg_lo[3] = {instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2]};
|
|
|
|
|
bool neg_hi[3] = {instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2]};
|
2021-01-07 15:07:09 +01:00
|
|
|
if (opsel_lo == 1) {
|
|
|
|
|
instr->opsel_lo = tmp_hi;
|
|
|
|
|
for (unsigned i = 0; i < 3; i++)
|
|
|
|
|
instr->neg_lo[i] = neg_hi[i];
|
|
|
|
|
}
|
|
|
|
|
if (opsel_hi == 0) {
|
|
|
|
|
instr->opsel_hi = tmp_lo;
|
|
|
|
|
for (unsigned i = 0; i < 3; i++)
|
|
|
|
|
instr->neg_hi[i] = neg_lo[i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2020-09-03 12:02:55 +01:00
|
|
|
{
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3P_instruction* vop3p = &instr->vop3p();
|
2020-09-11 15:54:39 +01:00
|
|
|
|
|
|
|
|
/* apply clamp */
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) &&
|
2022-07-07 12:27:08 +02:00
|
|
|
vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1 &&
|
|
|
|
|
!((vop3p->opsel_lo | vop3p->opsel_hi) & 2)) {
|
2020-09-11 15:54:39 +01:00
|
|
|
|
|
|
|
|
ssa_info& info = ctx.info[instr->operands[0].tempId()];
|
|
|
|
|
if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) {
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3P_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->vop3p();
|
2021-01-07 15:07:09 +01:00
|
|
|
candidate->clamp = true;
|
|
|
|
|
propagate_swizzles(candidate, vop3p->opsel_lo, vop3p->opsel_hi);
|
2021-01-26 14:24:48 +00:00
|
|
|
instr->definitions[0].swapTemp(candidate->definitions[0]);
|
2020-09-11 15:54:39 +01:00
|
|
|
ctx.info[candidate->definitions[0].tempId()].instr = candidate;
|
|
|
|
|
ctx.uses[instr->definitions[0].tempId()]--;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-09-03 12:02:55 +01:00
|
|
|
|
2020-09-11 16:20:21 +01:00
|
|
|
/* check for fneg modifiers */
|
|
|
|
|
if (instr_info.can_use_input_modifiers[(int)instr->opcode]) {
|
2021-03-24 17:17:38 +00:00
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
2020-09-11 16:20:21 +01:00
|
|
|
Operand& op = instr->operands[i];
|
|
|
|
|
if (!op.isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
ssa_info& info = ctx.info[op.tempId()];
|
|
|
|
|
if (info.is_vop3p() && info.instr->opcode == aco_opcode::v_pk_mul_f16 &&
|
2021-11-04 18:40:44 +01:00
|
|
|
info.instr->operands[1].constantEquals(0x3C00)) {
|
2022-07-07 12:27:08 +02:00
|
|
|
|
|
|
|
|
VOP3P_instruction* fneg = &info.instr->vop3p();
|
|
|
|
|
|
|
|
|
|
if ((fneg->opsel_lo | fneg->opsel_hi) & 2)
|
|
|
|
|
continue;
|
|
|
|
|
|
2021-12-16 15:47:53 +00:00
|
|
|
Operand ops[3];
|
|
|
|
|
for (unsigned j = 0; j < instr->operands.size(); j++)
|
|
|
|
|
ops[j] = instr->operands[j];
|
|
|
|
|
ops[i] = info.instr->operands[0];
|
|
|
|
|
if (!check_vop3_operands(ctx, instr->operands.size(), ops))
|
2020-09-11 16:20:21 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (fneg->clamp)
|
|
|
|
|
continue;
|
|
|
|
|
instr->operands[i] = fneg->operands[0];
|
|
|
|
|
|
|
|
|
|
/* opsel_lo/hi is either 0 or 1:
|
|
|
|
|
* if 0 - pick selection from fneg->lo
|
|
|
|
|
* if 1 - pick selection from fneg->hi
|
|
|
|
|
*/
|
2021-11-04 18:37:03 +01:00
|
|
|
bool opsel_lo = (vop3p->opsel_lo >> i) & 1;
|
|
|
|
|
bool opsel_hi = (vop3p->opsel_hi >> i) & 1;
|
2021-11-04 18:40:44 +01:00
|
|
|
bool neg_lo = fneg->neg_lo[0] ^ fneg->neg_lo[1];
|
|
|
|
|
bool neg_hi = fneg->neg_hi[0] ^ fneg->neg_hi[1];
|
2021-11-04 18:37:03 +01:00
|
|
|
vop3p->neg_lo[i] ^= opsel_lo ? neg_hi : neg_lo;
|
|
|
|
|
vop3p->neg_hi[i] ^= opsel_hi ? neg_hi : neg_lo;
|
2020-09-11 16:20:21 +01:00
|
|
|
vop3p->opsel_lo ^= ((opsel_lo ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;
|
|
|
|
|
vop3p->opsel_hi ^= ((opsel_hi ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;
|
|
|
|
|
|
|
|
|
|
if (--ctx.uses[fneg->definitions[0].tempId()])
|
|
|
|
|
ctx.uses[fneg->operands[0].tempId()]++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-05 15:52:08 +02:00
|
|
|
if (instr->opcode == aco_opcode::v_pk_add_f16 || instr->opcode == aco_opcode::v_pk_add_u16) {
|
|
|
|
|
bool fadd = instr->opcode == aco_opcode::v_pk_add_f16;
|
|
|
|
|
if (fadd && instr->definitions[0].isPrecise())
|
2020-09-03 12:02:55 +01:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Instruction* mul_instr = nullptr;
|
|
|
|
|
unsigned add_op_idx = 0;
|
2021-01-07 15:07:09 +01:00
|
|
|
uint8_t opsel_lo = 0, opsel_hi = 0;
|
2020-09-03 12:02:55 +01:00
|
|
|
uint32_t uses = UINT32_MAX;
|
|
|
|
|
|
|
|
|
|
/* find the 'best' mul instruction to combine with the add */
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_vop3p())
|
|
|
|
|
continue;
|
|
|
|
|
ssa_info& info = ctx.info[instr->operands[i].tempId()];
|
2021-08-05 15:52:08 +02:00
|
|
|
if (fadd) {
|
|
|
|
|
if (info.instr->opcode != aco_opcode::v_pk_mul_f16 ||
|
|
|
|
|
info.instr->definitions[0].isPrecise())
|
|
|
|
|
continue;
|
|
|
|
|
} else {
|
|
|
|
|
if (info.instr->opcode != aco_opcode::v_pk_mul_lo_u16)
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2020-09-03 12:02:55 +01:00
|
|
|
|
|
|
|
|
Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};
|
2021-06-09 10:14:54 +02:00
|
|
|
if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
|
2020-09-03 12:02:55 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* no clamp allowed between mul and add */
|
2021-01-21 16:13:34 +00:00
|
|
|
if (info.instr->vop3p().clamp)
|
2020-09-03 12:02:55 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
mul_instr = info.instr;
|
|
|
|
|
add_op_idx = 1 - i;
|
2021-01-07 15:07:09 +01:00
|
|
|
opsel_lo = (vop3p->opsel_lo >> i) & 1;
|
|
|
|
|
opsel_hi = (vop3p->opsel_hi >> i) & 1;
|
2020-09-03 12:02:55 +01:00
|
|
|
uses = ctx.uses[instr->operands[i].tempId()];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!mul_instr)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* convert to mad */
|
|
|
|
|
Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], instr->operands[add_op_idx]};
|
|
|
|
|
ctx.uses[mul_instr->definitions[0].tempId()]--;
|
|
|
|
|
if (ctx.uses[mul_instr->definitions[0].tempId()]) {
|
|
|
|
|
if (op[0].isTemp())
|
|
|
|
|
ctx.uses[op[0].tempId()]++;
|
|
|
|
|
if (op[1].isTemp())
|
|
|
|
|
ctx.uses[op[1].tempId()]++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* turn packed mul+add into v_pk_fma_f16 */
|
2021-01-20 15:27:16 +00:00
|
|
|
assert(mul_instr->isVOP3P());
|
2021-08-05 15:52:08 +02:00
|
|
|
aco_opcode mad = fadd ? aco_opcode::v_pk_fma_f16 : aco_opcode::v_pk_mad_u16;
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<VOP3P_instruction> fma{
|
2021-08-05 15:52:08 +02:00
|
|
|
create_instruction<VOP3P_instruction>(mad, Format::VOP3P, 3, 1)};
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3P_instruction* mul = &mul_instr->vop3p();
|
2020-09-03 12:02:55 +01:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
fma->operands[i] = op[i];
|
|
|
|
|
fma->neg_lo[i] = mul->neg_lo[i];
|
|
|
|
|
fma->neg_hi[i] = mul->neg_hi[i];
|
|
|
|
|
}
|
|
|
|
|
fma->operands[2] = op[2];
|
2021-01-07 15:07:09 +01:00
|
|
|
fma->clamp = vop3p->clamp;
|
|
|
|
|
fma->opsel_lo = mul->opsel_lo;
|
|
|
|
|
fma->opsel_hi = mul->opsel_hi;
|
|
|
|
|
propagate_swizzles(fma.get(), opsel_lo, opsel_hi);
|
|
|
|
|
fma->opsel_lo |= (vop3p->opsel_lo << (2 - add_op_idx)) & 0x4;
|
|
|
|
|
fma->opsel_hi |= (vop3p->opsel_hi << (2 - add_op_idx)) & 0x4;
|
2020-09-03 12:02:55 +01:00
|
|
|
fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];
|
|
|
|
|
fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];
|
|
|
|
|
fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
|
|
|
|
|
fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
|
|
|
|
|
fma->definitions[0] = instr->definitions[0];
|
2021-09-16 20:50:29 +02:00
|
|
|
instr = std::move(fma);
|
2020-09-03 12:02:55 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
bool
|
|
|
|
|
can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.program->gfx_level < GFX9)
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
return false;
|
|
|
|
|
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
/* v_mad_mix* on GFX9 always flushes denormals for 16-bit inputs/outputs */
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.program->gfx_level == GFX9 && ctx.fp_mode.denorm16_64)
|
aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved
This probably effectively disables the v_mad_mix optimization on GFX9.
fossil-db (Vega):
Totals from 11545 (7.15% of 161366) affected shaders:
MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63%
Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88%
CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24%
SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01%
VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36%
SpillSGPRs: 13313 -> 12464 (-6.38%)
Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51%
InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93%
VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16%
SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04%
Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57%
Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02%
PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05%
PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15718>
2022-04-01 19:51:55 +01:00
|
|
|
return false;
|
|
|
|
|
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::v_add_f32:
|
|
|
|
|
case aco_opcode::v_sub_f32:
|
|
|
|
|
case aco_opcode::v_subrev_f32:
|
|
|
|
|
case aco_opcode::v_mul_f32:
|
|
|
|
|
case aco_opcode::v_fma_f32: break;
|
|
|
|
|
case aco_opcode::v_fma_mix_f32:
|
|
|
|
|
case aco_opcode::v_fma_mixlo_f16: return true;
|
|
|
|
|
default: return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->opcode == aco_opcode::v_fma_f32 && !ctx.program->dev.fused_mad_mix &&
|
|
|
|
|
instr->definitions[0].isPrecise())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->isVOP3())
|
|
|
|
|
return !instr->vop3().omod && !(instr->vop3().opsel & 0x8);
|
|
|
|
|
|
|
|
|
|
return instr->format == Format::VOP2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
bool is_add = instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
|
|
|
|
|
|
|
|
|
|
aco_ptr<VOP3P_instruction> vop3p{
|
|
|
|
|
create_instruction<VOP3P_instruction>(aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1)};
|
|
|
|
|
|
2022-03-22 12:05:03 +01:00
|
|
|
vop3p->opsel_lo = instr->isVOP3() ? ((instr->vop3().opsel & 0x7) << (is_add ? 1 : 0)) : 0x0;
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
vop3p->opsel_hi = 0x0;
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
vop3p->operands[is_add + i] = instr->operands[i];
|
|
|
|
|
vop3p->neg_lo[is_add + i] = instr->isVOP3() && instr->vop3().neg[i];
|
|
|
|
|
vop3p->neg_lo[is_add + i] |= instr->isSDWA() && instr->sdwa().neg[i];
|
|
|
|
|
vop3p->neg_hi[is_add + i] = instr->isVOP3() && instr->vop3().abs[i];
|
|
|
|
|
vop3p->neg_hi[is_add + i] |= instr->isSDWA() && instr->sdwa().abs[i];
|
|
|
|
|
vop3p->opsel_lo |= (instr->isSDWA() && instr->sdwa().sel[i].offset()) << (is_add + i);
|
|
|
|
|
}
|
|
|
|
|
if (instr->opcode == aco_opcode::v_mul_f32) {
|
|
|
|
|
vop3p->opsel_hi &= 0x3;
|
|
|
|
|
vop3p->operands[2] = Operand::zero();
|
|
|
|
|
vop3p->neg_lo[2] = true;
|
|
|
|
|
} else if (is_add) {
|
|
|
|
|
vop3p->opsel_hi &= 0x6;
|
|
|
|
|
vop3p->operands[0] = Operand::c32(0x3f800000);
|
|
|
|
|
if (instr->opcode == aco_opcode::v_sub_f32)
|
|
|
|
|
vop3p->neg_lo[2] ^= true;
|
|
|
|
|
else if (instr->opcode == aco_opcode::v_subrev_f32)
|
|
|
|
|
vop3p->neg_lo[1] ^= true;
|
|
|
|
|
}
|
|
|
|
|
vop3p->definitions[0] = instr->definitions[0];
|
|
|
|
|
vop3p->clamp = instr->isVOP3() && instr->vop3().clamp;
|
|
|
|
|
instr = std::move(vop3p);
|
|
|
|
|
|
2022-01-17 16:52:10 +00:00
|
|
|
ctx.info[instr->definitions[0].tempId()].label &= label_f2f16 | label_clamp | label_mul;
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
if (ctx.info[instr->definitions[0].tempId()].label & label_mul)
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].instr = instr.get();
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-17 16:52:10 +00:00
|
|
|
bool
|
|
|
|
|
combine_output_conversion(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
|
|
|
|
|
if (!def_info.is_f2f16())
|
|
|
|
|
return false;
|
|
|
|
|
Instruction* conv = def_info.instr;
|
|
|
|
|
|
|
|
|
|
if (!can_use_mad_mix(ctx, instr) || ctx.uses[instr->definitions[0].tempId()] != 1)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!ctx.uses[conv->definitions[0].tempId()])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (conv->usesModifiers())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!instr->isVOP3P())
|
|
|
|
|
to_mad_mix(ctx, instr);
|
|
|
|
|
|
|
|
|
|
instr->opcode = aco_opcode::v_fma_mixlo_f16;
|
|
|
|
|
instr->definitions[0].swapTemp(conv->definitions[0]);
|
|
|
|
|
if (conv->definitions[0].isPrecise())
|
|
|
|
|
instr->definitions[0].setPrecise(true);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].label &= label_clamp;
|
|
|
|
|
ctx.uses[conv->definitions[0].tempId()]--;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
void
|
|
|
|
|
combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
if (!can_use_mad_mix(ctx, instr))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
if (!instr->operands[i].isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
Temp tmp = instr->operands[i].getTemp();
|
|
|
|
|
if (!ctx.info[tmp.id()].is_f2f32())
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
Instruction* conv = ctx.info[tmp.id()].instr;
|
|
|
|
|
if (conv->isSDWA() && (conv->sdwa().dst_sel.size() != 4 || conv->sdwa().sel[0].size() != 2 ||
|
|
|
|
|
conv->sdwa().clamp || conv->sdwa().omod)) {
|
|
|
|
|
continue;
|
|
|
|
|
} else if (conv->isVOP3() && (conv->vop3().clamp || conv->vop3().omod)) {
|
|
|
|
|
continue;
|
|
|
|
|
} else if (conv->isDPP()) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (get_operand_size(instr, i) != 32)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* Conversion to VOP3P will add inline constant operands, but that shouldn't affect
|
|
|
|
|
* check_vop3_operands(). */
|
|
|
|
|
Operand op[3];
|
|
|
|
|
for (unsigned j = 0; j < instr->operands.size(); j++)
|
|
|
|
|
op[j] = instr->operands[j];
|
|
|
|
|
op[i] = conv->operands[0];
|
|
|
|
|
if (!check_vop3_operands(ctx, instr->operands.size(), op))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (!instr->isVOP3P()) {
|
|
|
|
|
bool is_add =
|
|
|
|
|
instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
|
|
|
|
|
to_mad_mix(ctx, instr);
|
|
|
|
|
i += is_add;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (--ctx.uses[tmp.id()])
|
|
|
|
|
ctx.uses[conv->operands[0].tempId()]++;
|
|
|
|
|
instr->operands[i].setTemp(conv->operands[0].getTemp());
|
|
|
|
|
if (conv->definitions[0].isPrecise())
|
|
|
|
|
instr->definitions[0].setPrecise(true);
|
|
|
|
|
instr->vop3p().opsel_hi ^= 1u << i;
|
|
|
|
|
if (conv->isSDWA() && conv->sdwa().sel[0].offset() == 2)
|
|
|
|
|
instr->vop3p().opsel_lo |= 1u << i;
|
|
|
|
|
bool neg = (conv->isVOP3() && conv->vop3().neg[0]) || (conv->isSDWA() && conv->sdwa().neg[0]);
|
|
|
|
|
bool abs = (conv->isVOP3() && conv->vop3().abs[0]) || (conv->isSDWA() && conv->sdwa().abs[0]);
|
|
|
|
|
if (!instr->vop3p().neg_hi[i]) {
|
|
|
|
|
instr->vop3p().neg_lo[i] ^= neg;
|
|
|
|
|
instr->vop3p().neg_hi[i] = abs;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
|
|
|
|
|
// this would mean that we'd have to fix up the instruction uses during value propagation
|
|
|
|
|
|
2022-04-11 17:51:42 +01:00
|
|
|
/* also returns true for inf */
|
|
|
|
|
bool
|
|
|
|
|
is_pow_of_two(opt_ctx& ctx, Operand op)
|
|
|
|
|
{
|
|
|
|
|
if (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(op.bytes() * 8))
|
|
|
|
|
return is_pow_of_two(ctx, get_constant_op(ctx, ctx.info[op.tempId()], op.bytes() * 8));
|
|
|
|
|
else if (!op.isConstant())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
uint64_t val = op.constantValue64();
|
|
|
|
|
|
|
|
|
|
if (op.bytes() == 4) {
|
|
|
|
|
uint32_t exponent = (val & 0x7f800000) >> 23;
|
|
|
|
|
uint32_t fraction = val & 0x007fffff;
|
|
|
|
|
return (exponent >= 127) && (fraction == 0);
|
|
|
|
|
} else if (op.bytes() == 2) {
|
|
|
|
|
uint32_t exponent = (val & 0x7c00) >> 10;
|
|
|
|
|
uint32_t fraction = val & 0x03ff;
|
|
|
|
|
return (exponent >= 15) && (fraction == 0);
|
|
|
|
|
} else {
|
|
|
|
|
assert(op.bytes() == 8);
|
|
|
|
|
uint64_t exponent = (val & UINT64_C(0x7ff0000000000000)) >> 52;
|
|
|
|
|
uint64_t fraction = val & UINT64_C(0x000fffffffffffff);
|
|
|
|
|
return (exponent >= 1023) && (fraction == 0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2019-11-19 13:38:34 +01:00
|
|
|
if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (instr->isVALU()) {
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
/* Apply SDWA. Do this after label_instruction() so it can remove
|
|
|
|
|
* label_extract if not all instructions can take SDWA. */
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
Operand& op = instr->operands[i];
|
|
|
|
|
if (!op.isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
ssa_info& info = ctx.info[op.tempId()];
|
2021-07-01 15:03:36 +02:00
|
|
|
if (!info.is_extract())
|
|
|
|
|
continue;
|
|
|
|
|
/* if there are that many uses, there are likely better combinations */
|
|
|
|
|
// TODO: delay applying extract to a point where we know better
|
|
|
|
|
if (ctx.uses[op.tempId()] > 4) {
|
|
|
|
|
info.label &= ~label_extract;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2021-06-09 10:14:54 +02:00
|
|
|
if (info.is_extract() &&
|
|
|
|
|
(info.instr->operands[0].getTemp().type() == RegType::vgpr ||
|
|
|
|
|
instr->operands[i].getTemp().type() == RegType::sgpr) &&
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
can_apply_extract(ctx, instr, i, info)) {
|
2021-11-22 11:03:03 +00:00
|
|
|
/* Increase use count of the extract's operand if the extract still has uses. */
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
apply_extract(ctx, instr, i, info);
|
2021-11-22 11:03:03 +00:00
|
|
|
if (--ctx.uses[instr->operands[i].tempId()])
|
|
|
|
|
ctx.uses[info.instr->operands[0].tempId()]++;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
instr->operands[i].setTemp(info.instr->operands[0].getTemp());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-05 14:12:39 +00:00
|
|
|
if (can_apply_sgprs(ctx, instr))
|
2019-09-24 13:32:56 +01:00
|
|
|
apply_sgprs(ctx, instr);
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
combine_mad_mix(ctx, instr);
|
2022-11-04 19:30:12 +08:00
|
|
|
while (apply_omod_clamp(ctx, instr) || combine_output_conversion(ctx, instr))
|
2021-06-09 10:14:54 +02:00
|
|
|
;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
apply_insert(ctx, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
if (instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_mix_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_fma_mixlo_f16)
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
return combine_vop3p(ctx, instr);
|
2020-09-03 12:02:55 +01:00
|
|
|
|
2021-07-19 14:26:42 +01:00
|
|
|
if (instr->isSDWA() || instr->isDPP())
|
2019-12-05 14:12:39 +00:00
|
|
|
return;
|
|
|
|
|
|
2021-10-05 13:09:02 +01:00
|
|
|
if (instr->opcode == aco_opcode::p_extract) {
|
|
|
|
|
ssa_info& info = ctx.info[instr->operands[0].tempId()];
|
|
|
|
|
if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) {
|
|
|
|
|
apply_extract(ctx, instr, 0, info);
|
|
|
|
|
if (--ctx.uses[instr->operands[0].tempId()])
|
|
|
|
|
ctx.uses[info.instr->operands[0].tempId()]++;
|
|
|
|
|
instr->operands[0].setTemp(info.instr->operands[0].getTemp());
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-27 15:45:59 +02:00
|
|
|
apply_ds_extract(ctx, instr);
|
2021-10-05 13:09:02 +01:00
|
|
|
}
|
2021-08-27 15:45:59 +02:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* TODO: There are still some peephole optimizations that could be done:
|
|
|
|
|
* - abs(a - b) -> s_absdiff_i32
|
|
|
|
|
* - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
|
|
|
|
|
* - patterns for v_alignbit_b32 and v_alignbyte_b32
|
|
|
|
|
* These probably aren't too interesting though.
|
|
|
|
|
* There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
|
|
|
|
|
* probably more useful than the previously mentioned optimizations.
|
|
|
|
|
* The various comparison optimizations also currently only work with 32-bit
|
|
|
|
|
* floats. */
|
|
|
|
|
|
2022-01-28 13:47:16 +00:00
|
|
|
/* neg(mul(a, b)) -> mul(neg(a), b), abs(mul(a, b)) -> mul(abs(a), abs(b)) */
|
|
|
|
|
if ((ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)) &&
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.uses[instr->operands[1].tempId()] == 1) {
|
2019-09-17 13:22:17 +02:00
|
|
|
Temp val = ctx.info[instr->definitions[0].tempId()].temp;
|
|
|
|
|
|
|
|
|
|
if (!ctx.info[val.id()].is_mul())
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
Instruction* mul_instr = ctx.info[val.id()].instr;
|
|
|
|
|
|
|
|
|
|
if (mul_instr->operands[0].isLiteral())
|
|
|
|
|
return;
|
2021-01-21 16:13:34 +00:00
|
|
|
if (mul_instr->isVOP3() && mul_instr->vop3().clamp)
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
if (mul_instr->isSDWA() || mul_instr->isDPP() || mul_instr->isVOP3P())
|
2019-12-05 14:12:39 +00:00
|
|
|
return;
|
2021-09-21 17:03:05 +01:00
|
|
|
if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32 &&
|
|
|
|
|
ctx.fp_mode.preserve_signed_zero_inf_nan32)
|
|
|
|
|
return;
|
2022-01-31 18:28:59 +00:00
|
|
|
if (mul_instr->definitions[0].bytes() != instr->definitions[0].bytes())
|
|
|
|
|
return;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-01-28 13:47:16 +00:00
|
|
|
/* convert to mul(neg(a), b), mul(abs(a), abs(b)) or mul(neg(abs(a)), abs(b)) */
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[mul_instr->definitions[0].tempId()]--;
|
|
|
|
|
Definition def = instr->definitions[0];
|
2022-01-28 13:47:16 +00:00
|
|
|
bool is_neg = ctx.info[instr->definitions[0].tempId()].is_neg();
|
2019-09-17 13:22:17 +02:00
|
|
|
bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
|
2021-06-09 10:14:54 +02:00
|
|
|
instr.reset(
|
|
|
|
|
create_instruction<VOP3_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1));
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[0] = mul_instr->operands[0];
|
|
|
|
|
instr->operands[1] = mul_instr->operands[1];
|
|
|
|
|
instr->definitions[0] = def;
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& new_mul = instr->vop3();
|
2019-09-17 13:22:17 +02:00
|
|
|
if (mul_instr->isVOP3()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& mul = mul_instr->vop3();
|
2022-01-28 13:48:34 +00:00
|
|
|
new_mul.neg[0] = mul.neg[0];
|
|
|
|
|
new_mul.neg[1] = mul.neg[1];
|
|
|
|
|
new_mul.abs[0] = mul.abs[0];
|
|
|
|
|
new_mul.abs[1] = mul.abs[1];
|
2021-01-21 16:13:34 +00:00
|
|
|
new_mul.omod = mul.omod;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2022-01-28 13:48:34 +00:00
|
|
|
if (is_abs) {
|
|
|
|
|
new_mul.neg[0] = new_mul.neg[1] = false;
|
|
|
|
|
new_mul.abs[0] = new_mul.abs[1] = true;
|
|
|
|
|
}
|
2022-01-28 13:47:16 +00:00
|
|
|
new_mul.neg[0] ^= is_neg;
|
2021-01-21 16:13:34 +00:00
|
|
|
new_mul.clamp = false;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
|
|
|
|
|
return;
|
|
|
|
|
}
|
2020-05-15 14:03:15 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* combine mul+add -> mad */
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
bool is_add_mix =
|
|
|
|
|
(instr->opcode == aco_opcode::v_fma_mix_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fma_mixlo_f16) &&
|
|
|
|
|
!instr->vop3p().neg_lo[0] &&
|
|
|
|
|
((instr->operands[0].constantEquals(0x3f800000) && (instr->vop3p().opsel_hi & 0x1) == 0) ||
|
|
|
|
|
(instr->operands[0].constantEquals(0x3C00) && (instr->vop3p().opsel_hi & 0x1) &&
|
|
|
|
|
!(instr->vop3p().opsel_lo & 0x1)));
|
2021-06-09 10:14:54 +02:00
|
|
|
bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 ||
|
2020-05-15 14:03:15 +01:00
|
|
|
instr->opcode == aco_opcode::v_subrev_f32;
|
2021-06-09 10:14:54 +02:00
|
|
|
bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 ||
|
2020-05-14 21:09:36 +01:00
|
|
|
instr->opcode == aco_opcode::v_subrev_f16;
|
2021-06-30 19:20:49 +02:00
|
|
|
bool mad64 = instr->opcode == aco_opcode::v_add_f64;
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
if (is_add_mix || mad16 || mad32 || mad64) {
|
2019-09-17 13:22:17 +02:00
|
|
|
Instruction* mul_instr = nullptr;
|
2020-09-03 12:02:55 +01:00
|
|
|
unsigned add_op_idx = 0;
|
2020-09-02 15:19:21 +01:00
|
|
|
uint32_t uses = UINT32_MAX;
|
2022-01-17 17:33:25 +00:00
|
|
|
bool emit_fma = false;
|
2019-09-17 13:22:17 +02:00
|
|
|
/* find the 'best' mul instruction to combine with the add */
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
for (unsigned i = is_add_mix ? 1 : 0; i < instr->operands.size(); i++) {
|
2020-09-02 15:19:21 +01:00
|
|
|
if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul())
|
|
|
|
|
continue;
|
|
|
|
|
ssa_info& info = ctx.info[instr->operands[i].tempId()];
|
|
|
|
|
|
|
|
|
|
/* no clamp/omod allowed between mul and add */
|
2021-01-21 16:13:34 +00:00
|
|
|
if (info.instr->isVOP3() && (info.instr->vop3().clamp || info.instr->vop3().omod))
|
2020-09-02 15:19:21 +01:00
|
|
|
continue;
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
if (info.instr->isVOP3P() && info.instr->vop3p().clamp)
|
|
|
|
|
continue;
|
|
|
|
|
/* v_fma_mix_f32/etc can't do omod */
|
|
|
|
|
if (info.instr->isVOP3P() && instr->isVOP3() && instr->vop3().omod)
|
|
|
|
|
continue;
|
|
|
|
|
/* don't promote fp16 to fp32 or remove fp32->fp16->fp32 conversions */
|
|
|
|
|
if (is_add_mix && info.instr->definitions[0].bytes() == 2)
|
aco: use v_fma_mix to combine mul/add/fma input conversions
fossil-db (Sienna Cichlid):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 829392 -> 825200 (-0.51%); split: -0.52%, +0.02%
SpillSGPRs: 7845 -> 8399 (+7.06%)
CodeSize: 101822704 -> 101677172 (-0.14%); split: -0.25%, +0.11%
MaxWaves: 172216 -> 173182 (+0.56%); split: +0.59%, -0.03%
Instrs: 19061343 -> 18883450 (-0.93%); split: -0.93%, +0.00%
Latency: 256011590 -> 255177378 (-0.33%); split: -0.39%, +0.06%
InvThroughput: 46104438 -> 45604059 (-1.09%); split: -1.12%, +0.04%
VClause: 352211 -> 351948 (-0.07%); split: -0.21%, +0.13%
SClause: 676506 -> 676961 (+0.07%); split: -0.04%, +0.11%
Copies: 1246571 -> 1237745 (-0.71%); split: -0.97%, +0.26%
Branches: 626229 -> 626241 (+0.00%); split: -0.02%, +0.03%
PreSGPRs: 882176 -> 888853 (+0.76%); split: -0.00%, +0.76%
PreVGPRs: 796705 -> 792304 (-0.55%); split: -0.56%, +0.00%
fossil-db (Navi):
Totals from 11558 (8.57% of 134913) affected shaders:
VGPRs: 803900 -> 798660 (-0.65%); split: -0.73%, +0.08%
SpillSGPRs: 7894 -> 8492 (+7.58%); split: -0.10%, +7.68%
CodeSize: 96892596 -> 97134716 (+0.25%); split: -0.05%, +0.29%
MaxWaves: 181454 -> 183014 (+0.86%); split: +0.94%, -0.08%
Instrs: 18186813 -> 18093994 (-0.51%); split: -0.56%, +0.05%
Latency: 253385909 -> 253325528 (-0.02%); split: -0.15%, +0.12%
InvThroughput: 43315355 -> 42805541 (-1.18%); split: -1.33%, +0.15%
VClause: 338755 -> 338535 (-0.06%); split: -0.16%, +0.10%
SClause: 656561 -> 656829 (+0.04%); split: -0.07%, +0.11%
Copies: 1162235 -> 1153558 (-0.75%); split: -1.07%, +0.32%
Branches: 588536 -> 588542 (+0.00%); split: -0.03%, +0.03%
PreSGPRs: 854849 -> 861640 (+0.79%); split: -0.00%, +0.80%
PreVGPRs: 783401 -> 779031 (-0.56%); split: -0.56%, +0.00%
fossil-db (Vega):
Totals from 11516 (8.53% of 135048) affected shaders:
SGPRs: 1072128 -> 1076288 (+0.39%); split: -0.01%, +0.40%
VGPRs: 821312 -> 818124 (-0.39%); split: -0.43%, +0.04%
SpillSGPRs: 11952 -> 12677 (+6.07%)
CodeSize: 96378496 -> 96707596 (+0.34%); split: -0.04%, +0.38%
MaxWaves: 42614 -> 42883 (+0.63%); split: +0.68%, -0.04%
Instrs: 18672844 -> 18600274 (-0.39%); split: -0.44%, +0.05%
Latency: 296658786 -> 296338296 (-0.11%); split: -0.21%, +0.10%
InvThroughput: 111665547 -> 111283559 (-0.34%); split: -0.40%, +0.06%
VClause: 343001 -> 342826 (-0.05%); split: -0.14%, +0.09%
SClause: 646684 -> 646657 (-0.00%); split: -0.05%, +0.04%
Copies: 1715316 -> 1712895 (-0.14%); split: -0.53%, +0.39%
PreSGPRs: 850737 -> 856543 (+0.68%); split: -0.04%, +0.72%
PreVGPRs: 775293 -> 772215 (-0.40%); split: -0.41%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 13:58:34 +00:00
|
|
|
continue;
|
2020-09-02 15:19:21 +01:00
|
|
|
|
2022-01-31 18:22:58 +00:00
|
|
|
if (get_operand_size(instr, i) != info.instr->definitions[0].bytes() * 8)
|
|
|
|
|
continue;
|
|
|
|
|
|
2021-09-21 17:03:05 +01:00
|
|
|
bool legacy = info.instr->opcode == aco_opcode::v_mul_legacy_f32;
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
bool mad_mix = is_add_mix || info.instr->isVOP3P();
|
2022-01-17 17:33:25 +00:00
|
|
|
|
2022-04-11 17:51:42 +01:00
|
|
|
/* Multiplication by power-of-two should never need rounding. 1/power-of-two also works,
|
|
|
|
|
* but using fma removes denormal flushing (0xfffffe * 0.5 + 0x810001a2).
|
|
|
|
|
*/
|
|
|
|
|
bool is_fma_precise = is_pow_of_two(ctx, info.instr->operands[0]) ||
|
|
|
|
|
is_pow_of_two(ctx, info.instr->operands[1]);
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
bool has_fma = mad16 || mad64 || (legacy && ctx.program->gfx_level >= GFX10_3) ||
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
(mad32 && !legacy && !mad_mix && ctx.program->dev.has_fast_fma32) ||
|
|
|
|
|
(mad_mix && ctx.program->dev.fused_mad_mix);
|
|
|
|
|
bool has_mad = mad_mix ? !ctx.program->dev.fused_mad_mix
|
2022-05-12 02:50:17 -04:00
|
|
|
: ((mad32 && ctx.program->gfx_level < GFX10_3) ||
|
|
|
|
|
(mad16 && ctx.program->gfx_level <= GFX9));
|
2022-04-11 17:51:42 +01:00
|
|
|
bool can_use_fma =
|
|
|
|
|
has_fma &&
|
|
|
|
|
(!(info.instr->definitions[0].isPrecise() || instr->definitions[0].isPrecise()) ||
|
|
|
|
|
is_fma_precise);
|
2022-01-17 17:33:25 +00:00
|
|
|
bool can_use_mad =
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
has_mad && (mad_mix || mad32 ? ctx.fp_mode.denorm32 : ctx.fp_mode.denorm16_64) == 0;
|
|
|
|
|
if (mad_mix && legacy)
|
|
|
|
|
continue;
|
2022-01-17 17:33:25 +00:00
|
|
|
if (!can_use_fma && !can_use_mad)
|
2021-09-21 17:03:05 +01:00
|
|
|
continue;
|
|
|
|
|
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
unsigned candidate_add_op_idx = is_add_mix ? (3 - i) : (1 - i);
|
|
|
|
|
Operand op[3] = {info.instr->operands[0], info.instr->operands[1],
|
|
|
|
|
instr->operands[candidate_add_op_idx]};
|
2021-07-19 14:26:42 +01:00
|
|
|
if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) ||
|
aco: use more predictable tiebreaker when forming MADs
fossil-db (GFX10.3):
Totals from 84981 (58.10% of 146267) affected shaders:
VGPRs: 3829896 -> 3820480 (-0.25%); split: -0.33%, +0.08%
CodeSize: 270860472 -> 270850132 (-0.00%); split: -0.08%, +0.08%
MaxWaves: 2035822 -> 2042516 (+0.33%); split: +0.39%, -0.06%
Instrs: 51285526 -> 51308869 (+0.05%); split: -0.03%, +0.08%
Latency: 931503706 -> 932556231 (+0.11%); split: -0.19%, +0.30%
InvThroughput: 217084232 -> 217070849 (-0.01%); split: -0.12%, +0.11%
fossil-db (GFX10):
Totals from 85520 (58.47% of 146267) affected shaders:
VGPRs: 3729132 -> 3725344 (-0.10%); split: -0.21%, +0.10%
CodeSize: 272796500 -> 272783084 (-0.00%); split: -0.09%, +0.08%
MaxWaves: 2246410 -> 2249012 (+0.12%); split: +0.17%, -0.05%
Instrs: 51643962 -> 51664865 (+0.04%); split: -0.04%, +0.08%
Latency: 932331949 -> 933274979 (+0.10%); split: -0.19%, +0.29%
InvThroughput: 214187040 -> 214130994 (-0.03%); split: -0.13%, +0.11%
fossil-db (GFX9):
Totals from 84619 (57.80% of 146401) affected shaders:
SGPRs: 5366240 -> 5366944 (+0.01%); split: -0.09%, +0.10%
VGPRs: 3765608 -> 3764972 (-0.02%); split: -0.23%, +0.22%
CodeSize: 263634732 -> 263616320 (-0.01%); split: -0.08%, +0.08%
MaxWaves: 546617 -> 547091 (+0.09%); split: +0.18%, -0.09%
Instrs: 51426195 -> 51458334 (+0.06%); split: -0.03%, +0.10%
Latency: 1164445660 -> 1161923480 (-0.22%); split: -0.46%, +0.24%
InvThroughput: 542964697 -> 542329595 (-0.12%); split: -0.26%, +0.14%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9805>
2021-03-18 11:33:41 +00:00
|
|
|
ctx.uses[instr->operands[i].tempId()] > uses)
|
2020-09-02 15:19:21 +01:00
|
|
|
continue;
|
|
|
|
|
|
aco: use more predictable tiebreaker when forming MADs
fossil-db (GFX10.3):
Totals from 84981 (58.10% of 146267) affected shaders:
VGPRs: 3829896 -> 3820480 (-0.25%); split: -0.33%, +0.08%
CodeSize: 270860472 -> 270850132 (-0.00%); split: -0.08%, +0.08%
MaxWaves: 2035822 -> 2042516 (+0.33%); split: +0.39%, -0.06%
Instrs: 51285526 -> 51308869 (+0.05%); split: -0.03%, +0.08%
Latency: 931503706 -> 932556231 (+0.11%); split: -0.19%, +0.30%
InvThroughput: 217084232 -> 217070849 (-0.01%); split: -0.12%, +0.11%
fossil-db (GFX10):
Totals from 85520 (58.47% of 146267) affected shaders:
VGPRs: 3729132 -> 3725344 (-0.10%); split: -0.21%, +0.10%
CodeSize: 272796500 -> 272783084 (-0.00%); split: -0.09%, +0.08%
MaxWaves: 2246410 -> 2249012 (+0.12%); split: +0.17%, -0.05%
Instrs: 51643962 -> 51664865 (+0.04%); split: -0.04%, +0.08%
Latency: 932331949 -> 933274979 (+0.10%); split: -0.19%, +0.29%
InvThroughput: 214187040 -> 214130994 (-0.03%); split: -0.13%, +0.11%
fossil-db (GFX9):
Totals from 84619 (57.80% of 146401) affected shaders:
SGPRs: 5366240 -> 5366944 (+0.01%); split: -0.09%, +0.10%
VGPRs: 3765608 -> 3764972 (-0.02%); split: -0.23%, +0.22%
CodeSize: 263634732 -> 263616320 (-0.01%); split: -0.08%, +0.08%
MaxWaves: 546617 -> 547091 (+0.09%); split: +0.18%, -0.09%
Instrs: 51426195 -> 51458334 (+0.06%); split: -0.03%, +0.10%
Latency: 1164445660 -> 1161923480 (-0.22%); split: -0.46%, +0.24%
InvThroughput: 542964697 -> 542329595 (-0.12%); split: -0.26%, +0.14%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9805>
2021-03-18 11:33:41 +00:00
|
|
|
if (ctx.uses[instr->operands[i].tempId()] == uses) {
|
|
|
|
|
unsigned cur_idx = mul_instr->definitions[0].tempId();
|
|
|
|
|
unsigned new_idx = info.instr->definitions[0].tempId();
|
|
|
|
|
if (cur_idx > new_idx)
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-02 15:19:21 +01:00
|
|
|
mul_instr = info.instr;
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
add_op_idx = candidate_add_op_idx;
|
2020-09-02 15:19:21 +01:00
|
|
|
uses = ctx.uses[instr->operands[i].tempId()];
|
2022-01-17 17:33:25 +00:00
|
|
|
emit_fma = !can_use_mad;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-09-02 15:19:21 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
if (mul_instr) {
|
2020-09-02 15:19:21 +01:00
|
|
|
/* turn mul+add into v_mad/v_fma */
|
2021-06-09 10:14:54 +02:00
|
|
|
Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1],
|
|
|
|
|
instr->operands[add_op_idx]};
|
2020-09-02 15:19:21 +01:00
|
|
|
ctx.uses[mul_instr->definitions[0].tempId()]--;
|
|
|
|
|
if (ctx.uses[mul_instr->definitions[0].tempId()]) {
|
|
|
|
|
if (op[0].isTemp())
|
|
|
|
|
ctx.uses[op[0].tempId()]++;
|
|
|
|
|
if (op[1].isTemp())
|
|
|
|
|
ctx.uses[op[1].tempId()]++;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
bool neg[3] = {false, false, false};
|
|
|
|
|
bool abs[3] = {false, false, false};
|
|
|
|
|
unsigned omod = 0;
|
|
|
|
|
bool clamp = false;
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
uint8_t opsel_lo = 0;
|
|
|
|
|
uint8_t opsel_hi = 0;
|
2019-11-22 14:50:41 +00:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
if (mul_instr->isVOP3()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& vop3 = mul_instr->vop3();
|
|
|
|
|
neg[0] = vop3.neg[0];
|
|
|
|
|
neg[1] = vop3.neg[1];
|
|
|
|
|
abs[0] = vop3.abs[0];
|
|
|
|
|
abs[1] = vop3.abs[1];
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
} else if (mul_instr->isVOP3P()) {
|
|
|
|
|
VOP3P_instruction& vop3p = mul_instr->vop3p();
|
|
|
|
|
neg[0] = vop3p.neg_lo[0];
|
|
|
|
|
neg[1] = vop3p.neg_lo[1];
|
|
|
|
|
abs[0] = vop3p.neg_hi[0];
|
|
|
|
|
abs[1] = vop3p.neg_hi[1];
|
|
|
|
|
opsel_lo = vop3p.opsel_lo & 0x3;
|
|
|
|
|
opsel_hi = vop3p.opsel_hi & 0x3;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->isVOP3()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& vop3 = instr->vop3();
|
|
|
|
|
neg[2] = vop3.neg[add_op_idx];
|
|
|
|
|
abs[2] = vop3.abs[add_op_idx];
|
|
|
|
|
omod = vop3.omod;
|
|
|
|
|
clamp = vop3.clamp;
|
2019-09-17 13:22:17 +02:00
|
|
|
/* abs of the multiplication result */
|
2021-01-21 16:13:34 +00:00
|
|
|
if (vop3.abs[1 - add_op_idx]) {
|
2019-09-17 13:22:17 +02:00
|
|
|
neg[0] = false;
|
|
|
|
|
neg[1] = false;
|
|
|
|
|
abs[0] = true;
|
|
|
|
|
abs[1] = true;
|
|
|
|
|
}
|
|
|
|
|
/* neg of the multiplication result */
|
2021-01-21 16:13:34 +00:00
|
|
|
neg[1] = neg[1] ^ vop3.neg[1 - add_op_idx];
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
} else if (instr->isVOP3P()) {
|
|
|
|
|
VOP3P_instruction& vop3p = instr->vop3p();
|
|
|
|
|
neg[2] = vop3p.neg_lo[add_op_idx];
|
|
|
|
|
abs[2] = vop3p.neg_hi[add_op_idx];
|
|
|
|
|
opsel_lo |= vop3p.opsel_lo & (1 << add_op_idx) ? 0x4 : 0x0;
|
|
|
|
|
opsel_hi |= vop3p.opsel_hi & (1 << add_op_idx) ? 0x4 : 0x0;
|
|
|
|
|
clamp = vop3p.clamp;
|
|
|
|
|
/* abs of the multiplication result */
|
|
|
|
|
if (vop3p.neg_hi[3 - add_op_idx]) {
|
|
|
|
|
neg[0] = false;
|
|
|
|
|
neg[1] = false;
|
|
|
|
|
abs[0] = true;
|
|
|
|
|
abs[1] = true;
|
|
|
|
|
}
|
|
|
|
|
/* neg of the multiplication result */
|
|
|
|
|
neg[1] = neg[1] ^ vop3p.neg_lo[3 - add_op_idx];
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
|
2020-05-14 21:09:36 +01:00
|
|
|
if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
|
2019-09-17 13:22:17 +02:00
|
|
|
neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
|
2021-06-09 10:14:54 +02:00
|
|
|
else if (instr->opcode == aco_opcode::v_subrev_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_subrev_f16)
|
2019-09-17 13:22:17 +02:00
|
|
|
neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
|
|
|
|
|
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
aco_ptr<Instruction> add_instr = std::move(instr);
|
|
|
|
|
if (add_instr->isVOP3P() || mul_instr->isVOP3P()) {
|
|
|
|
|
assert(!omod);
|
|
|
|
|
|
|
|
|
|
aco_opcode mad_op = add_instr->definitions[0].bytes() == 2 ? aco_opcode::v_fma_mixlo_f16
|
|
|
|
|
: aco_opcode::v_fma_mix_f32;
|
|
|
|
|
aco_ptr<VOP3P_instruction> mad{
|
|
|
|
|
create_instruction<VOP3P_instruction>(mad_op, Format::VOP3P, 3, 1)};
|
|
|
|
|
for (unsigned i = 0; i < 3; i++) {
|
|
|
|
|
mad->operands[i] = op[i];
|
|
|
|
|
mad->neg_lo[i] = neg[i];
|
|
|
|
|
mad->neg_hi[i] = abs[i];
|
|
|
|
|
}
|
|
|
|
|
mad->clamp = clamp;
|
|
|
|
|
mad->opsel_lo = opsel_lo;
|
|
|
|
|
mad->opsel_hi = opsel_hi;
|
2020-05-15 14:03:15 +01:00
|
|
|
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
instr = std::move(mad);
|
|
|
|
|
} else {
|
|
|
|
|
aco_opcode mad_op = emit_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
|
|
|
|
|
if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32) {
|
2022-05-12 02:50:17 -04:00
|
|
|
assert(emit_fma == (ctx.program->gfx_level >= GFX10_3));
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
mad_op = emit_fma ? aco_opcode::v_fma_legacy_f32 : aco_opcode::v_mad_legacy_f32;
|
|
|
|
|
} else if (mad16) {
|
2022-05-12 02:50:17 -04:00
|
|
|
mad_op = emit_fma ? (ctx.program->gfx_level == GFX8 ? aco_opcode::v_fma_legacy_f16
|
|
|
|
|
: aco_opcode::v_fma_f16)
|
|
|
|
|
: (ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_f16
|
|
|
|
|
: aco_opcode::v_mad_f16);
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
} else if (mad64) {
|
|
|
|
|
mad_op = aco_opcode::v_fma_f64;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aco_ptr<VOP3_instruction> mad{
|
|
|
|
|
create_instruction<VOP3_instruction>(mad_op, Format::VOP3, 3, 1)};
|
|
|
|
|
for (unsigned i = 0; i < 3; i++) {
|
|
|
|
|
mad->operands[i] = op[i];
|
|
|
|
|
mad->neg[i] = neg[i];
|
|
|
|
|
mad->abs[i] = abs[i];
|
|
|
|
|
}
|
|
|
|
|
mad->omod = omod;
|
|
|
|
|
mad->clamp = clamp;
|
|
|
|
|
|
|
|
|
|
instr = std::move(mad);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
instr->definitions[0] = add_instr->definitions[0];
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* mark this ssa_def to be re-checked for profitability and literals */
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
ctx.mad_infos.emplace_back(std::move(add_instr), mul_instr->definitions[0].tempId());
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_mad(instr.get(), ctx.mad_infos.size() - 1);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
/* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
|
2021-09-21 17:03:05 +01:00
|
|
|
else if (((instr->opcode == aco_opcode::v_mul_f32 &&
|
|
|
|
|
!ctx.fp_mode.preserve_signed_zero_inf_nan32) ||
|
|
|
|
|
instr->opcode == aco_opcode::v_mul_legacy_f32) &&
|
2021-10-19 10:43:03 +01:00
|
|
|
!instr->usesModifiers() && !ctx.fp_mode.must_flush_denorms32) {
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() &&
|
|
|
|
|
instr->operands[!i].getTemp().type() == RegType::vgpr) {
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<VOP2_instruction> new_instr{
|
|
|
|
|
create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
|
2021-07-13 11:22:46 +02:00
|
|
|
new_instr->operands[0] = Operand::zero();
|
2019-09-17 13:22:17 +02:00
|
|
|
new_instr->operands[1] = instr->operands[!i];
|
|
|
|
|
new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
|
|
|
|
|
new_instr->definitions[0] = instr->definitions[0];
|
2021-09-16 20:50:29 +02:00
|
|
|
instr = std::move(new_instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[instr->definitions[0].tempId()].label = 0;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->gfx_level >= GFX9) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012",
|
|
|
|
|
1 | 2)) {
|
|
|
|
|
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32,
|
|
|
|
|
"012", 1 | 2)) {
|
|
|
|
|
} else if (combine_add_or_then_and_lshl(ctx, instr)) {
|
|
|
|
|
}
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->gfx_level >= GFX10) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012",
|
|
|
|
|
1 | 2)) {
|
|
|
|
|
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32,
|
|
|
|
|
"012", 1 | 2)) {
|
|
|
|
|
}
|
2021-07-01 18:48:09 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::v_add_u16) {
|
|
|
|
|
combine_three_valu_op(
|
|
|
|
|
ctx, instr, aco_opcode::v_mul_lo_u16,
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_u16 : aco_opcode::v_mad_u16,
|
2021-07-01 18:48:09 +02:00
|
|
|
"120", 1 | 2);
|
|
|
|
|
} else if (instr->opcode == aco_opcode::v_add_u16_e64) {
|
|
|
|
|
combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16_e64, aco_opcode::v_mad_u16, "120",
|
|
|
|
|
1 | 2);
|
2020-04-02 17:41:36 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::v_add_u32) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
|
|
|
|
|
} else if (combine_add_bcnt(ctx, instr)) {
|
|
|
|
|
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
|
|
|
|
|
aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (ctx.program->gfx_level >= GFX9 && !instr->usesModifiers()) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",
|
|
|
|
|
1 | 2)) {
|
|
|
|
|
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,
|
|
|
|
|
"120", 1 | 2)) {
|
|
|
|
|
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32,
|
|
|
|
|
"012", 1 | 2)) {
|
|
|
|
|
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,
|
|
|
|
|
"012", 1 | 2)) {
|
|
|
|
|
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
|
|
|
|
|
"012", 1 | 2)) {
|
|
|
|
|
} else if (combine_add_or_then_and_lshl(ctx, instr)) {
|
|
|
|
|
}
|
2020-04-02 17:41:36 +02:00
|
|
|
}
|
|
|
|
|
} else if (instr->opcode == aco_opcode::v_add_co_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_add_co_u32_e64) {
|
2020-06-05 17:36:29 +01:00
|
|
|
bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
|
|
|
|
|
} else if (!carry_out && combine_add_bcnt(ctx, instr)) {
|
|
|
|
|
} else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
|
|
|
|
|
aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
|
2021-09-09 08:38:41 +02:00
|
|
|
} else if (!carry_out && combine_add_lshl(ctx, instr, false)) {
|
2021-06-09 10:14:54 +02:00
|
|
|
}
|
|
|
|
|
} else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 ||
|
2020-04-02 17:41:36 +02:00
|
|
|
instr->opcode == aco_opcode::v_sub_co_u32_e64) {
|
2021-09-28 17:11:28 +01:00
|
|
|
bool carry_out =
|
|
|
|
|
instr->opcode != aco_opcode::v_sub_u32 && ctx.uses[instr->definitions[1].tempId()] > 0;
|
2021-09-09 08:38:41 +02:00
|
|
|
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2)) {
|
|
|
|
|
} else if (!carry_out && combine_add_lshl(ctx, instr, true)) {
|
|
|
|
|
}
|
2020-04-02 17:41:36 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::v_subrev_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_subrev_co_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
|
|
|
|
|
combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->gfx_level >= GFX9) {
|
2021-06-09 10:14:54 +02:00
|
|
|
combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120",
|
|
|
|
|
2);
|
|
|
|
|
} else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) &&
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.program->gfx_level >= GFX9) {
|
2019-09-17 13:22:17 +02:00
|
|
|
combine_salu_lshl_add(ctx, instr);
|
2019-12-16 15:35:14 +00:00
|
|
|
} else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
|
2022-08-20 22:55:45 +02:00
|
|
|
if (!combine_salu_not_bitwise(ctx, instr))
|
|
|
|
|
combine_inverse_comparison(ctx, instr);
|
2019-12-03 13:37:49 +00:00
|
|
|
} else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (combine_ordering_test(ctx, instr)) {
|
|
|
|
|
} else if (combine_comparison_ordering(ctx, instr)) {
|
|
|
|
|
} else if (combine_constant_comparison_ordering(ctx, instr)) {
|
|
|
|
|
} else if (combine_salu_n2(ctx, instr)) {
|
|
|
|
|
}
|
2022-09-23 19:00:36 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::s_abs_i32) {
|
|
|
|
|
combine_sabsdiff(ctx, instr);
|
aco: Combine bit test to s_bitcmp.
Foz-DB Navi21:
Totals from 6396 (4.74% of 134913) affected shaders:
VGPRs: 483280 -> 483152 (-0.03%); split: -0.03%, +0.01%
SpillSGPRs: 8119 -> 7941 (-2.19%)
CodeSize: 63377880 -> 63268556 (-0.17%); split: -0.20%, +0.03%
MaxWaves: 86778 -> 86810 (+0.04%)
Instrs: 11745621 -> 11725857 (-0.17%); split: -0.20%, +0.03%
Latency: 162400148 -> 162282230 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 29179429 -> 29133173 (-0.16%); split: -0.16%, +0.00%
VClause: 208032 -> 208100 (+0.03%); split: -0.01%, +0.05%
SClause: 431390 -> 430849 (-0.13%); split: -0.24%, +0.11%
Copies: 896222 -> 893285 (-0.33%); split: -0.62%, +0.30%
Branches: 349806 -> 348770 (-0.30%); split: -0.90%, +0.60%
PreSGPRs: 618908 -> 613773 (-0.83%); split: -0.83%, +0.00%
PreVGPRs: 482901 -> 482893 (-0.00%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18870>
2022-09-24 18:49:10 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::s_cmp_lg_i32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_cmp_lg_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_cmp_lg_u64 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_cmp_eq_i32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_cmp_eq_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_cmp_eq_u64) {
|
|
|
|
|
combine_s_bitcmp(ctx, instr);
|
aco: optimize v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc)
fossils-db (Vega10):
Totals from 7786 (5.70% of 136546) affected shaders:
SGPRs: 517778 -> 518626 (+0.16%); split: -0.01%, +0.17%
VGPRs: 488252 -> 488084 (-0.03%); split: -0.04%, +0.01%
CodeSize: 42282068 -> 42250152 (-0.08%); split: -0.16%, +0.09%
MaxWaves: 35697 -> 35716 (+0.05%); split: +0.06%, -0.01%
Instrs: 8319309 -> 8304792 (-0.17%); split: -0.18%, +0.00%
Cycles: 88619440 -> 88489636 (-0.15%); split: -0.16%, +0.01%
VMEM: 2788278 -> 2780431 (-0.28%); split: +0.06%, -0.35%
SMEM: 570364 -> 569370 (-0.17%); split: +0.12%, -0.30%
VClause: 144906 -> 144908 (+0.00%); split: -0.05%, +0.05%
SClause: 302143 -> 302055 (-0.03%); split: -0.04%, +0.01%
Copies: 579124 -> 578779 (-0.06%); split: -0.14%, +0.08%
PreSGPRs: 327695 -> 328845 (+0.35%); split: -0.00%, +0.35%
PreVGPRs: 434280 -> 433954 (-0.08%)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7438>
2020-11-03 18:50:32 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::v_and_b32) {
|
|
|
|
|
combine_and_subbrev(ctx, instr);
|
2020-06-16 18:04:21 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) {
|
|
|
|
|
/* set existing v_fma_f32 with label_mad so we can create v_fmamk_f32/v_fmaak_f32.
|
|
|
|
|
* since ctx.uses[mad_info::mul_temp_id] is always 0, we don't have to worry about
|
|
|
|
|
* select_instruction() using mad_info::add_instr.
|
|
|
|
|
*/
|
|
|
|
|
ctx.mad_infos.emplace_back(nullptr, 0);
|
|
|
|
|
ctx.info[instr->definitions[0].tempId()].set_mad(instr.get(), ctx.mad_infos.size() - 1);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
aco_opcode min, max, min3, max3, med3, minmax;
|
2019-09-17 13:22:17 +02:00
|
|
|
bool some_gfx9_only;
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &minmax,
|
|
|
|
|
&some_gfx9_only) &&
|
2022-05-12 02:50:17 -04:00
|
|
|
(!some_gfx9_only || ctx.program->gfx_level >= GFX9)) {
|
2021-06-09 10:14:54 +02:00
|
|
|
if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
|
aco: use v_minmax/v_maxmin opcodes
fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
2022-11-16 17:42:20 +00:00
|
|
|
instr->opcode == min ? min3 : max3, minmax)) {
|
2021-06-09 10:14:54 +02:00
|
|
|
} else {
|
|
|
|
|
combine_clamp(ctx, instr, min, max, med3);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2020-01-16 19:32:31 +01:00
|
|
|
{
|
2021-08-25 12:13:39 +02:00
|
|
|
/* Check every operand to make sure they are suitable. */
|
|
|
|
|
for (Operand& op : instr->operands) {
|
|
|
|
|
if (!op.isTemp())
|
|
|
|
|
return false;
|
|
|
|
|
if (!ctx.info[op.tempId()].is_uniform_bool() && !ctx.info[op.tempId()].is_uniform_bitwise())
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2020-01-16 19:32:31 +01:00
|
|
|
switch (instr->opcode) {
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::s_and_b32:
|
|
|
|
|
case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break;
|
|
|
|
|
case aco_opcode::s_or_b32:
|
|
|
|
|
case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break;
|
|
|
|
|
case aco_opcode::s_xor_b32:
|
|
|
|
|
case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break;
|
|
|
|
|
default:
|
|
|
|
|
/* Don't transform other instructions. They are very unlikely to appear here. */
|
|
|
|
|
return false;
|
2020-01-16 19:32:31 +01:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
for (Operand& op : instr->operands) {
|
2020-01-16 19:32:31 +01:00
|
|
|
ctx.uses[op.tempId()]--;
|
|
|
|
|
|
|
|
|
|
if (ctx.info[op.tempId()].is_uniform_bool()) {
|
|
|
|
|
/* Just use the uniform boolean temp. */
|
|
|
|
|
op.setTemp(ctx.info[op.tempId()].temp);
|
|
|
|
|
} else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
|
|
|
|
|
/* Use the SCC definition of the predecessor instruction.
|
2021-06-09 10:14:54 +02:00
|
|
|
* This allows the predecessor to get picked up by the same optimization (if it has no
|
|
|
|
|
* divergent users), and it also makes sure that the current instruction will keep working
|
|
|
|
|
* even if the predecessor won't be transformed.
|
2020-01-16 19:32:31 +01:00
|
|
|
*/
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* pred_instr = ctx.info[op.tempId()].instr;
|
2020-01-16 19:32:31 +01:00
|
|
|
assert(pred_instr->definitions.size() >= 2);
|
2021-06-09 10:14:54 +02:00
|
|
|
assert(pred_instr->definitions[1].isFixed() &&
|
|
|
|
|
pred_instr->definitions[1].physReg() == scc);
|
2020-01-16 19:32:31 +01:00
|
|
|
op.setTemp(pred_instr->definitions[1].getTemp());
|
|
|
|
|
} else {
|
|
|
|
|
unreachable("Invalid operand on uniform bitwise instruction.");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ctx.uses[op.tempId()]++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
|
|
|
|
|
assert(instr->operands[0].regClass() == s1);
|
|
|
|
|
assert(instr->operands[1].regClass() == s1);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
const uint32_t threshold = 4;
|
|
|
|
|
|
2019-12-16 13:30:10 +00:00
|
|
|
if (is_dead(ctx.uses, instr.get())) {
|
2019-09-17 13:22:17 +02:00
|
|
|
instr.reset();
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-09 21:20:10 +00:00
|
|
|
/* convert split_vector into a copy or extract_vector if only one definition is ever used */
|
2019-09-17 13:22:17 +02:00
|
|
|
if (instr->opcode == aco_opcode::p_split_vector) {
|
|
|
|
|
unsigned num_used = 0;
|
|
|
|
|
unsigned idx = 0;
|
2020-04-10 13:09:54 +01:00
|
|
|
unsigned split_offset = 0;
|
2021-06-09 10:14:54 +02:00
|
|
|
for (unsigned i = 0, offset = 0; i < instr->definitions.size();
|
|
|
|
|
offset += instr->definitions[i++].bytes()) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.uses[instr->definitions[i].tempId()]) {
|
|
|
|
|
num_used++;
|
|
|
|
|
idx = i;
|
2020-04-10 13:09:54 +01:00
|
|
|
split_offset = offset;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2019-12-09 21:20:10 +00:00
|
|
|
bool done = false;
|
|
|
|
|
if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
|
|
|
|
|
ctx.uses[instr->operands[0].tempId()] == 1) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
|
2019-12-09 21:20:10 +00:00
|
|
|
|
|
|
|
|
unsigned off = 0;
|
|
|
|
|
Operand op;
|
|
|
|
|
for (Operand& vec_op : vec->operands) {
|
2020-04-10 13:09:54 +01:00
|
|
|
if (off == split_offset) {
|
2019-12-09 21:20:10 +00:00
|
|
|
op = vec_op;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-04-10 13:09:54 +01:00
|
|
|
off += vec_op.bytes();
|
2019-12-09 21:20:10 +00:00
|
|
|
}
|
2020-04-10 13:09:54 +01:00
|
|
|
if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
|
2019-12-09 21:20:10 +00:00
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
|
|
|
|
for (Operand& vec_op : vec->operands) {
|
|
|
|
|
if (vec_op.isTemp())
|
|
|
|
|
ctx.uses[vec_op.tempId()]--;
|
|
|
|
|
}
|
|
|
|
|
if (op.isTemp())
|
|
|
|
|
ctx.uses[op.tempId()]++;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
|
|
|
|
|
aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
|
2019-12-09 21:20:10 +00:00
|
|
|
extract->operands[0] = op;
|
|
|
|
|
extract->definitions[0] = instr->definitions[idx];
|
2021-09-16 20:50:29 +02:00
|
|
|
instr = std::move(extract);
|
2019-12-09 21:20:10 +00:00
|
|
|
|
|
|
|
|
done = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-10 13:09:54 +01:00
|
|
|
if (!done && num_used == 1 &&
|
|
|
|
|
instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
|
|
|
|
|
split_offset % instr->definitions[idx].bytes() == 0) {
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
|
|
|
|
|
aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
|
2019-09-17 13:22:17 +02:00
|
|
|
extract->operands[0] = instr->operands[0];
|
2021-07-13 11:22:46 +02:00
|
|
|
extract->operands[1] =
|
|
|
|
|
Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes());
|
2019-09-17 13:22:17 +02:00
|
|
|
extract->definitions[0] = instr->definitions[idx];
|
2021-09-16 20:50:29 +02:00
|
|
|
instr = std::move(extract);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-22 15:18:38 +00:00
|
|
|
mad_info* mad_info = NULL;
|
2020-05-15 14:03:15 +01:00
|
|
|
if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
|
2020-06-01 11:27:53 +01:00
|
|
|
mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
|
2019-11-22 15:18:38 +00:00
|
|
|
/* re-check mad instructions */
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (ctx.uses[mad_info->mul_temp_id] && mad_info->add_instr) {
|
2019-11-22 15:18:38 +00:00
|
|
|
ctx.uses[mad_info->mul_temp_id]++;
|
2019-11-20 19:09:25 +00:00
|
|
|
if (instr->operands[0].isTemp())
|
|
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
|
|
|
|
if (instr->operands[1].isTemp())
|
|
|
|
|
ctx.uses[instr->operands[1].tempId()]--;
|
2019-11-22 15:18:38 +00:00
|
|
|
instr.swap(mad_info->add_instr);
|
|
|
|
|
mad_info = NULL;
|
|
|
|
|
}
|
|
|
|
|
/* check literals */
|
aco: combine add/mul as v_fma_mix into fma
fossil-db (Sienna Cichlid):
Totals from 7345 (5.44% of 134913) affected shaders:
CodeSize: 73840060 -> 73768936 (-0.10%); split: -0.10%, +0.00%
Instrs: 13701603 -> 13684183 (-0.13%); split: -0.13%, +0.00%
Latency: 185389373 -> 185306538 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 33785020 -> 33757593 (-0.08%); split: -0.08%, +0.00%
VClause: 237337 -> 237338 (+0.00%)
SClause: 485728 -> 485720 (-0.00%)
Copies: 935900 -> 935279 (-0.07%); split: -0.07%, +0.00%
Branches: 480721 -> 480722 (+0.00%)
fossil-db (Navi):
Totals from 10649 (7.89% of 134913) affected shaders:
VGPRs: 756624 -> 756516 (-0.01%); split: -0.02%, +0.01%
CodeSize: 92156580 -> 91707900 (-0.49%); split: -0.49%, +0.00%
MaxWaves: 159402 -> 159476 (+0.05%); split: +0.07%, -0.02%
Instrs: 17155827 -> 17070449 (-0.50%); split: -0.50%, +0.00%
Latency: 246296456 -> 245487120 (-0.33%); split: -0.33%, +0.00%
InvThroughput: 41438159 -> 41117424 (-0.77%); split: -0.77%, +0.00%
VClause: 323790 -> 323867 (+0.02%); split: -0.00%, +0.03%
SClause: 612077 -> 612034 (-0.01%); split: -0.01%, +0.00%
Copies: 1103012 -> 1102775 (-0.02%); split: -0.03%, +0.01%
Branches: 555893 -> 555896 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 824372 -> 824378 (+0.00%)
PreVGPRs: 740390 -> 740363 (-0.00%); split: -0.01%, +0.01%
fossil-db (Vega):
Totals from 10950 (8.11% of 135048) affected shaders:
SGPRs: 1034528 -> 1034560 (+0.00%)
VGPRs: 794092 -> 794104 (+0.00%); split: -0.01%, +0.01%
CodeSize: 94409768 -> 93955568 (-0.48%); split: -0.48%, +0.00%
MaxWaves: 38950 -> 38939 (-0.03%); split: +0.00%, -0.03%
Instrs: 18162637 -> 18070934 (-0.50%); split: -0.51%, +0.00%
Latency: 291718455 -> 290772451 (-0.32%); split: -0.32%, +0.00%
InvThroughput: 109114674 -> 108489767 (-0.57%); split: -0.57%, +0.00%
VClause: 334498 -> 334579 (+0.02%); split: -0.01%, +0.03%
SClause: 628871 -> 628825 (-0.01%); split: -0.01%, +0.00%
Copies: 1674477 -> 1674850 (+0.02%); split: -0.02%, +0.04%
PreSGPRs: 834800 -> 834802 (+0.00%)
PreVGPRs: 750460 -> 750415 (-0.01%); split: -0.01%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
2022-01-17 17:48:33 +00:00
|
|
|
else if (!instr->usesModifiers() && !instr->isVOP3P() &&
|
|
|
|
|
instr->opcode != aco_opcode::v_fma_f64 &&
|
2021-09-21 17:03:05 +01:00
|
|
|
instr->opcode != aco_opcode::v_mad_legacy_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_fma_legacy_f32) {
|
2020-05-15 14:03:15 +01:00
|
|
|
/* FMA can only take literals on GFX10+ */
|
2020-05-14 21:09:36 +01:00
|
|
|
if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
|
2022-05-12 02:50:17 -04:00
|
|
|
ctx.program->gfx_level < GFX10)
|
2020-05-15 14:03:15 +01:00
|
|
|
return;
|
2021-07-21 20:18:12 +01:00
|
|
|
/* There are no v_fmaak_legacy_f16/v_fmamk_legacy_f16 and on chips where VOP3 can take
|
|
|
|
|
* literals (GFX10+), these instructions don't exist.
|
|
|
|
|
*/
|
|
|
|
|
if (instr->opcode == aco_opcode::v_fma_legacy_f16)
|
|
|
|
|
return;
|
2020-05-15 14:03:15 +01:00
|
|
|
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
uint32_t literal_mask = 0;
|
|
|
|
|
uint32_t sgpr_mask = 0;
|
|
|
|
|
uint32_t vgpr_mask = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
uint32_t literal_uses = UINT32_MAX;
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
uint32_t literal_value = 0;
|
2021-05-13 13:34:52 +01:00
|
|
|
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
/* Iterate in reverse to prefer v_madak/v_fmaak. */
|
|
|
|
|
for (int i = 2; i >= 0; i--) {
|
|
|
|
|
Operand& op = instr->operands[i];
|
|
|
|
|
if (!op.isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
if (ctx.info[op.tempId()].is_literal(get_operand_size(instr, i))) {
|
|
|
|
|
uint32_t new_literal = ctx.info[op.tempId()].val;
|
|
|
|
|
if (!literal_mask || literal_value == new_literal) {
|
|
|
|
|
literal_value = new_literal;
|
|
|
|
|
literal_uses = MIN2(literal_uses, ctx.uses[op.tempId()]);
|
|
|
|
|
literal_mask |= 1 << i;
|
2021-05-13 13:34:52 +01:00
|
|
|
continue;
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
}
|
2021-05-13 13:34:52 +01:00
|
|
|
}
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
sgpr_mask |= op.isOfType(RegType::sgpr) << i;
|
|
|
|
|
vgpr_mask |= op.isOfType(RegType::vgpr) << i;
|
2021-05-13 13:34:52 +01:00
|
|
|
}
|
|
|
|
|
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
/* The constant bus limitations before GFX10 disallow SGPRs. */
|
|
|
|
|
if (sgpr_mask && ctx.program->gfx_level < GFX10)
|
|
|
|
|
literal_mask = 0;
|
2021-05-13 13:34:52 +01:00
|
|
|
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
/* Encoding needs a vgpr. */
|
|
|
|
|
if (!vgpr_mask)
|
|
|
|
|
literal_mask = 0;
|
2021-05-13 13:34:52 +01:00
|
|
|
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
/* v_madmk/v_fmamk needs a vgpr in the third source. */
|
|
|
|
|
if (!(literal_mask & 0b100) && !(vgpr_mask & 0b100))
|
|
|
|
|
literal_mask = 0;
|
2020-04-01 18:09:43 +02:00
|
|
|
|
|
|
|
|
/* Limit the number of literals to apply to not increase the code
|
|
|
|
|
* size too much, but always apply literals for v_mad->v_madak
|
|
|
|
|
* because both instructions are 64-bit and this doesn't increase
|
|
|
|
|
* code size.
|
|
|
|
|
* TODO: try to apply the literals earlier to lower the number of
|
|
|
|
|
* uses below threshold
|
|
|
|
|
*/
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
if (literal_mask && (literal_uses < threshold || (literal_mask & 0b100))) {
|
|
|
|
|
u_foreach_bit (i, literal_mask)
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
mad_info->literal_mask = literal_mask;
|
2019-11-22 15:18:38 +00:00
|
|
|
return;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Mark SCC needed, so the uniform boolean transformation won't swap the definitions
|
|
|
|
|
* when it isn't beneficial */
|
|
|
|
|
if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() &&
|
|
|
|
|
instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) {
|
2020-01-16 19:32:31 +01:00
|
|
|
ctx.info[instr->operands[0].tempId()].set_scc_needed();
|
|
|
|
|
return;
|
|
|
|
|
} else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_cselect_b32) &&
|
|
|
|
|
instr->operands[2].isTemp()) {
|
|
|
|
|
ctx.info[instr->operands[2].tempId()].set_scc_needed();
|
2021-06-09 10:14:54 +02:00
|
|
|
} else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() &&
|
2020-02-05 12:14:00 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].is_scc_needed()) {
|
|
|
|
|
/* Propagate label so it is correctly detected by the uniform bool transform */
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].set_scc_needed();
|
|
|
|
|
|
|
|
|
|
/* Fix definition to SCC, this will prevent RA from adding superfluous moves */
|
|
|
|
|
instr->definitions[0].setFixed(scc);
|
2020-01-16 19:32:31 +01:00
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* check for literals */
|
2019-11-22 13:43:39 +00:00
|
|
|
if (!instr->isSALU() && !instr->isVALU())
|
|
|
|
|
return;
|
|
|
|
|
|
2020-01-16 19:32:31 +01:00
|
|
|
/* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
|
2021-06-09 10:14:54 +02:00
|
|
|
if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 &&
|
2020-01-16 19:32:31 +01:00
|
|
|
ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
|
|
|
|
|
bool transform_done = to_uniform_bool_instr(ctx, instr);
|
|
|
|
|
|
|
|
|
|
if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Swap the two definition IDs in order to avoid overusing the SCC.
|
|
|
|
|
* This reduces extra moves generated by RA. */
|
2020-01-16 19:32:31 +01:00
|
|
|
uint32_t def0_id = instr->definitions[0].getTemp().id();
|
|
|
|
|
uint32_t def1_id = instr->definitions[1].getTemp().id();
|
|
|
|
|
instr->definitions[0].setTemp(Temp(def1_id, s1));
|
|
|
|
|
instr->definitions[1].setTemp(Temp(def0_id, s1));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-24 12:25:23 +02:00
|
|
|
/* This optimization is done late in order to be able to apply otherwise
|
|
|
|
|
* unsafe optimizations such as the inverse comparison optimization.
|
|
|
|
|
*/
|
|
|
|
|
if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) {
|
|
|
|
|
if (instr->operands[0].isTemp() && fixed_to_exec(instr->operands[1]) &&
|
|
|
|
|
ctx.uses[instr->operands[0].tempId()] == 1 && ctx.uses[instr->definitions[1].tempId()] == 0 &&
|
|
|
|
|
can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), instr->pass_flags)) {
|
|
|
|
|
ctx.uses[instr->operands[0].tempId()]--;
|
|
|
|
|
ctx.info[instr->operands[0].tempId()].instr->definitions[0].setTemp(instr->definitions[0].getTemp());
|
|
|
|
|
instr.reset();
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-30 15:33:18 +01:00
|
|
|
/* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
|
|
|
|
|
if (instr->isVALU()) {
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
if (!instr->operands[i].isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
ssa_info info = ctx.info[instr->operands[i].tempId()];
|
|
|
|
|
|
|
|
|
|
aco_opcode swapped_op;
|
|
|
|
|
if (info.is_dpp() && info.instr->pass_flags == instr->pass_flags &&
|
2021-11-29 00:12:04 +09:00
|
|
|
(i == 0 || can_swap_operands(instr, &swapped_op)) &&
|
|
|
|
|
can_use_DPP(instr, true, info.is_dpp8()) && !instr->isDPP()) {
|
|
|
|
|
bool dpp8 = info.is_dpp8();
|
|
|
|
|
convert_to_DPP(instr, dpp8);
|
|
|
|
|
if (dpp8) {
|
|
|
|
|
DPP8_instruction* dpp = &instr->dpp8();
|
|
|
|
|
for (unsigned j = 0; j < 8; ++j)
|
|
|
|
|
dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j];
|
|
|
|
|
if (i) {
|
|
|
|
|
instr->opcode = swapped_op;
|
|
|
|
|
std::swap(instr->operands[0], instr->operands[1]);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
DPP16_instruction* dpp = &instr->dpp16();
|
|
|
|
|
if (i) {
|
|
|
|
|
instr->opcode = swapped_op;
|
|
|
|
|
std::swap(instr->operands[0], instr->operands[1]);
|
|
|
|
|
std::swap(dpp->neg[0], dpp->neg[1]);
|
|
|
|
|
std::swap(dpp->abs[0], dpp->abs[1]);
|
|
|
|
|
}
|
|
|
|
|
dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
|
|
|
|
|
dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
|
|
|
|
|
dpp->neg[0] ^= info.instr->dpp16().neg[0] && !dpp->abs[0];
|
|
|
|
|
dpp->abs[0] |= info.instr->dpp16().abs[0];
|
2020-06-30 15:33:18 +01:00
|
|
|
}
|
|
|
|
|
if (--ctx.uses[info.instr->definitions[0].tempId()])
|
|
|
|
|
ctx.uses[info.instr->operands[0].tempId()]++;
|
|
|
|
|
instr->operands[0].setTemp(info.instr->operands[0].getTemp());
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (instr->isSDWA() || (instr->isVOP3() && ctx.program->gfx_level < GFX10) ||
|
|
|
|
|
(instr->isVOP3P() && ctx.program->gfx_level < GFX10))
|
2019-11-22 13:43:39 +00:00
|
|
|
return; /* some encodings can't ever take literals */
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* we do not apply the literals yet as we don't know if it is profitable */
|
2019-11-22 13:43:39 +00:00
|
|
|
Operand current_literal(s1);
|
|
|
|
|
|
|
|
|
|
unsigned literal_id = 0;
|
|
|
|
|
unsigned literal_uses = UINT32_MAX;
|
|
|
|
|
Operand literal(s1);
|
2019-11-20 16:42:17 +00:00
|
|
|
unsigned num_operands = 1;
|
2020-09-04 12:35:54 +01:00
|
|
|
if (instr->isSALU() ||
|
2022-05-12 02:50:17 -04:00
|
|
|
(ctx.program->gfx_level >= GFX10 && (can_use_VOP3(ctx, instr) || instr->isVOP3P())))
|
2019-11-20 16:42:17 +00:00
|
|
|
num_operands = instr->operands.size();
|
2020-01-24 17:37:11 +00:00
|
|
|
/* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
|
|
|
|
|
else if (instr->isVALU() && instr->operands.size() >= 3)
|
|
|
|
|
return;
|
2019-11-22 13:43:39 +00:00
|
|
|
|
|
|
|
|
unsigned sgpr_ids[2] = {0, 0};
|
|
|
|
|
bool is_literal_sgpr = false;
|
|
|
|
|
uint32_t mask = 0;
|
|
|
|
|
|
|
|
|
|
/* choose a literal to apply */
|
|
|
|
|
for (unsigned i = 0; i < num_operands; i++) {
|
|
|
|
|
Operand op = instr->operands[i];
|
2020-05-15 16:28:03 +01:00
|
|
|
unsigned bits = get_operand_size(instr, i);
|
2020-01-23 20:03:40 +00:00
|
|
|
|
|
|
|
|
if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
|
|
|
|
|
op.tempId() != sgpr_ids[0])
|
|
|
|
|
sgpr_ids[!!sgpr_ids[0]] = op.tempId();
|
|
|
|
|
|
2019-11-22 13:43:39 +00:00
|
|
|
if (op.isLiteral()) {
|
|
|
|
|
current_literal = op;
|
|
|
|
|
continue;
|
2020-05-15 16:28:03 +01:00
|
|
|
} else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
|
2019-11-22 13:43:39 +00:00
|
|
|
continue;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-11-22 13:43:39 +00:00
|
|
|
|
2020-01-16 16:54:35 +01:00
|
|
|
if (!alu_can_accept_constant(instr->opcode, i))
|
2019-11-22 13:43:39 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (ctx.uses[op.tempId()] < literal_uses) {
|
|
|
|
|
is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
|
|
|
|
|
mask = 0;
|
2021-07-13 11:22:46 +02:00
|
|
|
literal = Operand::c32(ctx.info[op.tempId()].val);
|
2019-11-22 13:43:39 +00:00
|
|
|
literal_uses = ctx.uses[op.tempId()];
|
|
|
|
|
literal_id = op.tempId();
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-11-22 13:43:39 +00:00
|
|
|
|
|
|
|
|
mask |= (op.tempId() == literal_id) << i;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2019-11-22 13:43:39 +00:00
|
|
|
/* don't go over the constant bus limit */
|
2019-11-20 16:42:17 +00:00
|
|
|
bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_lshrrev_b64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_ashrrev_i64;
|
2019-11-22 13:43:39 +00:00
|
|
|
unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (ctx.program->gfx_level >= GFX10 && !is_shift64)
|
2019-11-20 16:42:17 +00:00
|
|
|
const_bus_limit = 2;
|
|
|
|
|
|
2019-11-22 13:43:39 +00:00
|
|
|
unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
|
|
|
|
|
if (num_sgprs == const_bus_limit && !is_literal_sgpr)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (literal_id && literal_uses < threshold &&
|
|
|
|
|
(current_literal.isUndefined() ||
|
|
|
|
|
(current_literal.size() == literal.size() &&
|
|
|
|
|
current_literal.constantValue() == literal.constantValue()))) {
|
|
|
|
|
/* mark the literal to be applied */
|
|
|
|
|
while (mask) {
|
|
|
|
|
unsigned i = u_bit_scan(&mask);
|
|
|
|
|
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
|
|
|
|
|
ctx.uses[instr->operands[i].tempId()]--;
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2022-04-17 22:14:30 +02:00
|
|
|
/* Translate a 32-bit s_cmp_* opcode into the s_cmpk_* opcode with the same
 * condition and signedness.
 *
 * Returns aco_opcode::num_opcodes for every opcode that is not one of the
 * twelve comparisons listed below.
 */
static aco_opcode
sopk_opcode_for_sopc(aco_opcode opcode)
{
   switch (opcode) {
   /* signed comparisons */
   case aco_opcode::s_cmp_eq_i32: return aco_opcode::s_cmpk_eq_i32;
   case aco_opcode::s_cmp_lg_i32: return aco_opcode::s_cmpk_lg_i32;
   case aco_opcode::s_cmp_gt_i32: return aco_opcode::s_cmpk_gt_i32;
   case aco_opcode::s_cmp_ge_i32: return aco_opcode::s_cmpk_ge_i32;
   case aco_opcode::s_cmp_lt_i32: return aco_opcode::s_cmpk_lt_i32;
   case aco_opcode::s_cmp_le_i32: return aco_opcode::s_cmpk_le_i32;
   /* unsigned comparisons */
   case aco_opcode::s_cmp_eq_u32: return aco_opcode::s_cmpk_eq_u32;
   case aco_opcode::s_cmp_lg_u32: return aco_opcode::s_cmpk_lg_u32;
   case aco_opcode::s_cmp_gt_u32: return aco_opcode::s_cmpk_gt_u32;
   case aco_opcode::s_cmp_ge_u32: return aco_opcode::s_cmpk_ge_u32;
   case aco_opcode::s_cmp_lt_u32: return aco_opcode::s_cmpk_lt_u32;
   case aco_opcode::s_cmp_le_u32: return aco_opcode::s_cmpk_le_u32;
   default: return aco_opcode::num_opcodes;
   }
}
|
|
|
|
|
|
|
|
|
|
static bool
|
|
|
|
|
sopc_is_signed(aco_opcode opcode)
|
|
|
|
|
{
|
|
|
|
|
#define SOPC(op) \
|
|
|
|
|
case aco_opcode::s_cmp_##op##_i32: return true; \
|
|
|
|
|
case aco_opcode::s_cmp_##op##_u32: return false;
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
SOPC(eq)
|
|
|
|
|
SOPC(lg)
|
|
|
|
|
SOPC(gt)
|
|
|
|
|
SOPC(ge)
|
|
|
|
|
SOPC(lt)
|
|
|
|
|
SOPC(le)
|
|
|
|
|
default: unreachable("Not a valid SOPC instruction.");
|
|
|
|
|
}
|
|
|
|
|
#undef SOPC
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Return the 32-bit s_cmp_* opcode to use after the two sources have been
 * exchanged: gt <-> lt and ge <-> le are mirrored, eq and lg map to
 * themselves. Signedness is kept.
 *
 * Returns aco_opcode::num_opcodes for any other opcode.
 */
static aco_opcode
sopc_32_swapped(aco_opcode opcode)
{
   switch (opcode) {
   /* symmetric conditions stay as they are */
   case aco_opcode::s_cmp_eq_i32: return aco_opcode::s_cmp_eq_i32;
   case aco_opcode::s_cmp_eq_u32: return aco_opcode::s_cmp_eq_u32;
   case aco_opcode::s_cmp_lg_i32: return aco_opcode::s_cmp_lg_i32;
   case aco_opcode::s_cmp_lg_u32: return aco_opcode::s_cmp_lg_u32;
   /* ordered conditions are mirrored */
   case aco_opcode::s_cmp_gt_i32: return aco_opcode::s_cmp_lt_i32;
   case aco_opcode::s_cmp_gt_u32: return aco_opcode::s_cmp_lt_u32;
   case aco_opcode::s_cmp_ge_i32: return aco_opcode::s_cmp_le_i32;
   case aco_opcode::s_cmp_ge_u32: return aco_opcode::s_cmp_le_u32;
   case aco_opcode::s_cmp_lt_i32: return aco_opcode::s_cmp_gt_i32;
   case aco_opcode::s_cmp_lt_u32: return aco_opcode::s_cmp_gt_u32;
   case aco_opcode::s_cmp_le_i32: return aco_opcode::s_cmp_ge_i32;
   case aco_opcode::s_cmp_le_u32: return aco_opcode::s_cmp_ge_u32;
   default: return aco_opcode::num_opcodes;
   }
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
try_convert_sopc_to_sopk(aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
if (sopk_opcode_for_sopc(instr->opcode) == aco_opcode::num_opcodes)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (instr->operands[0].isLiteral()) {
|
|
|
|
|
std::swap(instr->operands[0], instr->operands[1]);
|
2022-07-07 23:54:39 +02:00
|
|
|
instr->opcode = sopc_32_swapped(instr->opcode);
|
2022-04-17 22:14:30 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!instr->operands[1].isLiteral())
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (instr->operands[0].isFixed() && instr->operands[0].physReg() >= 128)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
uint32_t value = instr->operands[1].constantValue();
|
|
|
|
|
|
|
|
|
|
const uint32_t i16_mask = 0xffff8000u;
|
|
|
|
|
|
|
|
|
|
bool value_is_i16 = (value & i16_mask) == 0 || (value & i16_mask) == i16_mask;
|
|
|
|
|
bool value_is_u16 = !(value & 0xffff0000u);
|
|
|
|
|
|
|
|
|
|
if (!value_is_i16 && !value_is_u16)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (!value_is_i16 && sopc_is_signed(instr->opcode)) {
|
|
|
|
|
if (instr->opcode == aco_opcode::s_cmp_lg_i32)
|
|
|
|
|
instr->opcode = aco_opcode::s_cmp_lg_u32;
|
|
|
|
|
else if (instr->opcode == aco_opcode::s_cmp_eq_i32)
|
|
|
|
|
instr->opcode = aco_opcode::s_cmp_eq_u32;
|
|
|
|
|
else
|
|
|
|
|
return;
|
|
|
|
|
} else if (!value_is_u16 && !sopc_is_signed(instr->opcode)) {
|
|
|
|
|
if (instr->opcode == aco_opcode::s_cmp_lg_u32)
|
|
|
|
|
instr->opcode = aco_opcode::s_cmp_lg_i32;
|
|
|
|
|
else if (instr->opcode == aco_opcode::s_cmp_eq_u32)
|
|
|
|
|
instr->opcode = aco_opcode::s_cmp_eq_i32;
|
|
|
|
|
else
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static_assert(sizeof(SOPK_instruction) <= sizeof(SOPC_instruction),
|
|
|
|
|
"Invalid direct instruction cast.");
|
|
|
|
|
instr->format = Format::SOPK;
|
|
|
|
|
SOPK_instruction* instr_sopk = &instr->sopk();
|
|
|
|
|
|
|
|
|
|
instr_sopk->imm = instr_sopk->operands[1].constantValue() & 0xffff;
|
|
|
|
|
instr_sopk->opcode = sopk_opcode_for_sopc(instr_sopk->opcode);
|
|
|
|
|
instr_sopk->operands.pop_back();
|
|
|
|
|
}
|
|
|
|
|
|
2022-09-19 17:49:53 +02:00
|
|
|
static void
|
|
|
|
|
unswizzle_vop3p_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
|
|
|
|
/* This opt is only beneficial for v_pk_fma_f16 because we can use v_pk_fmac_f16 if the
|
|
|
|
|
* instruction doesn't use swizzles. */
|
|
|
|
|
if (instr->opcode != aco_opcode::v_pk_fma_f16)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
VOP3P_instruction& vop3p = instr->vop3p();
|
|
|
|
|
|
|
|
|
|
unsigned literal_swizzle = ~0u;
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
if (!instr->operands[i].isLiteral())
|
|
|
|
|
continue;
|
|
|
|
|
unsigned new_swizzle = ((vop3p.opsel_lo >> i) & 0x1) | (((vop3p.opsel_hi >> i) & 0x1) << 1);
|
|
|
|
|
if (literal_swizzle != ~0u && new_swizzle != literal_swizzle)
|
|
|
|
|
return; /* Literal swizzles conflict. */
|
|
|
|
|
literal_swizzle = new_swizzle;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (literal_swizzle == 0b10 || literal_swizzle == ~0u)
|
|
|
|
|
return; /* already unswizzled */
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
if (!instr->operands[i].isLiteral())
|
|
|
|
|
continue;
|
|
|
|
|
uint32_t literal = instr->operands[i].constantValue();
|
|
|
|
|
literal = (literal >> (16 * (literal_swizzle & 0x1)) & 0xffff) |
|
|
|
|
|
(literal >> (8 * (literal_swizzle & 0x2)) << 16);
|
|
|
|
|
instr->operands[i] = Operand::literal32(literal);
|
|
|
|
|
vop3p.opsel_lo &= ~(1 << i);
|
|
|
|
|
vop3p.opsel_hi |= (1 << i);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
/* Cleanup Dead Instructions */
|
|
|
|
|
if (!instr)
|
|
|
|
|
return;
|
|
|
|
|
|
2019-11-22 13:43:39 +00:00
|
|
|
/* apply literals on MAD */
|
2020-05-15 14:03:15 +01:00
|
|
|
if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
|
2020-06-01 11:27:53 +01:00
|
|
|
mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
const bool madak = (info->literal_mask & 0b100);
|
|
|
|
|
bool has_dead_literal = false;
|
|
|
|
|
u_foreach_bit (i, info->literal_mask)
|
|
|
|
|
has_dead_literal |= ctx.uses[instr->operands[i].tempId()] == 0;
|
|
|
|
|
if (has_dead_literal || madak) {
|
2019-11-22 13:43:39 +00:00
|
|
|
aco_ptr<Instruction> new_mad;
|
2020-05-15 14:03:15 +01:00
|
|
|
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
aco_opcode new_op = madak ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
|
2020-05-15 14:03:15 +01:00
|
|
|
if (instr->opcode == aco_opcode::v_fma_f32)
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
new_op = madak ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
|
2021-06-09 10:14:54 +02:00
|
|
|
else if (instr->opcode == aco_opcode::v_mad_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_mad_legacy_f16)
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
new_op = madak ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
|
2020-05-14 21:09:36 +01:00
|
|
|
else if (instr->opcode == aco_opcode::v_fma_f16)
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
new_op = madak ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
|
2020-05-15 14:03:15 +01:00
|
|
|
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
uint32_t literal = ctx.info[instr->operands[ffs(info->literal_mask) - 1].tempId()].val;
|
2020-05-15 14:03:15 +01:00
|
|
|
new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
for (unsigned i = 0; i < 3; i++) {
|
|
|
|
|
if (info->literal_mask & (1 << i))
|
|
|
|
|
new_mad->operands[i] = Operand::literal32(literal);
|
|
|
|
|
else
|
|
|
|
|
new_mad->operands[i] = instr->operands[i];
|
|
|
|
|
}
|
|
|
|
|
if (madak) { /* add literal -> madak */
|
2021-05-13 13:34:52 +01:00
|
|
|
if (!new_mad->operands[1].isTemp() ||
|
|
|
|
|
new_mad->operands[1].getTemp().type() == RegType::sgpr)
|
|
|
|
|
std::swap(new_mad->operands[0], new_mad->operands[1]);
|
2019-11-22 15:18:38 +00:00
|
|
|
} else { /* mul literal -> madmk */
|
aco: Use v_fmaak/v_fmamk if two operands are the same literal.
Foz-DB Navi21:
Totals from 5744 (4.26% of 134913) affected shaders:
VGPRs: 237128 -> 237056 (-0.03%); split: -0.04%, +0.01%
CodeSize: 16654484 -> 16620668 (-0.20%); split: -0.23%, +0.03%
MaxWaves: 152838 -> 152846 (+0.01%)
Instrs: 3063214 -> 3058572 (-0.15%); split: -0.17%, +0.02%
Latency: 23935195 -> 23934827 (-0.00%); split: -0.03%, +0.03%
InvThroughput: 5478562 -> 5478160 (-0.01%); split: -0.01%, +0.01%
VClause: 60432 -> 60435 (+0.00%); split: -0.02%, +0.03%
SClause: 121032 -> 120896 (-0.11%); split: -0.20%, +0.09%
Copies: 147865 -> 143144 (-3.19%); split: -3.59%, +0.40%
PreSGPRs: 195722 -> 195661 (-0.03%); split: -0.06%, +0.03%
PreVGPRs: 182849 -> 182787 (-0.03%)
Foz-DB Vega10:
Totals from 5290 (3.92% of 135041) affected shaders:
SGPRs: 357952 -> 359616 (+0.46%); split: -0.11%, +0.57%
VGPRs: 204048 -> 203928 (-0.06%); split: -0.08%, +0.02%
CodeSize: 14043176 -> 14003100 (-0.29%); split: -0.29%, +0.00%
MaxWaves: 39401 -> 39398 (-0.01%); split: +0.01%, -0.02%
Instrs: 2636739 -> 2631246 (-0.21%); split: -0.21%, +0.00%
Latency: 25264088 -> 25256482 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 12039643 -> 12039346 (-0.00%); split: -0.00%, +0.00%
VClause: 55603 -> 55584 (-0.03%); split: -0.04%, +0.00%
SClause: 101577 -> 101342 (-0.23%); split: -0.30%, +0.07%
Copies: 213344 -> 207929 (-2.54%); split: -2.58%, +0.05%
Branches: 34053 -> 34054 (+0.00%)
PreSGPRs: 172405 -> 172260 (-0.08%)
Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18645>
2022-09-17 20:52:24 +02:00
|
|
|
if (!(info->literal_mask & 0b10))
|
|
|
|
|
std::swap(new_mad->operands[0], new_mad->operands[1]);
|
|
|
|
|
std::swap(new_mad->operands[1], new_mad->operands[2]);
|
2019-11-22 13:43:39 +00:00
|
|
|
}
|
2019-11-22 15:18:38 +00:00
|
|
|
new_mad->definitions[0] = instr->definitions[0];
|
|
|
|
|
ctx.instructions.emplace_back(std::move(new_mad));
|
|
|
|
|
return;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-22 15:18:38 +00:00
|
|
|
/* apply literals on other SALU/VALU */
|
|
|
|
|
if (instr->isSALU() || instr->isVALU()) {
|
2019-11-22 13:43:39 +00:00
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
Operand op = instr->operands[i];
|
2020-05-15 16:28:03 +01:00
|
|
|
unsigned bits = get_operand_size(instr, i);
|
|
|
|
|
if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
|
2022-05-02 14:21:21 +01:00
|
|
|
Operand literal = Operand::literal32(ctx.info[op.tempId()].val);
|
2021-08-30 10:30:45 +01:00
|
|
|
instr->format = withoutDPP(instr->format);
|
2020-09-04 12:35:54 +01:00
|
|
|
if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
|
2019-11-22 13:43:39 +00:00
|
|
|
to_VOP3(ctx, instr);
|
|
|
|
|
instr->operands[i] = literal;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-17 22:14:30 +02:00
|
|
|
if (instr->isSOPC())
|
|
|
|
|
try_convert_sopc_to_sopk(instr);
|
|
|
|
|
|
2022-04-17 14:32:34 +02:00
|
|
|
/* allow more s_addk_i32 optimizations if carry isn't used */
|
|
|
|
|
if (instr->opcode == aco_opcode::s_add_u32 && ctx.uses[instr->definitions[1].tempId()] == 0 &&
|
|
|
|
|
(instr->operands[0].isLiteral() || instr->operands[1].isLiteral()))
|
|
|
|
|
instr->opcode = aco_opcode::s_add_i32;
|
|
|
|
|
|
2022-09-19 17:49:53 +02:00
|
|
|
if (instr->isVOP3P())
|
|
|
|
|
unswizzle_vop3p_literals(ctx, instr);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.instructions.emplace_back(std::move(instr));
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
optimize(Program* program)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
opt_ctx ctx;
|
|
|
|
|
ctx.program = program;
|
|
|
|
|
std::vector<ssa_info> info(program->peekAllocationId());
|
|
|
|
|
ctx.info = info.data();
|
|
|
|
|
|
|
|
|
|
/* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
|
|
|
|
|
for (Block& block : program->blocks) {
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
ctx.fp_mode = block.fp_mode;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (aco_ptr<Instruction>& instr : block.instructions)
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
label_instruction(ctx, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-05-22 12:52:05 +02:00
|
|
|
ctx.uses = dead_code_analysis(program);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
|
|
|
|
|
for (Block& block : program->blocks) {
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
ctx.fp_mode = block.fp_mode;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (aco_ptr<Instruction>& instr : block.instructions)
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
combine_instruction(ctx, instr);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
|
2021-06-09 10:14:54 +02:00
|
|
|
for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();
|
|
|
|
|
++block_rit) {
|
2020-11-03 14:40:05 +01:00
|
|
|
Block* block = &(*block_rit);
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
ctx.fp_mode = block->fp_mode;
|
2021-06-09 10:14:54 +02:00
|
|
|
for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend();
|
|
|
|
|
++instr_rit)
|
2020-11-03 14:40:05 +01:00
|
|
|
select_instruction(ctx, *instr_rit);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 4. Add literals to instructions */
|
|
|
|
|
for (Block& block : program->blocks) {
|
2022-08-17 00:18:54 +02:00
|
|
|
ctx.instructions.reserve(block.instructions.size());
|
aco: use -1.0*x and 1.0*|x| for fneg/fabs
Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.
Future versions of DXVK will require that 32-bit denormals are flushed.
fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%
fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%
fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%
fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
2020-06-25 11:44:26 +01:00
|
|
|
ctx.fp_mode = block.fp_mode;
|
2019-09-17 13:22:17 +02:00
|
|
|
for (aco_ptr<Instruction>& instr : block.instructions)
|
|
|
|
|
apply_literals(ctx, instr);
|
2022-08-17 00:18:54 +02:00
|
|
|
block.instructions = std::move(ctx.instructions);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
} // namespace aco
|