2020-06-03 11:27:55 +01:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2020 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
2021-06-09 15:40:03 +02:00
|
|
|
|
2020-06-03 11:27:55 +01:00
|
|
|
#include "aco_ir.h"
|
2021-06-09 15:40:03 +02:00
|
|
|
|
2021-07-14 17:11:44 +01:00
|
|
|
#include "aco_builder.h"
|
|
|
|
|
|
2020-07-21 21:48:06 +01:00
|
|
|
#include "util/debug.h"
|
2020-06-03 11:27:55 +01:00
|
|
|
|
2021-06-09 15:40:03 +02:00
|
|
|
#include "c11/threads.h"
|
|
|
|
|
|
2020-06-03 11:27:55 +01:00
|
|
|
namespace aco {
|
|
|
|
|
|
2020-01-22 19:57:20 +00:00
|
|
|
uint64_t debug_flags = 0;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
|
|
|
|
|
{"validatera", DEBUG_VALIDATE_RA},
|
|
|
|
|
{"perfwarn", DEBUG_PERFWARN},
|
|
|
|
|
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
|
|
|
|
|
{"novn", DEBUG_NO_VN},
|
|
|
|
|
{"noopt", DEBUG_NO_OPT},
|
|
|
|
|
{"nosched", DEBUG_NO_SCHED},
|
|
|
|
|
{"perfinfo", DEBUG_PERF_INFO},
|
|
|
|
|
{"liveinfo", DEBUG_LIVE_INFO},
|
|
|
|
|
{NULL, 0}};
|
2020-01-22 19:57:20 +00:00
|
|
|
|
|
|
|
|
static once_flag init_once_flag = ONCE_FLAG_INIT;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
static void
|
|
|
|
|
init_once()
|
2020-01-22 19:57:20 +00:00
|
|
|
{
|
|
|
|
|
debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
#ifndef NDEBUG
|
2020-01-22 19:57:20 +00:00
|
|
|
/* enable some flags by default on debug builds */
|
2020-08-18 08:14:06 +02:00
|
|
|
debug_flags |= aco::DEBUG_VALIDATE_IR;
|
2021-06-09 10:14:54 +02:00
|
|
|
#endif
|
2020-01-22 19:57:20 +00:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
init()
|
2020-01-22 19:57:20 +00:00
|
|
|
{
|
|
|
|
|
call_once(&init_once_flag, init_once);
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
2021-10-05 10:32:55 +02:00
|
|
|
init_program(Program* program, Stage stage, const struct radv_shader_info* info,
|
2021-06-09 10:14:54 +02:00
|
|
|
enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
|
|
|
|
|
ac_shader_config* config)
|
2020-01-22 19:57:20 +00:00
|
|
|
{
|
|
|
|
|
program->stage = stage;
|
|
|
|
|
program->config = config;
|
|
|
|
|
program->info = info;
|
|
|
|
|
program->chip_class = chip_class;
|
|
|
|
|
if (family == CHIP_UNKNOWN) {
|
|
|
|
|
switch (chip_class) {
|
2021-06-09 10:14:54 +02:00
|
|
|
case GFX6: program->family = CHIP_TAHITI; break;
|
|
|
|
|
case GFX7: program->family = CHIP_BONAIRE; break;
|
|
|
|
|
case GFX8: program->family = CHIP_POLARIS10; break;
|
|
|
|
|
case GFX9: program->family = CHIP_VEGA10; break;
|
|
|
|
|
case GFX10: program->family = CHIP_NAVI10; break;
|
|
|
|
|
default: program->family = CHIP_UNKNOWN; break;
|
2020-01-22 19:57:20 +00:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
program->family = family;
|
|
|
|
|
}
|
|
|
|
|
program->wave_size = info->wave_size;
|
|
|
|
|
program->lane_mask = program->wave_size == 32 ? s1 : s2;
|
|
|
|
|
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256;
|
2021-06-09 10:14:54 +02:00
|
|
|
program->dev.lds_alloc_granule =
|
|
|
|
|
chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
|
2020-01-22 19:57:20 +00:00
|
|
|
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
|
2020-01-22 19:57:20 +00:00
|
|
|
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.vgpr_limit = 256;
|
|
|
|
|
program->dev.physical_vgprs = 256;
|
|
|
|
|
program->dev.vgpr_alloc_granule = 4;
|
2020-01-22 19:57:20 +00:00
|
|
|
|
|
|
|
|
if (chip_class >= GFX10) {
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
|
|
|
|
|
program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
|
|
|
|
|
program->dev.sgpr_alloc_granule = 128;
|
2021-06-09 10:14:54 +02:00
|
|
|
program->dev.sgpr_limit =
|
|
|
|
|
108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
|
2020-06-18 14:31:13 +01:00
|
|
|
if (chip_class >= GFX10_3)
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
|
2020-06-18 14:31:13 +01:00
|
|
|
else
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
|
2020-01-22 19:57:20 +00:00
|
|
|
} else if (program->chip_class >= GFX8) {
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.physical_sgprs = 800;
|
|
|
|
|
program->dev.sgpr_alloc_granule = 16;
|
|
|
|
|
program->dev.sgpr_limit = 102;
|
2021-02-05 14:36:39 +01:00
|
|
|
if (family == CHIP_TONGA || family == CHIP_ICELAND)
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
|
2020-01-22 19:57:20 +00:00
|
|
|
} else {
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.physical_sgprs = 512;
|
|
|
|
|
program->dev.sgpr_alloc_granule = 8;
|
|
|
|
|
program->dev.sgpr_limit = 104;
|
2020-01-22 19:57:20 +00:00
|
|
|
}
|
|
|
|
|
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.max_wave64_per_simd = 10;
|
|
|
|
|
if (program->chip_class >= GFX10_3)
|
|
|
|
|
program->dev.max_wave64_per_simd = 16;
|
|
|
|
|
else if (program->chip_class == GFX10)
|
|
|
|
|
program->dev.max_wave64_per_simd = 20;
|
|
|
|
|
else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
|
|
|
|
|
program->dev.max_wave64_per_simd = 8;
|
|
|
|
|
|
|
|
|
|
program->dev.simd_per_cu = program->chip_class >= GFX10 ? 2 : 4;
|
|
|
|
|
|
|
|
|
|
switch (program->family) {
|
|
|
|
|
/* GFX8 APUs */
|
|
|
|
|
case CHIP_CARRIZO:
|
|
|
|
|
case CHIP_STONEY:
|
|
|
|
|
/* GFX9 APUS */
|
|
|
|
|
case CHIP_RAVEN:
|
|
|
|
|
case CHIP_RAVEN2:
|
2021-06-09 10:14:54 +02:00
|
|
|
case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
|
|
|
|
|
default: break;
|
2021-01-28 13:07:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
|
|
|
|
|
/* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
|
|
|
|
|
program->dev.has_fast_fma32 = program->chip_class >= GFX9;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
|
2021-01-28 13:07:11 +00:00
|
|
|
program->family == CHIP_HAWAII)
|
|
|
|
|
program->dev.has_fast_fma32 = true;
|
2021-04-27 12:11:37 +01:00
|
|
|
program->dev.has_mac_legacy32 = program->chip_class <= GFX7 || program->chip_class >= GFX10;
|
2021-01-28 13:07:11 +00:00
|
|
|
|
2022-01-27 14:00:38 +00:00
|
|
|
program->dev.fused_mad_mix = program->chip_class >= GFX10;
|
|
|
|
|
if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
|
|
|
|
|
program->family == CHIP_ARCTURUS || program->family == CHIP_ALDEBARAN)
|
|
|
|
|
program->dev.fused_mad_mix = true;
|
|
|
|
|
|
2021-02-01 15:14:01 +00:00
|
|
|
program->wgp_mode = wgp_mode;
|
2021-01-28 11:07:26 +00:00
|
|
|
|
2021-04-20 17:35:41 +01:00
|
|
|
program->progress = CompilationProgress::after_isel;
|
|
|
|
|
|
2020-01-22 19:57:20 +00:00
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
|
|
|
|
|
program->next_fp_mode.must_flush_denorms32 = false;
|
|
|
|
|
program->next_fp_mode.must_flush_denorms16_64 = false;
|
|
|
|
|
program->next_fp_mode.care_about_round32 = false;
|
|
|
|
|
program->next_fp_mode.care_about_round16_64 = false;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.denorm32 = 0;
|
|
|
|
|
program->next_fp_mode.round16_64 = fp_round_ne;
|
|
|
|
|
program->next_fp_mode.round32 = fp_round_ne;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Returns the `sync` member of a memory instruction (SMEM, MUBUF, MIMG, MTBUF,
 * FLAT/GLOBAL/SCRATCH or DS). Any other format yields a default-constructed
 * memory_sync_info.
 */
memory_sync_info
get_sync_info(const Instruction* instr)
{
   const Format fmt = instr->format;

   if (fmt == Format::SMEM)
      return instr->smem().sync;
   if (fmt == Format::MUBUF)
      return instr->mubuf().sync;
   if (fmt == Format::MIMG)
      return instr->mimg().sync;
   if (fmt == Format::MTBUF)
      return instr->mtbuf().sync;
   if (fmt == Format::FLAT || fmt == Format::GLOBAL || fmt == Format::SCRATCH)
      return instr->flatlike().sync;
   if (fmt == Format::DS)
      return instr->ds().sync;

   return memory_sync_info();
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra)
|
2020-05-11 17:49:40 +01:00
|
|
|
{
|
|
|
|
|
if (!instr->isVALU())
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-12-20 10:46:43 +01:00
|
|
|
if (chip < GFX8 || instr->isDPP() || instr->isVOP3P())
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->isSDWA())
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (instr->isVOP3()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& vop3 = instr->vop3();
|
2020-05-11 17:49:40 +01:00
|
|
|
if (instr->format == Format::VOP3)
|
|
|
|
|
return false;
|
aco: use VOPC_SDWA on GFX9+
Totals from 5138 (3.42% of 150170) affected shaders: (GFX10.3)
VGPRs: 409520 -> 409416 (-0.03%); split: -0.03%, +0.00%
CodeSize: 43056360 -> 43035696 (-0.05%); split: -0.06%, +0.02%
MaxWaves: 69296 -> 69310 (+0.02%)
Instrs: 8161016 -> 8153365 (-0.09%); split: -0.10%, +0.01%
Latency: 109397002 -> 109756208 (+0.33%); split: -0.05%, +0.38%
InvThroughput: 23238920 -> 23310761 (+0.31%); split: -0.11%, +0.42%
VClause: 135141 -> 135100 (-0.03%); split: -0.05%, +0.02%
SClause: 349511 -> 349489 (-0.01%); split: -0.01%, +0.00%
Copies: 388107 -> 387754 (-0.09%); split: -0.48%, +0.38%
Branches: 184629 -> 184503 (-0.07%); split: -0.08%, +0.01%
PreSGPRs: 258807 -> 258839 (+0.01%)
PreVGPRs: 372561 -> 372184 (-0.10%); split: -0.10%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12364>
2021-07-07 11:37:49 +02:00
|
|
|
if (vop3.clamp && instr->isVOPC() && chip != GFX8)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
2021-01-21 16:13:34 +00:00
|
|
|
if (vop3.omod && chip < GFX9)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
// TODO: return true if we know we will use vcc
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (!pre_ra && instr->definitions.size() >= 2)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 1; i < instr->operands.size(); i++) {
|
|
|
|
|
if (instr->operands[i].isLiteral())
|
|
|
|
|
return false;
|
|
|
|
|
if (chip < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
aco: use VOPC_SDWA on GFX9+
Totals from 5138 (3.42% of 150170) affected shaders: (GFX10.3)
VGPRs: 409520 -> 409416 (-0.03%); split: -0.03%, +0.00%
CodeSize: 43056360 -> 43035696 (-0.05%); split: -0.06%, +0.02%
MaxWaves: 69296 -> 69310 (+0.02%)
Instrs: 8161016 -> 8153365 (-0.09%); split: -0.10%, +0.01%
Latency: 109397002 -> 109756208 (+0.33%); split: -0.05%, +0.38%
InvThroughput: 23238920 -> 23310761 (+0.31%); split: -0.11%, +0.42%
VClause: 135141 -> 135100 (-0.03%); split: -0.05%, +0.02%
SClause: 349511 -> 349489 (-0.01%); split: -0.01%, +0.00%
Copies: 388107 -> 387754 (-0.09%); split: -0.48%, +0.38%
Branches: 184629 -> 184503 (-0.07%); split: -0.08%, +0.01%
PreSGPRs: 258807 -> 258839 (+0.01%)
PreVGPRs: 372561 -> 372184 (-0.10%); split: -0.10%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12364>
2021-07-07 11:37:49 +02:00
|
|
|
if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
|
2021-06-07 16:56:45 +01:00
|
|
|
return false;
|
|
|
|
|
|
2020-05-11 17:49:40 +01:00
|
|
|
if (!instr->operands.empty()) {
|
|
|
|
|
if (instr->operands[0].isLiteral())
|
|
|
|
|
return false;
|
|
|
|
|
if (chip < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
|
|
|
|
|
return false;
|
2021-06-07 16:56:45 +01:00
|
|
|
if (instr->operands[0].bytes() > 4)
|
|
|
|
|
return false;
|
|
|
|
|
if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
|
|
|
|
|
return false;
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;
|
2020-05-11 17:49:40 +01:00
|
|
|
|
|
|
|
|
if (chip != GFX8 && is_mac)
|
|
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
// TODO: return true if we know we will use vcc
|
aco: use VOPC_SDWA on GFX9+
Totals from 5138 (3.42% of 150170) affected shaders: (GFX10.3)
VGPRs: 409520 -> 409416 (-0.03%); split: -0.03%, +0.00%
CodeSize: 43056360 -> 43035696 (-0.05%); split: -0.06%, +0.02%
MaxWaves: 69296 -> 69310 (+0.02%)
Instrs: 8161016 -> 8153365 (-0.09%); split: -0.10%, +0.01%
Latency: 109397002 -> 109756208 (+0.33%); split: -0.05%, +0.38%
InvThroughput: 23238920 -> 23310761 (+0.31%); split: -0.11%, +0.42%
VClause: 135141 -> 135100 (-0.03%); split: -0.05%, +0.02%
SClause: 349511 -> 349489 (-0.01%); split: -0.01%, +0.00%
Copies: 388107 -> 387754 (-0.09%); split: -0.48%, +0.38%
Branches: 184629 -> 184503 (-0.07%); split: -0.08%, +0.01%
PreSGPRs: 258807 -> 258839 (+0.01%)
PreVGPRs: 372561 -> 372184 (-0.10%); split: -0.10%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12364>
2021-07-07 11:37:49 +02:00
|
|
|
if (!pre_ra && instr->isVOPC() && chip == GFX8)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
|
2020-05-11 17:49:40 +01:00
|
|
|
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
2021-06-09 10:14:54 +02:00
|
|
|
instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<Instruction>
|
|
|
|
|
convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr)
|
2020-05-11 17:49:40 +01:00
|
|
|
{
|
|
|
|
|
if (instr->isSDWA())
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
aco_ptr<Instruction> tmp = std::move(instr);
|
2021-06-09 10:14:54 +02:00
|
|
|
Format format =
|
|
|
|
|
(Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
|
|
|
|
|
instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
|
|
|
|
|
tmp->definitions.size()));
|
2020-05-11 17:49:40 +01:00
|
|
|
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
|
|
|
|
|
std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
|
|
|
|
|
|
2021-01-21 16:13:34 +00:00
|
|
|
SDWA_instruction& sdwa = instr->sdwa();
|
2020-05-11 17:49:40 +01:00
|
|
|
|
|
|
|
|
if (tmp->isVOP3()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
VOP3_instruction& vop3 = tmp->vop3();
|
|
|
|
|
memcpy(sdwa.neg, vop3.neg, sizeof(sdwa.neg));
|
|
|
|
|
memcpy(sdwa.abs, vop3.abs, sizeof(sdwa.abs));
|
|
|
|
|
sdwa.omod = vop3.omod;
|
|
|
|
|
sdwa.clamp = vop3.clamp;
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
2020-08-22 20:45:54 +02:00
|
|
|
/* SDWA only uses operands 0 and 1. */
|
|
|
|
|
if (i >= 2)
|
|
|
|
|
break;
|
|
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);
|
|
|
|
|
|
2020-05-11 17:49:40 +01:00
|
|
|
if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8)
|
|
|
|
|
instr->definitions[0].setFixed(vcc);
|
|
|
|
|
if (instr->definitions.size() >= 2)
|
|
|
|
|
instr->definitions[1].setFixed(vcc);
|
|
|
|
|
if (instr->operands.size() >= 3)
|
|
|
|
|
instr->operands[2].setFixed(vcc);
|
|
|
|
|
|
2022-01-28 14:49:50 +00:00
|
|
|
instr->pass_flags = tmp->pass_flags;
|
|
|
|
|
|
2020-05-11 17:49:40 +01:00
|
|
|
return tmp;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 17:11:44 +01:00
|
|
|
bool
|
2021-11-29 00:12:04 +09:00
|
|
|
can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8)
|
2021-07-14 17:11:44 +01:00
|
|
|
{
|
|
|
|
|
assert(instr->isVALU() && !instr->operands.empty());
|
|
|
|
|
|
|
|
|
|
if (instr->isDPP())
|
2021-11-29 00:12:04 +09:00
|
|
|
return instr->isDPP8() == dpp8;
|
2021-07-14 17:11:44 +01:00
|
|
|
|
|
|
|
|
if (instr->operands.size() && instr->operands[0].isLiteral())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->isSDWA())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) &&
|
|
|
|
|
instr->definitions.back().physReg() != vcc)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->isVOP3()) {
|
|
|
|
|
const VOP3_instruction* vop3 = &instr->vop3();
|
|
|
|
|
if (vop3->clamp || vop3->omod || vop3->opsel)
|
|
|
|
|
return false;
|
2021-11-29 00:12:04 +09:00
|
|
|
if (dpp8)
|
|
|
|
|
return false;
|
2021-07-14 17:11:44 +01:00
|
|
|
if (instr->format == Format::VOP3)
|
|
|
|
|
return false;
|
2021-11-29 16:34:15 +00:00
|
|
|
if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr))
|
|
|
|
|
return false;
|
2021-07-14 17:11:44 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* there are more cases but those all take 64-bit inputs */
|
|
|
|
|
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_cvt_f64_i32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Replaces "instr" with a DPP-encoded copy of itself (DPP8 if `dpp8` is set,
 * DPP16 otherwise) and returns the previous instruction. If "instr" is already
 * DPP nothing is changed and a null pointer is returned.
 *
 * The new instruction starts out with an identity lane selection.
 */
aco_ptr<Instruction>
convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return nullptr;

   aco_ptr<Instruction> old_instr = std::move(instr);

   /* Same format as before, with the VOP3 bit dropped and the DPP bit added. */
   const uint32_t dpp_bit = dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16;
   const Format dpp_format =
      (Format)(((uint32_t)old_instr->format & ~(uint32_t)Format::VOP3) | dpp_bit);

   const unsigned num_operands = old_instr->operands.size();
   const unsigned num_definitions = old_instr->definitions.size();

   if (dpp8) {
      instr.reset(create_instruction<DPP8_instruction>(old_instr->opcode, dpp_format,
                                                       num_operands, num_definitions));

      /* Every lane reads from itself. */
      DPP8_instruction& dpp = instr->dpp8();
      for (unsigned lane = 0; lane < 8; lane++)
         dpp.lane_sel[lane] = lane;
   } else {
      instr.reset(create_instruction<DPP16_instruction>(old_instr->opcode, dpp_format,
                                                        num_operands, num_definitions));

      DPP16_instruction& dpp = instr->dpp16();
      dpp.dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp.row_mask = 0xf;
      dpp.bank_mask = 0xf;

      /* Carry the VOP3 input modifiers over. */
      if (old_instr->isVOP3()) {
         const VOP3_instruction& vop3 = old_instr->vop3();
         memcpy(dpp.neg, vop3.neg, sizeof(dpp.neg));
         memcpy(dpp.abs, vop3.abs, sizeof(dpp.abs));
      }
   }

   std::copy(old_instr->operands.cbegin(), old_instr->operands.cend(), instr->operands.begin());
   std::copy(old_instr->definitions.cbegin(), old_instr->definitions.cend(),
             instr->definitions.begin());

   /* Pin the last definition of VOPC / multi-definition instructions and any
    * third operand to VCC.
    */
   if (instr->isVOPC() || instr->definitions.size() > 1)
      instr->definitions.back().setFixed(vcc);

   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = old_instr->pass_flags;

   return old_instr;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
|
2020-06-03 11:27:55 +01:00
|
|
|
{
|
|
|
|
|
/* opsel is only GFX9+ */
|
|
|
|
|
if ((high || idx == -1) && chip < GFX9)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
case aco_opcode::v_div_fixup_f16:
|
|
|
|
|
case aco_opcode::v_fma_f16:
|
|
|
|
|
case aco_opcode::v_mad_f16:
|
|
|
|
|
case aco_opcode::v_mad_u16:
|
|
|
|
|
case aco_opcode::v_mad_i16:
|
|
|
|
|
case aco_opcode::v_med3_f16:
|
|
|
|
|
case aco_opcode::v_med3_i16:
|
|
|
|
|
case aco_opcode::v_med3_u16:
|
|
|
|
|
case aco_opcode::v_min3_f16:
|
|
|
|
|
case aco_opcode::v_min3_i16:
|
|
|
|
|
case aco_opcode::v_min3_u16:
|
|
|
|
|
case aco_opcode::v_max3_f16:
|
|
|
|
|
case aco_opcode::v_max3_i16:
|
|
|
|
|
case aco_opcode::v_max3_u16:
|
|
|
|
|
case aco_opcode::v_max_u16_e64:
|
|
|
|
|
case aco_opcode::v_max_i16_e64:
|
|
|
|
|
case aco_opcode::v_min_u16_e64:
|
|
|
|
|
case aco_opcode::v_min_i16_e64:
|
|
|
|
|
case aco_opcode::v_add_i16:
|
|
|
|
|
case aco_opcode::v_sub_i16:
|
|
|
|
|
case aco_opcode::v_add_u16_e64:
|
|
|
|
|
case aco_opcode::v_sub_u16_e64:
|
|
|
|
|
case aco_opcode::v_lshlrev_b16_e64:
|
|
|
|
|
case aco_opcode::v_lshrrev_b16_e64:
|
|
|
|
|
case aco_opcode::v_ashrrev_i16_e64:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::v_mul_lo_u16_e64: return true;
|
2020-06-03 11:27:55 +01:00
|
|
|
case aco_opcode::v_pack_b32_f16:
|
2020-08-17 11:36:24 +01:00
|
|
|
case aco_opcode::v_cvt_pknorm_i16_f16:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
|
2020-06-03 11:27:55 +01:00
|
|
|
case aco_opcode::v_mad_u32_u16:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
|
|
|
|
|
default: return false;
|
2020-06-03 11:27:55 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-18 18:56:59 +02:00
|
|
|
bool
|
|
|
|
|
instr_is_16bit(chip_class chip, aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
/* partial register writes are GFX9+, only */
|
|
|
|
|
if (chip < GFX9)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
/* VOP3 */
|
|
|
|
|
case aco_opcode::v_mad_f16:
|
|
|
|
|
case aco_opcode::v_mad_u16:
|
|
|
|
|
case aco_opcode::v_mad_i16:
|
|
|
|
|
case aco_opcode::v_fma_f16:
|
|
|
|
|
case aco_opcode::v_div_fixup_f16:
|
|
|
|
|
case aco_opcode::v_interp_p2_f16:
|
|
|
|
|
case aco_opcode::v_fma_mixlo_f16:
|
|
|
|
|
/* VOP2 */
|
|
|
|
|
case aco_opcode::v_mac_f16:
|
|
|
|
|
case aco_opcode::v_madak_f16:
|
|
|
|
|
case aco_opcode::v_madmk_f16: return chip >= GFX9;
|
|
|
|
|
case aco_opcode::v_add_f16:
|
|
|
|
|
case aco_opcode::v_sub_f16:
|
|
|
|
|
case aco_opcode::v_subrev_f16:
|
|
|
|
|
case aco_opcode::v_mul_f16:
|
|
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_ldexp_f16:
|
|
|
|
|
case aco_opcode::v_fmac_f16:
|
|
|
|
|
case aco_opcode::v_fmamk_f16:
|
|
|
|
|
case aco_opcode::v_fmaak_f16:
|
|
|
|
|
/* VOP1 */
|
|
|
|
|
case aco_opcode::v_cvt_f16_f32:
|
|
|
|
|
case aco_opcode::v_cvt_f16_u16:
|
|
|
|
|
case aco_opcode::v_cvt_f16_i16:
|
|
|
|
|
case aco_opcode::v_rcp_f16:
|
|
|
|
|
case aco_opcode::v_sqrt_f16:
|
|
|
|
|
case aco_opcode::v_rsq_f16:
|
|
|
|
|
case aco_opcode::v_log_f16:
|
|
|
|
|
case aco_opcode::v_exp_f16:
|
|
|
|
|
case aco_opcode::v_frexp_mant_f16:
|
|
|
|
|
case aco_opcode::v_frexp_exp_i16_f16:
|
|
|
|
|
case aco_opcode::v_floor_f16:
|
|
|
|
|
case aco_opcode::v_ceil_f16:
|
|
|
|
|
case aco_opcode::v_trunc_f16:
|
|
|
|
|
case aco_opcode::v_rndne_f16:
|
|
|
|
|
case aco_opcode::v_fract_f16:
|
|
|
|
|
case aco_opcode::v_sin_f16:
|
|
|
|
|
case aco_opcode::v_cos_f16: return chip >= GFX10;
|
|
|
|
|
// TODO: confirm whether these write 16 or 32 bit on GFX10+
|
|
|
|
|
// case aco_opcode::v_cvt_u16_f16:
|
|
|
|
|
// case aco_opcode::v_cvt_i16_f16:
|
|
|
|
|
// case aco_opcode::p_cvt_f16_f32_rtne:
|
|
|
|
|
// case aco_opcode::v_cvt_norm_i16_f16:
|
|
|
|
|
// case aco_opcode::v_cvt_norm_u16_f16:
|
|
|
|
|
/* on GFX10, all opsel instructions preserve the high bits */
|
|
|
|
|
default: return chip >= GFX10 && can_use_opsel(chip, op, -1, false);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
uint32_t
|
|
|
|
|
get_reduction_identity(ReduceOp op, unsigned idx)
|
2020-09-01 16:30:06 +01:00
|
|
|
{
|
|
|
|
|
switch (op) {
|
|
|
|
|
case iadd8:
|
|
|
|
|
case iadd16:
|
|
|
|
|
case iadd32:
|
|
|
|
|
case iadd64:
|
|
|
|
|
case fadd16:
|
|
|
|
|
case fadd32:
|
|
|
|
|
case fadd64:
|
|
|
|
|
case ior8:
|
|
|
|
|
case ior16:
|
|
|
|
|
case ior32:
|
|
|
|
|
case ior64:
|
|
|
|
|
case ixor8:
|
|
|
|
|
case ixor16:
|
|
|
|
|
case ixor32:
|
|
|
|
|
case ixor64:
|
|
|
|
|
case umax8:
|
|
|
|
|
case umax16:
|
|
|
|
|
case umax32:
|
2021-06-09 10:14:54 +02:00
|
|
|
case umax64: return 0;
|
2020-09-01 16:30:06 +01:00
|
|
|
case imul8:
|
|
|
|
|
case imul16:
|
|
|
|
|
case imul32:
|
2021-06-09 10:14:54 +02:00
|
|
|
case imul64: return idx ? 0 : 1;
|
|
|
|
|
case fmul16: return 0x3c00u; /* 1.0 */
|
|
|
|
|
case fmul32: return 0x3f800000u; /* 1.0 */
|
|
|
|
|
case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
|
|
|
|
|
case imin8: return INT8_MAX;
|
|
|
|
|
case imin16: return INT16_MAX;
|
|
|
|
|
case imin32: return INT32_MAX;
|
|
|
|
|
case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
|
|
|
|
|
case imax8: return INT8_MIN;
|
|
|
|
|
case imax16: return INT16_MIN;
|
|
|
|
|
case imax32: return INT32_MIN;
|
|
|
|
|
case imax64: return idx ? 0x80000000u : 0;
|
2020-09-01 16:30:06 +01:00
|
|
|
case umin8:
|
|
|
|
|
case umin16:
|
|
|
|
|
case iand8:
|
2021-06-09 10:14:54 +02:00
|
|
|
case iand16: return 0xffffffffu;
|
2020-09-01 16:30:06 +01:00
|
|
|
case umin32:
|
|
|
|
|
case umin64:
|
|
|
|
|
case iand32:
|
2021-06-09 10:14:54 +02:00
|
|
|
case iand64: return 0xffffffffu;
|
|
|
|
|
case fmin16: return 0x7c00u; /* infinity */
|
|
|
|
|
case fmin32: return 0x7f800000u; /* infinity */
|
|
|
|
|
case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
|
|
|
|
|
case fmax16: return 0xfc00u; /* negative infinity */
|
|
|
|
|
case fmax32: return 0xff800000u; /* negative infinity */
|
|
|
|
|
case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
|
|
|
|
|
default: unreachable("Invalid reduction operation"); break;
|
2020-09-01 16:30:06 +01:00
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
needs_exec_mask(const Instruction* instr)
|
|
|
|
|
{
|
2021-11-12 13:46:17 +00:00
|
|
|
if (instr->isVALU()) {
|
|
|
|
|
return instr->opcode != aco_opcode::v_readlane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32_e64;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->isVMEM() || instr->isFlatLike())
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
|
2020-08-12 16:58:35 +02:00
|
|
|
return instr->reads_exec();
|
|
|
|
|
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isPseudo()) {
|
2020-08-12 16:58:35 +02:00
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::p_create_vector:
|
|
|
|
|
case aco_opcode::p_extract_vector:
|
|
|
|
|
case aco_opcode::p_split_vector:
|
2021-05-07 09:37:59 +02:00
|
|
|
case aco_opcode::p_phi:
|
|
|
|
|
case aco_opcode::p_parallelcopy:
|
2020-08-12 16:58:35 +02:00
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
if (def.getTemp().type() == RegType::vgpr)
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2021-11-12 13:46:17 +00:00
|
|
|
return instr->reads_exec();
|
2020-08-12 16:58:35 +02:00
|
|
|
case aco_opcode::p_spill:
|
|
|
|
|
case aco_opcode::p_reload:
|
aco/insert_exec_mask: stay in WQM while helper lanes are still needed
This patch flags all instructions WQM which don't require
Exact mode, but depend on the exec mask as long as WQM
is needed on any control flow path afterwards.
This will mostly prevent accidental copies of WQM values
within Exact mode, and also makes a lot of other workarounds
unnecessary.
Totals from 17374 (12.88% of 134913) affected shaders: (GFX10.3)
VGPRs: 526952 -> 527384 (+0.08%); split: -0.01%, +0.09%
CodeSize: 33740512 -> 33766636 (+0.08%); split: -0.06%, +0.14%
MaxWaves: 488166 -> 488108 (-0.01%); split: +0.00%, -0.02%
Instrs: 6254240 -> 6260557 (+0.10%); split: -0.08%, +0.18%
Latency: 66497580 -> 66463472 (-0.05%); split: -0.15%, +0.10%
InvThroughput: 13265741 -> 13264036 (-0.01%); split: -0.03%, +0.01%
VClause: 122962 -> 122975 (+0.01%); split: -0.01%, +0.02%
SClause: 334805 -> 334405 (-0.12%); split: -0.51%, +0.39%
Copies: 275728 -> 282341 (+2.40%); split: -0.91%, +3.31%
Branches: 92546 -> 90990 (-1.68%); split: -1.68%, +0.00%
PreSGPRs: 504119 -> 504352 (+0.05%); split: -0.00%, +0.05%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14951>
2022-01-24 19:43:49 +01:00
|
|
|
case aco_opcode::p_end_linear_vgpr:
|
2021-05-07 09:37:59 +02:00
|
|
|
case aco_opcode::p_logical_start:
|
|
|
|
|
case aco_opcode::p_logical_end:
|
2021-11-12 13:46:17 +00:00
|
|
|
case aco_opcode::p_startpgm: return instr->reads_exec();
|
2021-06-09 10:14:54 +02:00
|
|
|
default: break;
|
2020-08-12 16:58:35 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 17:22:02 +01:00
|
|
|
struct CmpInfo {
|
|
|
|
|
aco_opcode ordered;
|
|
|
|
|
aco_opcode unordered;
|
|
|
|
|
aco_opcode ordered_swapped;
|
|
|
|
|
aco_opcode unordered_swapped;
|
|
|
|
|
aco_opcode inverse;
|
|
|
|
|
aco_opcode f32;
|
|
|
|
|
unsigned size;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/**
 * Looks up the opcodes related to a floating-point VOPC comparison.
 *
 * \param op    opcode to classify
 * \param info  output record; always fully written
 *
 * When \p op is one of the v_cmp_{lt,eq,le,gt,lg,ge}_f{16,32,64} opcodes or one
 * of their "n"-prefixed counterparts, all fields of \p info are filled in:
 *  - ordered / unordered: the two opcodes of the relation (e.g. lt / nge),
 *  - ordered_swapped / unordered_swapped: the opcodes yielding the same result
 *    after the two source operands have been exchanged,
 *  - inverse: the opcode computing the logical negation of \p op,
 *  - f32: the 32-bit variant of \p op,
 *  - size: operand size in bits.
 *
 * For v_cmp_u_f* and v_cmp_o_f* only inverse, f32 and size are set; the
 * ordered/unordered fields stay at aco_opcode::num_opcodes.
 *
 * \return true if \p op was recognised. Otherwise false, with every opcode
 *         field set to aco_opcode::num_opcodes and size set to 0.
 */
ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->ordered_swapped = aco_opcode::num_opcodes;
   info->unordered_swapped = aco_opcode::num_opcodes;
   /* Also give the remaining fields defined values so that a caller which
    * ignores the return value never reads indeterminate data. */
   info->inverse = aco_opcode::num_opcodes;
   info->f32 = aco_opcode::num_opcodes;
   info->size = 0;
   switch (op) {
      // clang-format off
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz;                                             \
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;                                        \
      info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;                                \
      info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;                           \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32            \
                                                        : aco_opcode::v_cmp_n##unord##_f32;        \
      info->size = sz;                                                                             \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      /* Columns: ordered, unordered (without the "n"), ordered after operand
       * swap, unordered after operand swap (without the "n").
       * Swapping operands mirrors the relation: lt <-> gt and le <-> ge,
       * while eq and lg are symmetric. */
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      /* nle(a, b) == !(a <= b) == !(b >= a) == nge(b, a) */
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_u_f32;                                                         \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->size = sz;                                                                             \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_o_f32;                                                         \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->size = sz;                                                                             \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
      // clang-format on
   default: return false;
   }
}
|
|
|
|
|
|
|
|
|
|
/* Returns the `ordered` opcode that get_cmp_info() reports for op.
 * Yields aco_opcode::num_opcodes when op is not recognised, and also for
 * v_cmp_u / v_cmp_o, which have no ordered entry.
 */
aco_opcode
get_ordered(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.ordered;
}
|
|
|
|
|
|
|
|
|
|
/* Returns the `unordered` opcode that get_cmp_info() reports for op.
 * Yields aco_opcode::num_opcodes when op is not recognised, and also for
 * v_cmp_u / v_cmp_o, which have no unordered entry.
 */
aco_opcode
get_unordered(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.unordered;
}
|
|
|
|
|
|
|
|
|
|
/* Returns the opcode computing the logical negation of the comparison op,
 * or aco_opcode::num_opcodes when get_cmp_info() does not recognise op.
 */
aco_opcode
get_inverse(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.inverse;
}
|
|
|
|
|
|
|
|
|
|
/* Returns the 32-bit variant of the comparison op, or
 * aco_opcode::num_opcodes when get_cmp_info() does not recognise op.
 */
aco_opcode
get_f32_cmp(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.f32;
}
|
|
|
|
|
|
|
|
|
|
unsigned
|
|
|
|
|
get_cmp_bitsize(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) ? info.size : 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
is_cmp(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op)
|
|
|
|
|
{
|
|
|
|
|
if (instr->isDPP())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->operands[0].isConstant() ||
|
|
|
|
|
(instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::v_add_u32:
|
|
|
|
|
case aco_opcode::v_add_co_u32:
|
|
|
|
|
case aco_opcode::v_add_co_u32_e64:
|
|
|
|
|
case aco_opcode::v_add_i32:
|
|
|
|
|
case aco_opcode::v_add_f16:
|
|
|
|
|
case aco_opcode::v_add_f32:
|
|
|
|
|
case aco_opcode::v_mul_f16:
|
|
|
|
|
case aco_opcode::v_mul_f32:
|
|
|
|
|
case aco_opcode::v_or_b32:
|
|
|
|
|
case aco_opcode::v_and_b32:
|
|
|
|
|
case aco_opcode::v_xor_b32:
|
|
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_max_f32:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_min_f32:
|
|
|
|
|
case aco_opcode::v_max_i32:
|
|
|
|
|
case aco_opcode::v_min_i32:
|
|
|
|
|
case aco_opcode::v_max_u32:
|
|
|
|
|
case aco_opcode::v_min_u32:
|
|
|
|
|
case aco_opcode::v_max_i16:
|
|
|
|
|
case aco_opcode::v_min_i16:
|
|
|
|
|
case aco_opcode::v_max_u16:
|
|
|
|
|
case aco_opcode::v_min_u16:
|
|
|
|
|
case aco_opcode::v_max_i16_e64:
|
|
|
|
|
case aco_opcode::v_min_i16_e64:
|
|
|
|
|
case aco_opcode::v_max_u16_e64:
|
|
|
|
|
case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
|
|
|
|
|
case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
|
|
|
|
|
case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
|
|
|
|
|
case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
|
|
|
|
|
case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
|
|
|
|
|
case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
|
|
|
|
|
default: {
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
get_cmp_info(instr->opcode, &info);
|
|
|
|
|
if (info.ordered == instr->opcode) {
|
|
|
|
|
*new_op = info.ordered_swapped;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
if (info.unordered == instr->opcode) {
|
|
|
|
|
*new_op = info.unordered_swapped;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Default constructor: every counter starts out as unset_counter. */
wait_imm::wait_imm() : wait_imm(unset_counter, unset_counter, unset_counter, unset_counter) {}
|
|
|
|
|
/* Constructs a wait_imm from explicit values for the four counters. */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : vm(vm_),
      exp(exp_),
      lgkm(lgkm_),
      vs(vs_)
{}
|
2021-01-27 16:27:38 +00:00
|
|
|
|
|
|
|
|
/* Decodes a packed immediate (the layout produced by pack()) for the given
 * chip. vs is not part of the packed value and is set to unset_counter.
 *
 * Bit layout read here:
 *   [3:0]   vm, low bits
 *   [6:4]   exp
 *   [11:8]  lgkm, low bits
 *   [13:12] lgkm, high bits (GFX10 and newer only)
 *   [15:14] vm, high bits   (GFX9 and newer only)
 */
wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
{
   unsigned vm_bits = packed & 0xf;
   if (chip >= GFX9)
      vm_bits |= (packed >> 10) & 0x30;

   unsigned lgkm_bits = (packed >> 8) & 0xf;
   if (chip >= GFX10)
      lgkm_bits |= (packed >> 8) & 0x30;

   vm = vm_bits;
   exp = (packed >> 4) & 0x7;
   lgkm = lgkm_bits;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
uint16_t
|
|
|
|
|
wait_imm::pack(enum chip_class chip) const
|
2021-01-27 16:27:38 +00:00
|
|
|
{
|
|
|
|
|
uint16_t imm = 0;
|
|
|
|
|
assert(exp == unset_counter || exp <= 0x7);
|
|
|
|
|
switch (chip) {
|
|
|
|
|
case GFX10:
|
|
|
|
|
case GFX10_3:
|
|
|
|
|
assert(lgkm == unset_counter || lgkm <= 0x3f);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0x3f);
|
|
|
|
|
imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
|
|
|
|
break;
|
|
|
|
|
case GFX9:
|
|
|
|
|
assert(lgkm == unset_counter || lgkm <= 0xf);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0x3f);
|
|
|
|
|
imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
assert(lgkm == unset_counter || lgkm <= 0xf);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0xf);
|
|
|
|
|
imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
if (chip < GFX9 && vm == wait_imm::unset_counter)
|
2021-06-09 10:14:54 +02:00
|
|
|
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
|
|
|
|
|
architecture when interpreting the immediate */
|
2021-01-27 16:27:38 +00:00
|
|
|
if (chip < GFX10 && lgkm == wait_imm::unset_counter)
|
2021-06-09 10:14:54 +02:00
|
|
|
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
|
|
|
|
|
architecture when interpreting the immediate */
|
2021-01-27 16:27:38 +00:00
|
|
|
return imm;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
wait_imm::combine(const wait_imm& other)
|
2021-01-27 16:27:38 +00:00
|
|
|
{
|
|
|
|
|
bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
|
|
|
|
|
vm = std::min(vm, other.vm);
|
|
|
|
|
exp = std::min(exp, other.exp);
|
|
|
|
|
lgkm = std::min(lgkm, other.lgkm);
|
|
|
|
|
vs = std::min(vs, other.vs);
|
|
|
|
|
return changed;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
wait_imm::empty() const
|
2021-01-27 16:27:38 +00:00
|
|
|
{
|
2021-06-09 10:14:54 +02:00
|
|
|
return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
|
|
|
|
|
vs == unset_counter;
|
2021-01-27 16:27:38 +00:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
should_form_clause(const Instruction* a, const Instruction* b)
|
2020-11-30 17:53:23 +00:00
|
|
|
{
|
|
|
|
|
/* Vertex attribute loads from the same binding likely load from similar addresses */
|
2021-06-09 10:14:54 +02:00
|
|
|
unsigned a_vtx_binding =
|
|
|
|
|
a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
|
|
|
|
|
unsigned b_vtx_binding =
|
|
|
|
|
b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
|
2020-11-30 17:53:23 +00:00
|
|
|
if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (a->format != b->format)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Assume loads which don't use descriptors might load from similar addresses. */
|
|
|
|
|
if (a->isFlatLike())
|
|
|
|
|
return true;
|
|
|
|
|
if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* If they load from the same descriptor, assume they might load from similar
|
|
|
|
|
* addresses.
|
|
|
|
|
*/
|
|
|
|
|
if (a->isVMEM() || a->isSMEM())
|
|
|
|
|
return a->operands[0].tempId() == b->operands[0].tempId();
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
} // namespace aco
|