2020-06-03 11:27:55 +01:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2020 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
2021-06-09 15:40:03 +02:00
|
|
|
|
2020-06-03 11:27:55 +01:00
|
|
|
#include "aco_ir.h"
|
2021-06-09 15:40:03 +02:00
|
|
|
|
2021-07-14 17:11:44 +01:00
|
|
|
#include "aco_builder.h"
|
|
|
|
|
|
2022-09-13 12:49:56 +03:00
|
|
|
#include "util/u_debug.h"
|
2020-06-03 11:27:55 +01:00
|
|
|
|
2021-06-09 15:40:03 +02:00
|
|
|
#include "c11/threads.h"
|
|
|
|
|
|
2020-06-03 11:27:55 +01:00
|
|
|
namespace aco {
|
|
|
|
|
|
2022-10-21 11:09:46 +02:00
|
|
|
/* Per-thread pointer to the memory resource of the program being compiled on this thread.
 * init_program() points it at program->m; it is nullptr until then.
 */
thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;
|
2022-08-15 21:42:19 +02:00
|
|
|
|
2020-01-22 19:57:20 +00:00
|
|
|
/* Bitmask of DEBUG_* flags. Zero until init_once() fills it in from the ACO_DEBUG environment
 * variable.
 */
uint64_t debug_flags = 0;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
|
|
|
|
|
{"validatera", DEBUG_VALIDATE_RA},
|
2022-10-06 10:24:14 -05:00
|
|
|
{"novalidateir", DEBUG_NO_VALIDATE_IR},
|
2021-06-09 10:14:54 +02:00
|
|
|
{"perfwarn", DEBUG_PERFWARN},
|
|
|
|
|
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
|
2022-08-25 19:41:51 +01:00
|
|
|
{"force-waitdeps", DEBUG_FORCE_WAITDEPS},
|
2021-06-09 10:14:54 +02:00
|
|
|
{"novn", DEBUG_NO_VN},
|
|
|
|
|
{"noopt", DEBUG_NO_OPT},
|
|
|
|
|
{"nosched", DEBUG_NO_SCHED},
|
|
|
|
|
{"perfinfo", DEBUG_PERF_INFO},
|
|
|
|
|
{"liveinfo", DEBUG_LIVE_INFO},
|
|
|
|
|
{NULL, 0}};
|
2020-01-22 19:57:20 +00:00
|
|
|
|
|
|
|
|
/* Passed to call_once() in init() together with init_once(). */
static once_flag init_once_flag = ONCE_FLAG_INIT;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
static void
|
|
|
|
|
init_once()
|
2020-01-22 19:57:20 +00:00
|
|
|
{
|
|
|
|
|
debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
#ifndef NDEBUG
|
2020-01-22 19:57:20 +00:00
|
|
|
/* enable some flags by default on debug builds */
|
2020-08-18 08:14:06 +02:00
|
|
|
debug_flags |= aco::DEBUG_VALIDATE_IR;
|
2021-06-09 10:14:54 +02:00
|
|
|
#endif
|
2022-10-06 10:24:14 -05:00
|
|
|
|
|
|
|
|
if (debug_flags & aco::DEBUG_NO_VALIDATE_IR)
|
|
|
|
|
debug_flags &= ~aco::DEBUG_VALIDATE_IR;
|
2020-01-22 19:57:20 +00:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
init()
|
2020-01-22 19:57:20 +00:00
|
|
|
{
|
|
|
|
|
call_once(&init_once_flag, init_once);
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
2022-05-05 13:34:41 +10:00
|
|
|
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
|
2022-05-12 02:50:17 -04:00
|
|
|
enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
|
2021-06-09 10:14:54 +02:00
|
|
|
ac_shader_config* config)
|
2020-01-22 19:57:20 +00:00
|
|
|
{
|
2022-10-21 11:09:46 +02:00
|
|
|
instruction_buffer = &program->m;
|
2020-01-22 19:57:20 +00:00
|
|
|
program->stage = stage;
|
|
|
|
|
program->config = config;
|
2022-05-05 13:34:41 +10:00
|
|
|
program->info = *info;
|
2022-05-12 02:50:17 -04:00
|
|
|
program->gfx_level = gfx_level;
|
2020-01-22 19:57:20 +00:00
|
|
|
if (family == CHIP_UNKNOWN) {
|
2022-05-12 02:50:17 -04:00
|
|
|
switch (gfx_level) {
|
2021-06-09 10:14:54 +02:00
|
|
|
case GFX6: program->family = CHIP_TAHITI; break;
|
|
|
|
|
case GFX7: program->family = CHIP_BONAIRE; break;
|
|
|
|
|
case GFX8: program->family = CHIP_POLARIS10; break;
|
|
|
|
|
case GFX9: program->family = CHIP_VEGA10; break;
|
|
|
|
|
case GFX10: program->family = CHIP_NAVI10; break;
|
2022-06-17 17:42:35 +01:00
|
|
|
case GFX10_3: program->family = CHIP_NAVI21; break;
|
|
|
|
|
case GFX11: program->family = CHIP_GFX1100; break;
|
2021-06-09 10:14:54 +02:00
|
|
|
default: program->family = CHIP_UNKNOWN; break;
|
2020-01-22 19:57:20 +00:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
program->family = family;
|
|
|
|
|
}
|
|
|
|
|
program->wave_size = info->wave_size;
|
|
|
|
|
program->lane_mask = program->wave_size == 32 ? s1 : s2;
|
|
|
|
|
|
2023-05-26 12:55:35 +01:00
|
|
|
program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024
|
|
|
|
|
: gfx_level >= GFX7 ? 512
|
|
|
|
|
: 256;
|
2022-05-12 02:50:17 -04:00
|
|
|
program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
|
2023-03-15 11:59:41 -07:00
|
|
|
|
|
|
|
|
/* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
|
2022-05-12 02:50:17 -04:00
|
|
|
program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;
|
2023-03-15 11:59:41 -07:00
|
|
|
|
2020-01-22 19:57:20 +00:00
|
|
|
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
|
2020-01-22 19:57:20 +00:00
|
|
|
|
aco/gfx11: increase vgpr_limit to 256
fossil-db (gfx1100):
Totals from 280 (0.21% of 134574) affected shaders:
MaxWaves: 3124 -> 2846 (-8.90%); split: +3.46%, -12.36%
Instrs: 1139038 -> 1091407 (-4.18%); split: -4.18%, +0.00%
CodeSize: 5809332 -> 5486812 (-5.55%); split: -5.55%, +0.00%
VGPRs: 35004 -> 42864 (+22.45%); split: -1.85%, +24.31%
SpillSGPRs: 1896 -> 1865 (-1.64%); split: -2.37%, +0.74%
SpillVGPRs: 17807 -> 2382 (-86.62%)
Scratch: 2573312 -> 736256 (-71.39%)
Latency: 27470485 -> 17981296 (-34.54%); split: -34.54%, +0.00%
InvThroughput: 5606102 -> 6527051 (+16.43%); split: -4.19%, +20.61%
VClause: 32319 -> 19927 (-38.34%); split: -39.13%, +0.78%
SClause: 15014 -> 14897 (-0.78%); split: -0.95%, +0.17%
Copies: 102977 -> 93511 (-9.19%); split: -9.93%, +0.74%
Branches: 15164 -> 14969 (-1.29%)
PreSGPRs: 19132 -> 19014 (-0.62%)
PreVGPRs: 30494 -> 37460 (+22.84%)
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20251>
2022-12-09 15:42:44 +00:00
|
|
|
program->dev.vgpr_limit = 256;
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.physical_vgprs = 256;
|
|
|
|
|
program->dev.vgpr_alloc_granule = 4;
|
2020-01-22 19:57:20 +00:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level >= GFX10) {
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
|
|
|
|
|
program->dev.sgpr_alloc_granule = 128;
|
2021-06-09 10:14:54 +02:00
|
|
|
program->dev.sgpr_limit =
|
|
|
|
|
108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
|
2022-09-26 17:18:48 +01:00
|
|
|
|
|
|
|
|
if (family == CHIP_GFX1100 || family == CHIP_GFX1101) {
|
|
|
|
|
program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
|
|
|
|
|
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
|
|
|
|
|
} else {
|
|
|
|
|
program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
|
|
|
|
|
if (gfx_level >= GFX10_3)
|
|
|
|
|
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
|
|
|
|
|
else
|
|
|
|
|
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
|
|
|
|
|
}
|
2022-05-12 02:50:17 -04:00
|
|
|
} else if (program->gfx_level >= GFX8) {
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.physical_sgprs = 800;
|
|
|
|
|
program->dev.sgpr_alloc_granule = 16;
|
|
|
|
|
program->dev.sgpr_limit = 102;
|
2021-02-05 14:36:39 +01:00
|
|
|
if (family == CHIP_TONGA || family == CHIP_ICELAND)
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
|
2020-01-22 19:57:20 +00:00
|
|
|
} else {
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.physical_sgprs = 512;
|
|
|
|
|
program->dev.sgpr_alloc_granule = 8;
|
|
|
|
|
program->dev.sgpr_limit = 104;
|
2020-01-22 19:57:20 +00:00
|
|
|
}
|
|
|
|
|
|
2023-01-05 13:58:02 +00:00
|
|
|
program->dev.scratch_alloc_granule = gfx_level >= GFX11 ? 256 : 1024;
|
|
|
|
|
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.max_wave64_per_simd = 10;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (program->gfx_level >= GFX10_3)
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.max_wave64_per_simd = 16;
|
2022-05-12 02:50:17 -04:00
|
|
|
else if (program->gfx_level == GFX10)
|
2021-01-28 13:07:11 +00:00
|
|
|
program->dev.max_wave64_per_simd = 20;
|
|
|
|
|
else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
|
|
|
|
|
program->dev.max_wave64_per_simd = 8;
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;
|
2021-01-28 13:07:11 +00:00
|
|
|
|
|
|
|
|
switch (program->family) {
|
|
|
|
|
/* GFX8 APUs */
|
|
|
|
|
case CHIP_CARRIZO:
|
|
|
|
|
case CHIP_STONEY:
|
|
|
|
|
/* GFX9 APUS */
|
|
|
|
|
case CHIP_RAVEN:
|
|
|
|
|
case CHIP_RAVEN2:
|
2021-06-09 10:14:54 +02:00
|
|
|
case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
|
|
|
|
|
default: break;
|
2021-01-28 13:07:11 +00:00
|
|
|
}
|
|
|
|
|
|
2022-11-02 14:42:40 -04:00
|
|
|
program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
|
2021-01-28 13:07:11 +00:00
|
|
|
/* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
|
2022-05-12 02:50:17 -04:00
|
|
|
program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
|
2021-01-28 13:07:11 +00:00
|
|
|
program->family == CHIP_HAWAII)
|
|
|
|
|
program->dev.has_fast_fma32 = true;
|
2022-05-12 02:50:17 -04:00
|
|
|
program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level >= GFX10;
|
2021-01-28 13:07:11 +00:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
program->dev.fused_mad_mix = program->gfx_level >= GFX10;
|
2022-01-27 14:00:38 +00:00
|
|
|
if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
|
2022-11-02 14:42:40 -04:00
|
|
|
program->family == CHIP_MI100 || program->family == CHIP_MI200)
|
2022-01-27 14:00:38 +00:00
|
|
|
program->dev.fused_mad_mix = true;
|
|
|
|
|
|
2022-05-19 15:18:36 +01:00
|
|
|
if (program->gfx_level >= GFX11) {
|
|
|
|
|
program->dev.scratch_global_offset_min = -4096;
|
|
|
|
|
program->dev.scratch_global_offset_max = 4095;
|
|
|
|
|
} else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
|
|
|
|
|
program->dev.scratch_global_offset_min = -2048;
|
|
|
|
|
program->dev.scratch_global_offset_max = 2047;
|
|
|
|
|
} else if (program->gfx_level == GFX9) {
|
|
|
|
|
/* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
|
|
|
|
|
program->dev.scratch_global_offset_min = 0;
|
|
|
|
|
program->dev.scratch_global_offset_max = 4095;
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-21 14:44:09 +00:00
|
|
|
if (program->gfx_level >= GFX11) {
|
|
|
|
|
/* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
|
|
|
|
|
* rest of the address.
|
|
|
|
|
*/
|
|
|
|
|
program->dev.max_nsa_vgprs = 4;
|
|
|
|
|
} else if (program->gfx_level >= GFX10_3) {
|
|
|
|
|
/* GFX10.3 can have up to 3 NSA dwords. */
|
|
|
|
|
program->dev.max_nsa_vgprs = 13;
|
|
|
|
|
} else if (program->gfx_level >= GFX10) {
|
|
|
|
|
/* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
|
|
|
|
|
program->dev.max_nsa_vgprs = 5;
|
|
|
|
|
} else {
|
|
|
|
|
program->dev.max_nsa_vgprs = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-01 15:14:01 +00:00
|
|
|
program->wgp_mode = wgp_mode;
|
2021-01-28 11:07:26 +00:00
|
|
|
|
2021-04-20 17:35:41 +01:00
|
|
|
program->progress = CompilationProgress::after_isel;
|
|
|
|
|
|
2020-01-22 19:57:20 +00:00
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
|
|
|
|
|
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
|
|
|
|
|
program->next_fp_mode.must_flush_denorms32 = false;
|
|
|
|
|
program->next_fp_mode.must_flush_denorms16_64 = false;
|
|
|
|
|
program->next_fp_mode.care_about_round32 = false;
|
|
|
|
|
program->next_fp_mode.care_about_round16_64 = false;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.denorm32 = 0;
|
|
|
|
|
program->next_fp_mode.round16_64 = fp_round_ne;
|
|
|
|
|
program->next_fp_mode.round32 = fp_round_ne;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Returns the memory synchronization info attached to an instruction.
 *
 * The POPS pseudo-instructions (and s_wait_event when it does wait for export-ready) act as
 * queue-family-scope acquire/release barriers on buffer and image storage. Memory instruction
 * formats return their own "sync" field; every other instruction yields a default-constructed
 * memory_sync_info.
 */
memory_sync_info
get_sync_info(const Instruction* instr)
{
   const aco_opcode opcode = instr->opcode;

   /* Primitive Ordered Pixel Shading barriers necessary for accesses to memory shared between
    * overlapping waves in the queue family.
    */
   const bool pops_acquire =
      opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done ||
      (opcode == aco_opcode::s_wait_event &&
       !(instr->sopp().imm & wait_event_imm_dont_wait_export_ready));
   if (pops_acquire)
      return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);

   if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done)
      return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);

   switch (instr->format) {
   case Format::SMEM: return instr->smem().sync;
   case Format::MUBUF: return instr->mubuf().sync;
   case Format::MIMG: return instr->mimg().sync;
   case Format::MTBUF: return instr->mtbuf().sync;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return instr->flatlike().sync;
   case Format::DS: return instr->ds().sync;
   case Format::LDSDIR: return instr->ldsdir().sync;
   default: return memory_sync_info();
   }
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
2022-05-12 02:50:17 -04:00
|
|
|
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
|
2020-05-11 17:49:40 +01:00
|
|
|
{
|
|
|
|
|
if (!instr->isVALU())
|
|
|
|
|
return false;
|
|
|
|
|
|
2022-05-13 12:01:03 +01:00
|
|
|
if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->isSDWA())
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (instr->isVOP3()) {
|
2023-02-21 20:08:42 +01:00
|
|
|
VALU_instruction& vop3 = instr->valu();
|
2020-05-11 17:49:40 +01:00
|
|
|
if (instr->format == Format::VOP3)
|
|
|
|
|
return false;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (vop3.omod && gfx_level < GFX9)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
// TODO: return true if we know we will use vcc
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (!pre_ra && instr->definitions.size() >= 2)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 1; i < instr->operands.size(); i++) {
|
|
|
|
|
if (instr->operands[i].isLiteral())
|
|
|
|
|
return false;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
aco: use VOPC_SDWA on GFX9+
Totals from 5138 (3.42% of 150170) affected shaders: (GFX10.3)
VGPRs: 409520 -> 409416 (-0.03%); split: -0.03%, +0.00%
CodeSize: 43056360 -> 43035696 (-0.05%); split: -0.06%, +0.02%
MaxWaves: 69296 -> 69310 (+0.02%)
Instrs: 8161016 -> 8153365 (-0.09%); split: -0.10%, +0.01%
Latency: 109397002 -> 109756208 (+0.33%); split: -0.05%, +0.38%
InvThroughput: 23238920 -> 23310761 (+0.31%); split: -0.11%, +0.42%
VClause: 135141 -> 135100 (-0.03%); split: -0.05%, +0.02%
SClause: 349511 -> 349489 (-0.01%); split: -0.01%, +0.00%
Copies: 388107 -> 387754 (-0.09%); split: -0.48%, +0.38%
Branches: 184629 -> 184503 (-0.07%); split: -0.08%, +0.01%
PreSGPRs: 258807 -> 258839 (+0.01%)
PreVGPRs: 372561 -> 372184 (-0.10%); split: -0.10%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12364>
2021-07-07 11:37:49 +02:00
|
|
|
if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
|
2021-06-07 16:56:45 +01:00
|
|
|
return false;
|
|
|
|
|
|
2020-05-11 17:49:40 +01:00
|
|
|
if (!instr->operands.empty()) {
|
|
|
|
|
if (instr->operands[0].isLiteral())
|
|
|
|
|
return false;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
2021-06-07 16:56:45 +01:00
|
|
|
if (instr->operands[0].bytes() > 4)
|
|
|
|
|
return false;
|
|
|
|
|
if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
|
|
|
|
|
return false;
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;
|
2020-05-11 17:49:40 +01:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level != GFX8 && is_mac)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
// TODO: return true if we know we will use vcc
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
|
2022-10-23 00:37:43 +02:00
|
|
|
instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
|
2020-05-11 17:49:40 +01:00
|
|
|
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
2021-06-09 10:14:54 +02:00
|
|
|
instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Replaces "instr" with an SDWA-encoded copy of itself.
 *
 * Returns the previous instruction, or NULL (leaving "instr" untouched) when it already was
 * SDWA. Operands, definitions, pass_flags and - for VOP3 sources - the neg/abs/omod/clamp
 * modifiers are carried over. Selections default to the full size of each operand/definition.
 */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   aco_ptr<Instruction> old = std::move(instr);
   const Format sdwa_format = asSDWA(withoutVOP3(old->format));
   instr.reset(create_instruction<SDWA_instruction>(old->opcode, sdwa_format,
                                                    old->operands.size(),
                                                    old->definitions.size()));
   std::copy(old->operands.cbegin(), old->operands.cend(), instr->operands.begin());
   std::copy(old->definitions.cbegin(), old->definitions.cend(), instr->definitions.begin());
   instr->pass_flags = old->pass_flags;

   SDWA_instruction& sdwa = instr->sdwa();

   if (old->isVOP3()) {
      VALU_instruction& src_mods = old->valu();
      sdwa.neg = src_mods.neg;
      sdwa.abs = src_mods.abs;
      sdwa.omod = src_mods.omod;
      sdwa.clamp = src_mods.clamp;
   }

   /* SDWA only uses operands 0 and 1. */
   unsigned num_sel = instr->operands.size();
   if (num_sel > 2)
      num_sel = 2;
   for (unsigned idx = 0; idx < num_sel; idx++)
      sdwa.sel[idx] = SubdwordSel(instr->operands[idx].bytes(), 0, false);

   Definition& dst = instr->definitions[0];
   sdwa.dst_sel = SubdwordSel(dst.bytes(), 0, false);

   /* An SGPR result on GFX8, any second definition and any third operand are pinned to VCC. */
   if (gfx_level == GFX8 && dst.getTemp().type() == RegType::sgpr)
      dst.setFixed(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setFixed(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   return old;
}
|
|
|
|
|
|
2021-07-14 17:11:44 +01:00
|
|
|
bool
|
2023-04-23 14:55:17 +02:00
|
|
|
can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
|
2021-07-14 17:11:44 +01:00
|
|
|
{
|
|
|
|
|
assert(instr->isVALU() && !instr->operands.empty());
|
|
|
|
|
|
|
|
|
|
if (instr->isDPP())
|
2021-11-29 00:12:04 +09:00
|
|
|
return instr->isDPP8() == dpp8;
|
2021-07-14 17:11:44 +01:00
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if (instr->isSDWA() || instr->isVINTERP_INREG())
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
|
|
|
|
|
instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
|
|
|
|
|
instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
|
|
|
|
|
gfx_level < GFX11)
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if (instr->isVOP3() && gfx_level < GFX11) {
|
2023-02-21 20:08:42 +01:00
|
|
|
const VALU_instruction* vop3 = &instr->valu();
|
2023-04-23 14:55:17 +02:00
|
|
|
if (vop3->clamp || vop3->omod)
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
2021-11-29 00:12:04 +09:00
|
|
|
if (dpp8)
|
|
|
|
|
return false;
|
2023-04-23 14:55:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
if (instr->operands[i].isLiteral())
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
2023-04-23 14:55:17 +02:00
|
|
|
if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
|
2021-11-29 16:34:15 +00:00
|
|
|
return false;
|
2021-07-14 17:11:44 +01:00
|
|
|
}
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
/* simpler than listing all VOP3P opcodes which do not support DPP */
|
|
|
|
|
if (instr->isVOP3P()) {
|
|
|
|
|
return instr->opcode == aco_opcode::v_fma_mix_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_dot2_f32_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_dot2_f32_bf16;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 17:11:44 +01:00
|
|
|
/* there are more cases but those all take 64-bit inputs */
|
|
|
|
|
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
|
2023-01-04 14:53:00 +00:00
|
|
|
instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
|
2021-07-14 17:11:44 +01:00
|
|
|
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_cvt_f64_i32 &&
|
2023-04-23 14:55:17 +02:00
|
|
|
instr->opcode != aco_opcode::v_cvt_f64_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_mul_hi_i32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_mad_u64_u32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_mad_i64_i32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_permlane16_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_permlanex16_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_permlane64_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32_e64;
|
2021-07-14 17:11:44 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Replaces "instr" with a DPP8 or DPP16 copy of itself.
 *
 * Returns the previous instruction, or NULL (leaving "instr" untouched) when it already was
 * DPP. The new instruction starts with an identity lane selection (DPP8) or an identity
 * quad_perm with full row/bank masks (DPP16). All VALU modifiers and pass_flags are carried
 * over. Before GFX11 the carry/condition definition and an SGPR third operand are pinned to
 * VCC. The VOP3 bit is dropped again when the DPP16 encoding alone can express the result.
 */
aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   aco_ptr<Instruction> old = std::move(instr);
   const Format dpp_bit = dpp8 ? Format::DPP8 : Format::DPP16;
   const Format dpp_format = (Format)((uint32_t)old->format | (uint32_t)dpp_bit);

   if (dpp8) {
      instr.reset(create_instruction<DPP8_instruction>(old->opcode, dpp_format,
                                                       old->operands.size(),
                                                       old->definitions.size()));
      DPP8_instruction& dpp = instr->dpp8();
      for (unsigned lane = 0; lane < 8; lane++)
         dpp.lane_sel[lane] = lane;
   } else {
      instr.reset(create_instruction<DPP16_instruction>(old->opcode, dpp_format,
                                                        old->operands.size(),
                                                        old->definitions.size()));
      DPP16_instruction& dpp = instr->dpp16();
      dpp.dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp.row_mask = 0xf;
      dpp.bank_mask = 0xf;
   }

   std::copy(old->operands.cbegin(), old->operands.cend(), instr->operands.begin());
   std::copy(old->definitions.cbegin(), old->definitions.cend(), instr->definitions.begin());

   VALU_instruction& new_mods = instr->valu();
   VALU_instruction& old_mods = old->valu();
   new_mods.neg = old_mods.neg;
   new_mods.abs = old_mods.abs;
   new_mods.omod = old_mods.omod;
   new_mods.clamp = old_mods.clamp;
   new_mods.opsel = old_mods.opsel;
   new_mods.opsel_lo = old_mods.opsel_lo;
   new_mods.opsel_hi = old_mods.opsel_hi;

   if (gfx_level < GFX11) {
      if (instr->isVOPC() || instr->definitions.size() > 1)
         instr->definitions.back().setFixed(vcc);

      if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr))
         instr->operands[2].setFixed(vcc);
   }

   instr->pass_flags = old->pass_flags;

   /* DPP16 supports input modifiers, so we might no longer need VOP3. */
   const bool plain_encoding_ok = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
                                  (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());

   /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
   const Definition& last_def = instr->definitions.back();
   const bool def_ok = last_def.regClass().type() != RegType::sgpr || !last_def.isFixed() ||
                       last_def.physReg() == vcc;

   /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
   const bool src2_ok = instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
                        instr->operands[2].isOfType(RegType::vgpr) ||
                        instr->operands[2].physReg() == vcc;

   if (plain_encoding_ok && def_ok && src2_ok)
      instr->format = withoutVOP3(instr->format);

   return old;
}
|
|
|
|
|
|
2023-05-16 17:02:45 +02:00
|
|
|
bool
|
|
|
|
|
can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
|
|
|
|
|
{
|
|
|
|
|
if (op == aco_opcode::v_mov_b32)
|
|
|
|
|
return gfx_level >= GFX10;
|
|
|
|
|
|
|
|
|
|
if (op == aco_opcode::v_ldexp_f16 || op == aco_opcode::v_ldexp_f32 ||
|
|
|
|
|
op == aco_opcode::v_ldexp_f64)
|
|
|
|
|
return idx == 0;
|
|
|
|
|
|
|
|
|
|
return instr_info.can_use_input_modifiers[(int)op];
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
2022-05-12 02:50:17 -04:00
|
|
|
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
|
2020-06-03 11:27:55 +01:00
|
|
|
{
|
|
|
|
|
/* opsel is only GFX9+ */
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9)
|
2020-06-03 11:27:55 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
case aco_opcode::v_div_fixup_f16:
|
|
|
|
|
case aco_opcode::v_fma_f16:
|
|
|
|
|
case aco_opcode::v_mad_f16:
|
|
|
|
|
case aco_opcode::v_mad_u16:
|
|
|
|
|
case aco_opcode::v_mad_i16:
|
|
|
|
|
case aco_opcode::v_med3_f16:
|
|
|
|
|
case aco_opcode::v_med3_i16:
|
|
|
|
|
case aco_opcode::v_med3_u16:
|
|
|
|
|
case aco_opcode::v_min3_f16:
|
|
|
|
|
case aco_opcode::v_min3_i16:
|
|
|
|
|
case aco_opcode::v_min3_u16:
|
|
|
|
|
case aco_opcode::v_max3_f16:
|
|
|
|
|
case aco_opcode::v_max3_i16:
|
|
|
|
|
case aco_opcode::v_max3_u16:
|
2023-01-14 11:38:25 +01:00
|
|
|
case aco_opcode::v_minmax_f16:
|
|
|
|
|
case aco_opcode::v_maxmin_f16:
|
2020-06-03 11:27:55 +01:00
|
|
|
case aco_opcode::v_max_u16_e64:
|
|
|
|
|
case aco_opcode::v_max_i16_e64:
|
|
|
|
|
case aco_opcode::v_min_u16_e64:
|
|
|
|
|
case aco_opcode::v_min_i16_e64:
|
|
|
|
|
case aco_opcode::v_add_i16:
|
|
|
|
|
case aco_opcode::v_sub_i16:
|
|
|
|
|
case aco_opcode::v_add_u16_e64:
|
|
|
|
|
case aco_opcode::v_sub_u16_e64:
|
|
|
|
|
case aco_opcode::v_lshlrev_b16_e64:
|
|
|
|
|
case aco_opcode::v_lshrrev_b16_e64:
|
|
|
|
|
case aco_opcode::v_ashrrev_i16_e64:
|
2023-01-14 11:38:25 +01:00
|
|
|
case aco_opcode::v_and_b16:
|
|
|
|
|
case aco_opcode::v_or_b16:
|
|
|
|
|
case aco_opcode::v_xor_b16:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::v_mul_lo_u16_e64: return true;
|
2020-06-03 11:27:55 +01:00
|
|
|
case aco_opcode::v_pack_b32_f16:
|
2020-08-17 11:36:24 +01:00
|
|
|
case aco_opcode::v_cvt_pknorm_i16_f16:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
|
2020-06-03 11:27:55 +01:00
|
|
|
case aco_opcode::v_mad_u32_u16:
|
2021-06-09 10:14:54 +02:00
|
|
|
case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
|
2022-06-16 18:15:16 +01:00
|
|
|
case aco_opcode::v_dot2_f16_f16:
|
|
|
|
|
case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
|
2023-01-14 11:38:25 +01:00
|
|
|
case aco_opcode::v_cndmask_b16: return idx != 2;
|
2022-06-17 13:53:08 +01:00
|
|
|
case aco_opcode::v_interp_p10_f16_f32_inreg:
|
|
|
|
|
case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
|
|
|
|
|
case aco_opcode::v_interp_p2_f16_f32_inreg:
|
|
|
|
|
case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
|
2023-03-21 13:26:19 +01:00
|
|
|
default:
|
|
|
|
|
return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
|
2020-06-03 11:27:55 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-27 16:22:52 +01:00
|
|
|
bool
|
2023-05-06 17:03:22 +02:00
|
|
|
can_write_m0(const aco_ptr<Instruction>& instr)
|
2023-04-27 16:22:52 +01:00
|
|
|
{
|
|
|
|
|
if (instr->isSALU())
|
|
|
|
|
return true;
|
|
|
|
|
|
2023-05-06 17:03:22 +02:00
|
|
|
/* VALU can't write m0 on any GPU generations. */
|
2023-04-27 16:22:52 +01:00
|
|
|
if (instr->isVALU())
|
2023-05-06 17:03:22 +02:00
|
|
|
return false;
|
2023-04-27 16:22:52 +01:00
|
|
|
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::p_parallelcopy:
|
|
|
|
|
case aco_opcode::p_extract:
|
|
|
|
|
case aco_opcode::p_insert:
|
2023-05-06 17:03:22 +02:00
|
|
|
/* These pseudo instructions are implemented with SALU when writing m0. */
|
2023-04-27 16:22:52 +01:00
|
|
|
return true;
|
|
|
|
|
default:
|
2023-05-06 17:03:22 +02:00
|
|
|
/* Assume that no other instructions can write m0. */
|
2023-04-27 16:22:52 +01:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-18 18:56:59 +02:00
|
|
|
bool
|
2022-05-12 02:50:17 -04:00
|
|
|
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
|
2021-08-18 18:56:59 +02:00
|
|
|
{
|
|
|
|
|
/* partial register writes are GFX9+, only */
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9)
|
2021-08-18 18:56:59 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
/* VOP3 */
|
|
|
|
|
case aco_opcode::v_mad_f16:
|
|
|
|
|
case aco_opcode::v_mad_u16:
|
|
|
|
|
case aco_opcode::v_mad_i16:
|
|
|
|
|
case aco_opcode::v_fma_f16:
|
|
|
|
|
case aco_opcode::v_div_fixup_f16:
|
|
|
|
|
case aco_opcode::v_interp_p2_f16:
|
|
|
|
|
case aco_opcode::v_fma_mixlo_f16:
|
2022-06-01 17:16:55 +02:00
|
|
|
case aco_opcode::v_fma_mixhi_f16:
|
2021-08-18 18:56:59 +02:00
|
|
|
/* VOP2 */
|
|
|
|
|
case aco_opcode::v_mac_f16:
|
|
|
|
|
case aco_opcode::v_madak_f16:
|
2022-05-12 02:50:17 -04:00
|
|
|
case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
|
2021-08-18 18:56:59 +02:00
|
|
|
case aco_opcode::v_add_f16:
|
|
|
|
|
case aco_opcode::v_sub_f16:
|
|
|
|
|
case aco_opcode::v_subrev_f16:
|
|
|
|
|
case aco_opcode::v_mul_f16:
|
|
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_ldexp_f16:
|
|
|
|
|
case aco_opcode::v_fmac_f16:
|
|
|
|
|
case aco_opcode::v_fmamk_f16:
|
|
|
|
|
case aco_opcode::v_fmaak_f16:
|
|
|
|
|
/* VOP1 */
|
|
|
|
|
case aco_opcode::v_cvt_f16_f32:
|
2023-01-08 16:54:56 +01:00
|
|
|
case aco_opcode::p_cvt_f16_f32_rtne:
|
2021-08-18 18:56:59 +02:00
|
|
|
case aco_opcode::v_cvt_f16_u16:
|
|
|
|
|
case aco_opcode::v_cvt_f16_i16:
|
|
|
|
|
case aco_opcode::v_rcp_f16:
|
|
|
|
|
case aco_opcode::v_sqrt_f16:
|
|
|
|
|
case aco_opcode::v_rsq_f16:
|
|
|
|
|
case aco_opcode::v_log_f16:
|
|
|
|
|
case aco_opcode::v_exp_f16:
|
|
|
|
|
case aco_opcode::v_frexp_mant_f16:
|
|
|
|
|
case aco_opcode::v_frexp_exp_i16_f16:
|
|
|
|
|
case aco_opcode::v_floor_f16:
|
|
|
|
|
case aco_opcode::v_ceil_f16:
|
|
|
|
|
case aco_opcode::v_trunc_f16:
|
|
|
|
|
case aco_opcode::v_rndne_f16:
|
|
|
|
|
case aco_opcode::v_fract_f16:
|
|
|
|
|
case aco_opcode::v_sin_f16:
|
2023-01-08 16:54:56 +01:00
|
|
|
case aco_opcode::v_cos_f16:
|
|
|
|
|
case aco_opcode::v_cvt_u16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_norm_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
|
2021-08-18 18:56:59 +02:00
|
|
|
/* on GFX10, all opsel instructions preserve the high bits */
|
2022-05-12 02:50:17 -04:00
|
|
|
default: return gfx_level >= GFX10 && can_use_opsel(gfx_level, op, -1);
|
2021-08-18 18:56:59 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-11 19:58:45 +00:00
|
|
|
/* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
|
|
|
|
|
* only supports v0-v127.
|
|
|
|
|
*/
|
|
|
|
|
uint8_t
|
|
|
|
|
get_gfx11_true16_mask(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
switch (op) {
|
|
|
|
|
case aco_opcode::v_ceil_f16:
|
|
|
|
|
case aco_opcode::v_cos_f16:
|
|
|
|
|
case aco_opcode::v_cvt_f16_i16:
|
|
|
|
|
case aco_opcode::v_cvt_f16_u16:
|
|
|
|
|
case aco_opcode::v_cvt_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_u16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_norm_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_norm_u16_f16:
|
|
|
|
|
case aco_opcode::v_exp_f16:
|
|
|
|
|
case aco_opcode::v_floor_f16:
|
|
|
|
|
case aco_opcode::v_fract_f16:
|
|
|
|
|
case aco_opcode::v_frexp_exp_i16_f16:
|
|
|
|
|
case aco_opcode::v_frexp_mant_f16:
|
|
|
|
|
case aco_opcode::v_log_f16:
|
|
|
|
|
case aco_opcode::v_not_b16:
|
|
|
|
|
case aco_opcode::v_rcp_f16:
|
|
|
|
|
case aco_opcode::v_rndne_f16:
|
|
|
|
|
case aco_opcode::v_rsq_f16:
|
|
|
|
|
case aco_opcode::v_sin_f16:
|
|
|
|
|
case aco_opcode::v_sqrt_f16:
|
|
|
|
|
case aco_opcode::v_trunc_f16:
|
|
|
|
|
case aco_opcode::v_mov_b16: return 0x1 | 0x8;
|
|
|
|
|
case aco_opcode::v_add_f16:
|
|
|
|
|
case aco_opcode::v_fmaak_f16:
|
|
|
|
|
case aco_opcode::v_fmac_f16:
|
|
|
|
|
case aco_opcode::v_fmamk_f16:
|
|
|
|
|
case aco_opcode::v_ldexp_f16:
|
|
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_mul_f16:
|
|
|
|
|
case aco_opcode::v_sub_f16:
|
|
|
|
|
case aco_opcode::v_subrev_f16:
|
|
|
|
|
case aco_opcode::v_and_b16:
|
|
|
|
|
case aco_opcode::v_or_b16:
|
|
|
|
|
case aco_opcode::v_xor_b16: return 0x3 | 0x8;
|
|
|
|
|
case aco_opcode::v_cmp_class_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_class_f16:
|
|
|
|
|
case aco_opcode::v_cvt_f32_f16:
|
|
|
|
|
case aco_opcode::v_cvt_i32_i16:
|
|
|
|
|
case aco_opcode::v_cvt_u32_u16: return 0x1;
|
|
|
|
|
case aco_opcode::v_cmp_eq_f16:
|
|
|
|
|
case aco_opcode::v_cmp_eq_i16:
|
|
|
|
|
case aco_opcode::v_cmp_eq_u16:
|
|
|
|
|
case aco_opcode::v_cmp_ge_f16:
|
|
|
|
|
case aco_opcode::v_cmp_ge_i16:
|
|
|
|
|
case aco_opcode::v_cmp_ge_u16:
|
|
|
|
|
case aco_opcode::v_cmp_gt_f16:
|
|
|
|
|
case aco_opcode::v_cmp_gt_i16:
|
|
|
|
|
case aco_opcode::v_cmp_gt_u16:
|
|
|
|
|
case aco_opcode::v_cmp_le_f16:
|
|
|
|
|
case aco_opcode::v_cmp_le_i16:
|
|
|
|
|
case aco_opcode::v_cmp_le_u16:
|
|
|
|
|
case aco_opcode::v_cmp_lg_f16:
|
|
|
|
|
case aco_opcode::v_cmp_lg_i16:
|
|
|
|
|
case aco_opcode::v_cmp_lg_u16:
|
|
|
|
|
case aco_opcode::v_cmp_lt_f16:
|
|
|
|
|
case aco_opcode::v_cmp_lt_i16:
|
|
|
|
|
case aco_opcode::v_cmp_lt_u16:
|
|
|
|
|
case aco_opcode::v_cmp_neq_f16:
|
|
|
|
|
case aco_opcode::v_cmp_nge_f16:
|
|
|
|
|
case aco_opcode::v_cmp_ngt_f16:
|
|
|
|
|
case aco_opcode::v_cmp_nle_f16:
|
|
|
|
|
case aco_opcode::v_cmp_nlg_f16:
|
|
|
|
|
case aco_opcode::v_cmp_nlt_f16:
|
|
|
|
|
case aco_opcode::v_cmp_o_f16:
|
|
|
|
|
case aco_opcode::v_cmp_u_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_eq_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_eq_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_eq_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_ge_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_ge_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_ge_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_gt_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_gt_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_gt_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_le_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_le_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_le_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_lg_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_lg_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_lg_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_lt_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_lt_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_lt_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_neq_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_nge_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_ngt_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_nle_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_nlg_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_nlt_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_o_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_u_f16: return 0x3;
|
|
|
|
|
case aco_opcode::v_cvt_f16_f32:
|
|
|
|
|
case aco_opcode::v_sat_pk_u8_i16: return 0x8;
|
|
|
|
|
default: return 0x0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
uint32_t
|
|
|
|
|
get_reduction_identity(ReduceOp op, unsigned idx)
|
2020-09-01 16:30:06 +01:00
|
|
|
{
|
|
|
|
|
switch (op) {
|
|
|
|
|
case iadd8:
|
|
|
|
|
case iadd16:
|
|
|
|
|
case iadd32:
|
|
|
|
|
case iadd64:
|
|
|
|
|
case fadd16:
|
|
|
|
|
case fadd32:
|
|
|
|
|
case fadd64:
|
|
|
|
|
case ior8:
|
|
|
|
|
case ior16:
|
|
|
|
|
case ior32:
|
|
|
|
|
case ior64:
|
|
|
|
|
case ixor8:
|
|
|
|
|
case ixor16:
|
|
|
|
|
case ixor32:
|
|
|
|
|
case ixor64:
|
|
|
|
|
case umax8:
|
|
|
|
|
case umax16:
|
|
|
|
|
case umax32:
|
2021-06-09 10:14:54 +02:00
|
|
|
case umax64: return 0;
|
2020-09-01 16:30:06 +01:00
|
|
|
case imul8:
|
|
|
|
|
case imul16:
|
|
|
|
|
case imul32:
|
2021-06-09 10:14:54 +02:00
|
|
|
case imul64: return idx ? 0 : 1;
|
|
|
|
|
case fmul16: return 0x3c00u; /* 1.0 */
|
|
|
|
|
case fmul32: return 0x3f800000u; /* 1.0 */
|
|
|
|
|
case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
|
|
|
|
|
case imin8: return INT8_MAX;
|
|
|
|
|
case imin16: return INT16_MAX;
|
|
|
|
|
case imin32: return INT32_MAX;
|
|
|
|
|
case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
|
|
|
|
|
case imax8: return INT8_MIN;
|
|
|
|
|
case imax16: return INT16_MIN;
|
|
|
|
|
case imax32: return INT32_MIN;
|
|
|
|
|
case imax64: return idx ? 0x80000000u : 0;
|
2020-09-01 16:30:06 +01:00
|
|
|
case umin8:
|
|
|
|
|
case umin16:
|
|
|
|
|
case iand8:
|
2021-06-09 10:14:54 +02:00
|
|
|
case iand16: return 0xffffffffu;
|
2020-09-01 16:30:06 +01:00
|
|
|
case umin32:
|
|
|
|
|
case umin64:
|
|
|
|
|
case iand32:
|
2021-06-09 10:14:54 +02:00
|
|
|
case iand64: return 0xffffffffu;
|
|
|
|
|
case fmin16: return 0x7c00u; /* infinity */
|
|
|
|
|
case fmin32: return 0x7f800000u; /* infinity */
|
|
|
|
|
case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
|
|
|
|
|
case fmax16: return 0xfc00u; /* negative infinity */
|
|
|
|
|
case fmax32: return 0xff800000u; /* negative infinity */
|
|
|
|
|
case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
|
|
|
|
|
default: unreachable("Invalid reduction operation"); break;
|
2020-09-01 16:30:06 +01:00
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2023-05-16 17:10:57 +02:00
|
|
|
unsigned
|
|
|
|
|
get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
|
|
|
|
|
{
|
|
|
|
|
if (instr->isPseudo())
|
|
|
|
|
return instr->operands[index].bytes() * 8u;
|
|
|
|
|
else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_mad_i64_i32)
|
|
|
|
|
return index == 2 ? 64 : 32;
|
|
|
|
|
else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fma_mixhi_f16)
|
|
|
|
|
return instr->valu().opsel_hi[index] ? 16 : 32;
|
|
|
|
|
else if (instr->isVALU() || instr->isSALU())
|
|
|
|
|
return instr_info.operand_size[(int)instr->opcode];
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
needs_exec_mask(const Instruction* instr)
|
|
|
|
|
{
|
2021-11-12 13:46:17 +00:00
|
|
|
if (instr->isVALU()) {
|
|
|
|
|
return instr->opcode != aco_opcode::v_readlane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32_e64;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->isVMEM() || instr->isFlatLike())
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
|
2020-08-12 16:58:35 +02:00
|
|
|
return instr->reads_exec();
|
|
|
|
|
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isPseudo()) {
|
2020-08-12 16:58:35 +02:00
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::p_create_vector:
|
|
|
|
|
case aco_opcode::p_extract_vector:
|
|
|
|
|
case aco_opcode::p_split_vector:
|
2021-05-07 09:37:59 +02:00
|
|
|
case aco_opcode::p_phi:
|
|
|
|
|
case aco_opcode::p_parallelcopy:
|
2020-08-12 16:58:35 +02:00
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
if (def.getTemp().type() == RegType::vgpr)
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2021-11-12 13:46:17 +00:00
|
|
|
return instr->reads_exec();
|
2020-08-12 16:58:35 +02:00
|
|
|
case aco_opcode::p_spill:
|
|
|
|
|
case aco_opcode::p_reload:
|
aco/insert_exec_mask: stay in WQM while helper lanes are still needed
This patch flags all instructions WQM which don't require
Exact mode, but depend on the exec mask as long as WQM
is needed on any control flow path afterwards.
This will mostly prevent accidental copies of WQM values
within Exact mode, and also makes a lot of other workarounds
unnecessary.
Totals from 17374 (12.88% of 134913) affected shaders: (GFX10.3)
VGPRs: 526952 -> 527384 (+0.08%); split: -0.01%, +0.09%
CodeSize: 33740512 -> 33766636 (+0.08%); split: -0.06%, +0.14%
MaxWaves: 488166 -> 488108 (-0.01%); split: +0.00%, -0.02%
Instrs: 6254240 -> 6260557 (+0.10%); split: -0.08%, +0.18%
Latency: 66497580 -> 66463472 (-0.05%); split: -0.15%, +0.10%
InvThroughput: 13265741 -> 13264036 (-0.01%); split: -0.03%, +0.01%
VClause: 122962 -> 122975 (+0.01%); split: -0.01%, +0.02%
SClause: 334805 -> 334405 (-0.12%); split: -0.51%, +0.39%
Copies: 275728 -> 282341 (+2.40%); split: -0.91%, +3.31%
Branches: 92546 -> 90990 (-1.68%); split: -1.68%, +0.00%
PreSGPRs: 504119 -> 504352 (+0.05%); split: -0.00%, +0.05%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14951>
2022-01-24 19:43:49 +01:00
|
|
|
case aco_opcode::p_end_linear_vgpr:
|
2021-05-07 09:37:59 +02:00
|
|
|
case aco_opcode::p_logical_start:
|
|
|
|
|
case aco_opcode::p_logical_end:
|
2022-05-19 14:12:08 +01:00
|
|
|
case aco_opcode::p_startpgm:
|
|
|
|
|
case aco_opcode::p_init_scratch: return instr->reads_exec();
|
2023-05-29 16:39:39 +02:00
|
|
|
case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
|
2021-06-09 10:14:54 +02:00
|
|
|
default: break;
|
2020-08-12 16:58:35 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 17:22:02 +01:00
|
|
|
struct CmpInfo {
|
|
|
|
|
aco_opcode ordered;
|
|
|
|
|
aco_opcode unordered;
|
2022-07-27 11:43:03 +02:00
|
|
|
aco_opcode swapped;
|
2021-07-14 17:22:02 +01:00
|
|
|
aco_opcode inverse;
|
2022-07-27 12:27:07 +02:00
|
|
|
aco_opcode vcmpx;
|
2021-07-14 17:22:02 +01:00
|
|
|
aco_opcode f32;
|
|
|
|
|
unsigned size;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
ALWAYS_INLINE bool
|
|
|
|
|
get_cmp_info(aco_opcode op, CmpInfo* info)
|
|
|
|
|
{
|
|
|
|
|
info->ordered = aco_opcode::num_opcodes;
|
|
|
|
|
info->unordered = aco_opcode::num_opcodes;
|
2022-07-27 11:43:03 +02:00
|
|
|
info->swapped = aco_opcode::num_opcodes;
|
2022-07-27 12:34:27 +02:00
|
|
|
info->inverse = aco_opcode::num_opcodes;
|
2022-07-27 12:03:22 +02:00
|
|
|
info->f32 = aco_opcode::num_opcodes;
|
2023-05-06 18:00:00 +02:00
|
|
|
info->vcmpx = aco_opcode::num_opcodes;
|
2021-07-14 17:22:02 +01:00
|
|
|
switch (op) {
|
|
|
|
|
// clang-format off
|
|
|
|
|
#define CMP2(ord, unord, ord_swap, unord_swap, sz) \
|
|
|
|
|
case aco_opcode::v_cmp_##ord##_f##sz: \
|
|
|
|
|
case aco_opcode::v_cmp_n##unord##_f##sz: \
|
|
|
|
|
info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \
|
|
|
|
|
info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \
|
2022-07-27 11:43:03 +02:00
|
|
|
info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
|
|
|
|
|
: aco_opcode::v_cmp_n##unord_swap##_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
|
|
|
|
|
: aco_opcode::v_cmp_n##ord##_f##sz; \
|
|
|
|
|
info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 \
|
|
|
|
|
: aco_opcode::v_cmp_n##unord##_f32; \
|
2022-07-27 12:27:07 +02:00
|
|
|
info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz \
|
|
|
|
|
: aco_opcode::v_cmpx_n##unord##_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->size = sz; \
|
|
|
|
|
return true;
|
|
|
|
|
#define CMP(ord, unord, ord_swap, unord_swap) \
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 16) \
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 32) \
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 64)
|
|
|
|
|
CMP(lt, /*n*/ge, gt, /*n*/le)
|
|
|
|
|
CMP(eq, /*n*/lg, eq, /*n*/lg)
|
|
|
|
|
CMP(le, /*n*/gt, ge, /*n*/lt)
|
2022-07-27 12:04:58 +02:00
|
|
|
CMP(gt, /*n*/le, lt, /*n*/ge)
|
2021-07-14 17:22:02 +01:00
|
|
|
CMP(lg, /*n*/eq, lg, /*n*/eq)
|
|
|
|
|
CMP(ge, /*n*/lt, le, /*n*/gt)
|
|
|
|
|
#undef CMP
|
|
|
|
|
#undef CMP2
|
|
|
|
|
#define ORD_TEST(sz) \
|
|
|
|
|
case aco_opcode::v_cmp_u_f##sz: \
|
|
|
|
|
info->f32 = aco_opcode::v_cmp_u_f32; \
|
2022-07-27 12:43:31 +02:00
|
|
|
info->swapped = aco_opcode::v_cmp_u_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->inverse = aco_opcode::v_cmp_o_f##sz; \
|
2022-07-27 12:27:07 +02:00
|
|
|
info->vcmpx = aco_opcode::v_cmpx_u_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->size = sz; \
|
|
|
|
|
return true; \
|
|
|
|
|
case aco_opcode::v_cmp_o_f##sz: \
|
|
|
|
|
info->f32 = aco_opcode::v_cmp_o_f32; \
|
2022-07-27 12:43:31 +02:00
|
|
|
info->swapped = aco_opcode::v_cmp_o_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->inverse = aco_opcode::v_cmp_u_f##sz; \
|
2022-07-27 12:27:07 +02:00
|
|
|
info->vcmpx = aco_opcode::v_cmpx_o_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->size = sz; \
|
|
|
|
|
return true;
|
|
|
|
|
ORD_TEST(16)
|
|
|
|
|
ORD_TEST(32)
|
|
|
|
|
ORD_TEST(64)
|
|
|
|
|
#undef ORD_TEST
|
2022-07-27 12:03:22 +02:00
|
|
|
#define CMPI2(op, swap, inv, type, sz) \
|
|
|
|
|
case aco_opcode::v_cmp_##op##_##type##sz: \
|
|
|
|
|
info->swapped = aco_opcode::v_cmp_##swap##_##type##sz; \
|
|
|
|
|
info->inverse = aco_opcode::v_cmp_##inv##_##type##sz; \
|
2022-07-27 12:27:07 +02:00
|
|
|
info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz; \
|
2022-07-27 12:03:22 +02:00
|
|
|
info->size = sz; \
|
|
|
|
|
return true;
|
|
|
|
|
#define CMPI(op, swap, inv) \
|
|
|
|
|
CMPI2(op, swap, inv, i, 16) \
|
|
|
|
|
CMPI2(op, swap, inv, u, 16) \
|
|
|
|
|
CMPI2(op, swap, inv, i, 32) \
|
|
|
|
|
CMPI2(op, swap, inv, u, 32) \
|
|
|
|
|
CMPI2(op, swap, inv, i, 64) \
|
|
|
|
|
CMPI2(op, swap, inv, u, 64)
|
|
|
|
|
CMPI(lt, gt, ge)
|
|
|
|
|
CMPI(eq, eq, lg)
|
|
|
|
|
CMPI(le, ge, gt)
|
|
|
|
|
CMPI(gt, lt, le)
|
|
|
|
|
CMPI(lg, lg, eq)
|
|
|
|
|
CMPI(ge, le, lt)
|
|
|
|
|
#undef CMPI
|
|
|
|
|
#undef CMPI2
|
2022-07-27 12:34:27 +02:00
|
|
|
#define CMPCLASS(sz) \
|
|
|
|
|
case aco_opcode::v_cmp_class_f##sz: \
|
|
|
|
|
info->vcmpx = aco_opcode::v_cmpx_class_f##sz; \
|
|
|
|
|
info->size = sz; \
|
|
|
|
|
return true;
|
|
|
|
|
CMPCLASS(16)
|
|
|
|
|
CMPCLASS(32)
|
|
|
|
|
CMPCLASS(64)
|
|
|
|
|
#undef CMPCLASS
|
2021-07-14 17:22:02 +01:00
|
|
|
// clang-format on
|
|
|
|
|
default: return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Returns the ordered variant recorded for comparison `op`, or
 * aco_opcode::num_opcodes when get_cmp_info() has none.
 */
aco_opcode
get_ordered(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.ordered;
}
|
|
|
|
|
|
|
|
|
|
/* Returns the unordered variant recorded for comparison `op`, or
 * aco_opcode::num_opcodes when get_cmp_info() has none.
 */
aco_opcode
get_unordered(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.unordered;
}
|
|
|
|
|
|
|
|
|
|
/* Returns the inverse comparison recorded for `op`, or
 * aco_opcode::num_opcodes when get_cmp_info() has none.
 */
aco_opcode
get_inverse(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.inverse;
}
|
|
|
|
|
|
2023-01-22 19:50:46 +01:00
|
|
|
/* Returns the swapped counterpart recorded for comparison `op`, or
 * aco_opcode::num_opcodes when get_cmp_info() has none.
 */
aco_opcode
get_swapped(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.swapped;
}
|
|
|
|
|
|
2021-07-14 17:22:02 +01:00
|
|
|
/* Returns the 32-bit float form recorded for comparison `op`, or
 * aco_opcode::num_opcodes when get_cmp_info() has none.
 */
aco_opcode
get_f32_cmp(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.f32;
}
|
|
|
|
|
|
2022-07-27 12:27:07 +02:00
|
|
|
/* Returns the v_cmpx_* opcode recorded for comparison `op`, or
 * aco_opcode::num_opcodes when get_cmp_info() has none.
 */
aco_opcode
get_vcmpx(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.vcmpx;
}
|
|
|
|
|
|
2021-07-14 17:22:02 +01:00
|
|
|
unsigned
|
|
|
|
|
get_cmp_bitsize(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) ? info.size : 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
2022-08-15 17:01:06 +01:00
|
|
|
is_fp_cmp(aco_opcode op)
|
2021-07-14 17:22:02 +01:00
|
|
|
{
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-15 17:01:52 +01:00
|
|
|
bool
|
|
|
|
|
is_cmpx(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
return !get_cmp_info(op, &info);
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 17:22:02 +01:00
|
|
|
bool
|
2023-04-23 14:55:17 +02:00
|
|
|
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
|
2021-07-14 17:22:02 +01:00
|
|
|
{
|
2023-04-23 14:55:17 +02:00
|
|
|
if (idx0 == idx1) {
|
|
|
|
|
*new_op = instr->opcode;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (idx0 > idx1)
|
|
|
|
|
std::swap(idx0, idx1);
|
|
|
|
|
|
2021-07-14 17:22:02 +01:00
|
|
|
if (instr->isDPP())
|
|
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
|
2021-07-14 17:22:02 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if (instr->isVOPC()) {
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
|
|
|
|
|
*new_op = info.swapped;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* opcodes not relevant for DPP or SGPRs optimizations are not included. */
|
2021-07-14 17:22:02 +01:00
|
|
|
switch (instr->opcode) {
|
2023-04-23 14:55:17 +02:00
|
|
|
case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
|
2021-07-14 17:22:02 +01:00
|
|
|
case aco_opcode::v_add_u32:
|
|
|
|
|
case aco_opcode::v_add_co_u32:
|
|
|
|
|
case aco_opcode::v_add_co_u32_e64:
|
|
|
|
|
case aco_opcode::v_add_i32:
|
2023-04-23 14:55:17 +02:00
|
|
|
case aco_opcode::v_add_i16:
|
|
|
|
|
case aco_opcode::v_add_u16_e64:
|
|
|
|
|
case aco_opcode::v_add3_u32:
|
2021-07-14 17:22:02 +01:00
|
|
|
case aco_opcode::v_add_f16:
|
|
|
|
|
case aco_opcode::v_add_f32:
|
2023-04-23 14:55:17 +02:00
|
|
|
case aco_opcode::v_mul_i32_i24:
|
|
|
|
|
case aco_opcode::v_mul_hi_i32_i24:
|
|
|
|
|
case aco_opcode::v_mul_u32_u24:
|
|
|
|
|
case aco_opcode::v_mul_hi_u32_u24:
|
|
|
|
|
case aco_opcode::v_mul_lo_u16:
|
|
|
|
|
case aco_opcode::v_mul_lo_u16_e64:
|
2021-07-14 17:22:02 +01:00
|
|
|
case aco_opcode::v_mul_f16:
|
|
|
|
|
case aco_opcode::v_mul_f32:
|
2023-04-23 14:55:17 +02:00
|
|
|
case aco_opcode::v_mul_legacy_f32:
|
2021-07-14 17:22:02 +01:00
|
|
|
case aco_opcode::v_or_b32:
|
|
|
|
|
case aco_opcode::v_and_b32:
|
|
|
|
|
case aco_opcode::v_xor_b32:
|
2023-04-23 14:55:17 +02:00
|
|
|
case aco_opcode::v_xnor_b32:
|
|
|
|
|
case aco_opcode::v_xor3_b32:
|
|
|
|
|
case aco_opcode::v_or3_b32:
|
|
|
|
|
case aco_opcode::v_and_b16:
|
|
|
|
|
case aco_opcode::v_or_b16:
|
|
|
|
|
case aco_opcode::v_xor_b16:
|
|
|
|
|
case aco_opcode::v_max3_f32:
|
|
|
|
|
case aco_opcode::v_min3_f32:
|
|
|
|
|
case aco_opcode::v_max3_f16:
|
|
|
|
|
case aco_opcode::v_min3_f16:
|
|
|
|
|
case aco_opcode::v_med3_f16:
|
|
|
|
|
case aco_opcode::v_max3_u32:
|
|
|
|
|
case aco_opcode::v_min3_u32:
|
|
|
|
|
case aco_opcode::v_med3_u32:
|
|
|
|
|
case aco_opcode::v_max3_i32:
|
|
|
|
|
case aco_opcode::v_min3_i32:
|
|
|
|
|
case aco_opcode::v_med3_i32:
|
|
|
|
|
case aco_opcode::v_max3_u16:
|
|
|
|
|
case aco_opcode::v_min3_u16:
|
|
|
|
|
case aco_opcode::v_med3_u16:
|
|
|
|
|
case aco_opcode::v_max3_i16:
|
|
|
|
|
case aco_opcode::v_min3_i16:
|
|
|
|
|
case aco_opcode::v_med3_i16:
|
2021-07-14 17:22:02 +01:00
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_max_f32:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_min_f32:
|
|
|
|
|
case aco_opcode::v_max_i32:
|
|
|
|
|
case aco_opcode::v_min_i32:
|
|
|
|
|
case aco_opcode::v_max_u32:
|
|
|
|
|
case aco_opcode::v_min_u32:
|
|
|
|
|
case aco_opcode::v_max_i16:
|
|
|
|
|
case aco_opcode::v_min_i16:
|
|
|
|
|
case aco_opcode::v_max_u16:
|
|
|
|
|
case aco_opcode::v_min_u16:
|
|
|
|
|
case aco_opcode::v_max_i16_e64:
|
|
|
|
|
case aco_opcode::v_min_i16_e64:
|
|
|
|
|
case aco_opcode::v_max_u16_e64:
|
|
|
|
|
case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
|
|
|
|
|
case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
|
|
|
|
|
case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
|
|
|
|
|
case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
|
|
|
|
|
case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
|
|
|
|
|
case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
|
2023-04-23 14:55:17 +02:00
|
|
|
case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
|
|
|
|
|
case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
|
|
|
|
|
case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
|
|
|
|
|
case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
|
|
|
|
|
case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
|
|
|
|
|
case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
|
|
|
|
|
case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
|
|
|
|
|
case aco_opcode::v_addc_co_u32:
|
|
|
|
|
case aco_opcode::v_mad_i32_i24:
|
|
|
|
|
case aco_opcode::v_mad_u32_u24:
|
|
|
|
|
case aco_opcode::v_lerp_u8:
|
|
|
|
|
case aco_opcode::v_sad_u8:
|
|
|
|
|
case aco_opcode::v_sad_hi_u8:
|
|
|
|
|
case aco_opcode::v_sad_u16:
|
|
|
|
|
case aco_opcode::v_sad_u32:
|
|
|
|
|
case aco_opcode::v_xad_u32:
|
|
|
|
|
case aco_opcode::v_add_lshl_u32:
|
|
|
|
|
case aco_opcode::v_and_or_b32:
|
|
|
|
|
case aco_opcode::v_mad_u16:
|
|
|
|
|
case aco_opcode::v_mad_i16:
|
|
|
|
|
case aco_opcode::v_mad_u32_u16:
|
|
|
|
|
case aco_opcode::v_mad_i32_i16:
|
|
|
|
|
case aco_opcode::v_maxmin_f32:
|
|
|
|
|
case aco_opcode::v_minmax_f32:
|
|
|
|
|
case aco_opcode::v_maxmin_f16:
|
|
|
|
|
case aco_opcode::v_minmax_f16:
|
|
|
|
|
case aco_opcode::v_maxmin_u32:
|
|
|
|
|
case aco_opcode::v_minmax_u32:
|
|
|
|
|
case aco_opcode::v_maxmin_i32:
|
|
|
|
|
case aco_opcode::v_minmax_i32:
|
|
|
|
|
case aco_opcode::v_fma_f32:
|
|
|
|
|
case aco_opcode::v_fma_legacy_f32:
|
|
|
|
|
case aco_opcode::v_fmac_f32:
|
|
|
|
|
case aco_opcode::v_fmac_legacy_f32:
|
|
|
|
|
case aco_opcode::v_mac_f32:
|
|
|
|
|
case aco_opcode::v_mac_legacy_f32:
|
|
|
|
|
case aco_opcode::v_fma_f16:
|
|
|
|
|
case aco_opcode::v_fmac_f16:
|
|
|
|
|
case aco_opcode::v_mac_f16:
|
|
|
|
|
case aco_opcode::v_dot4c_i32_i8:
|
|
|
|
|
case aco_opcode::v_dot2c_f32_f16:
|
|
|
|
|
case aco_opcode::v_dot2_f32_f16:
|
|
|
|
|
case aco_opcode::v_dot2_f32_bf16:
|
|
|
|
|
case aco_opcode::v_dot2_f16_f16:
|
|
|
|
|
case aco_opcode::v_dot2_bf16_bf16:
|
|
|
|
|
case aco_opcode::v_fma_mix_f32:
|
|
|
|
|
case aco_opcode::v_fma_mixlo_f16:
|
|
|
|
|
case aco_opcode::v_fma_mixhi_f16:
|
|
|
|
|
case aco_opcode::v_pk_fmac_f16: {
|
|
|
|
|
if (idx1 == 2)
|
|
|
|
|
return false;
|
|
|
|
|
*new_op = instr->opcode;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
case aco_opcode::v_subb_co_u32: {
|
|
|
|
|
if (idx1 == 2)
|
|
|
|
|
return false;
|
|
|
|
|
*new_op = aco_opcode::v_subbrev_co_u32;
|
|
|
|
|
return true;
|
2021-07-14 17:22:02 +01:00
|
|
|
}
|
2023-04-23 14:55:17 +02:00
|
|
|
case aco_opcode::v_subbrev_co_u32: {
|
|
|
|
|
if (idx1 == 2)
|
|
|
|
|
return false;
|
|
|
|
|
*new_op = aco_opcode::v_subb_co_u32;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
default: return false;
|
2021-07-14 17:22:02 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
/* Default constructor: every counter starts out as unset_counter. */
wait_imm::wait_imm()
   : vm(unset_counter),
     exp(unset_counter),
     lgkm(unset_counter),
     vs(unset_counter)
{}
|
|
|
|
|
/* Constructs a wait_imm from explicit values for the four counters. */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
   : vm(vm_),
     exp(exp_),
     lgkm(lgkm_),
     vs(vs_)
{}
|
2021-01-27 16:27:38 +00:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
/* Decodes the vm/exp/lgkm counters from a packed immediate laid out the way
 * pack() produces it for `gfx_level`. A field holding its maximum value is
 * stored as unset_counter; vs is not part of the immediate and is always
 * unset.
 */
wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
{
   if (gfx_level == GFX11) {
      /* GFX11 layout: vm in bits 15:10, lgkm in bits 9:4, exp in bits 2:0. */
      vm = (packed >> 10) & 0x3f;
      lgkm = (packed >> 4) & 0x3f;
      exp = packed & 0x7;
   } else {
      /* vm: low four bits in 3:0, plus two more in 15:14 from GFX9 on. */
      vm = packed & 0xf;
      if (gfx_level >= GFX9)
         vm |= (packed >> 10) & 0x30;

      /* exp: bits 6:4. */
      exp = (packed >> 4) & 0x7;

      /* lgkm: low four bits in 11:8, plus two more in 13:12 from GFX10 on. */
      lgkm = (packed >> 8) & 0xf;
      if (gfx_level >= GFX10)
         lgkm |= (packed >> 8) & 0x30;
   }

   /* A saturated field stands for "unset". */
   const int vm_saturated = gfx_level >= GFX9 ? 0x3f : 0xf;
   const int lgkm_saturated = gfx_level >= GFX10 ? 0x3f : 0xf;

   if (vm == vm_saturated)
      vm = wait_imm::unset_counter;
   if (exp == 0x7)
      exp = wait_imm::unset_counter;
   if (lgkm == lgkm_saturated)
      lgkm = wait_imm::unset_counter;
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
uint16_t
|
2022-05-12 02:50:17 -04:00
|
|
|
wait_imm::pack(enum amd_gfx_level gfx_level) const
|
2021-01-27 16:27:38 +00:00
|
|
|
{
|
|
|
|
|
uint16_t imm = 0;
|
|
|
|
|
assert(exp == unset_counter || exp <= 0x7);
|
2022-05-12 02:50:17 -04:00
|
|
|
switch (gfx_level) {
|
2022-05-06 11:38:43 +02:00
|
|
|
case GFX11:
|
|
|
|
|
assert(lgkm == unset_counter || lgkm <= 0x3f);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0x3f);
|
|
|
|
|
imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
|
|
|
|
|
break;
|
2021-01-27 16:27:38 +00:00
|
|
|
case GFX10:
|
|
|
|
|
case GFX10_3:
|
|
|
|
|
assert(lgkm == unset_counter || lgkm <= 0x3f);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0x3f);
|
|
|
|
|
imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
|
|
|
|
break;
|
|
|
|
|
case GFX9:
|
|
|
|
|
assert(lgkm == unset_counter || lgkm <= 0xf);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0x3f);
|
|
|
|
|
imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
assert(lgkm == unset_counter || lgkm <= 0xf);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0xf);
|
|
|
|
|
imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
|
|
|
|
break;
|
|
|
|
|
}
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
|
2021-06-09 10:14:54 +02:00
|
|
|
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
|
|
|
|
|
architecture when interpreting the immediate */
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
|
2021-06-09 10:14:54 +02:00
|
|
|
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
|
|
|
|
|
architecture when interpreting the immediate */
|
2021-01-27 16:27:38 +00:00
|
|
|
return imm;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
wait_imm::combine(const wait_imm& other)
|
2021-01-27 16:27:38 +00:00
|
|
|
{
|
|
|
|
|
bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
|
|
|
|
|
vm = std::min(vm, other.vm);
|
|
|
|
|
exp = std::min(exp, other.exp);
|
|
|
|
|
lgkm = std::min(lgkm, other.lgkm);
|
|
|
|
|
vs = std::min(vs, other.vs);
|
|
|
|
|
return changed;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
wait_imm::empty() const
|
2021-01-27 16:27:38 +00:00
|
|
|
{
|
2021-06-09 10:14:54 +02:00
|
|
|
return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
|
|
|
|
|
vs == unset_counter;
|
2021-01-27 16:27:38 +00:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
should_form_clause(const Instruction* a, const Instruction* b)
|
2020-11-30 17:53:23 +00:00
|
|
|
{
|
2023-06-07 17:00:12 +01:00
|
|
|
if (a->definitions.empty() != b->definitions.empty())
|
|
|
|
|
return false;
|
|
|
|
|
|
2020-11-30 17:53:23 +00:00
|
|
|
if (a->format != b->format)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Assume loads which don't use descriptors might load from similar addresses. */
|
|
|
|
|
if (a->isFlatLike())
|
|
|
|
|
return true;
|
|
|
|
|
if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* If they load from the same descriptor, assume they might load from similar
|
|
|
|
|
* addresses.
|
|
|
|
|
*/
|
|
|
|
|
if (a->isVMEM() || a->isSMEM())
|
|
|
|
|
return a->operands[0].tempId() == b->operands[0].tempId();
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-12 15:50:57 +01:00
|
|
|
int
|
|
|
|
|
get_op_fixed_to_def(Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_writelane_b32_e64 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_dot4c_i32_i8) {
|
|
|
|
|
return 2;
|
|
|
|
|
} else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 ||
|
|
|
|
|
instr->opcode == aco_opcode::s_cmovk_i32) {
|
|
|
|
|
return 0;
|
|
|
|
|
} else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) {
|
|
|
|
|
return 3;
|
|
|
|
|
} else if (instr->isMIMG() && instr->definitions.size() == 1 &&
|
|
|
|
|
!instr->operands[2].isUndefined()) {
|
|
|
|
|
return 2;
|
|
|
|
|
}
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
2022-07-21 15:54:26 +01:00
|
|
|
bool
|
|
|
|
|
dealloc_vgprs(Program* program)
|
|
|
|
|
{
|
|
|
|
|
if (program->gfx_level < GFX11)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* skip if deallocating VGPRs won't increase occupancy */
|
|
|
|
|
uint16_t max_waves = program->dev.max_wave64_per_simd * (64 / program->wave_size);
|
|
|
|
|
max_waves = max_suitable_waves(program, max_waves);
|
|
|
|
|
if (program->max_reg_demand.vgpr <= get_addr_vgpr_from_waves(program, max_waves))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
Block& block = program->blocks.back();
|
|
|
|
|
|
|
|
|
|
/* don't bother checking if there is a pending VMEM store or export: there almost always is */
|
|
|
|
|
Builder bld(program);
|
|
|
|
|
if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
|
|
|
|
|
bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
|
|
|
|
|
bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-14 18:57:08 +00:00
|
|
|
bool
|
|
|
|
|
Instruction::isTrans() const noexcept
|
|
|
|
|
{
|
|
|
|
|
return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
|
|
|
|
|
instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
} // namespace aco
|