2020-06-03 11:27:55 +01:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2020 Valve Corporation
|
|
|
|
|
*
|
2024-04-08 09:02:30 +02:00
|
|
|
* SPDX-License-Identifier: MIT
|
2020-06-03 11:27:55 +01:00
|
|
|
*/
|
2021-06-09 15:40:03 +02:00
|
|
|
|
2020-06-03 11:27:55 +01:00
|
|
|
#include "aco_ir.h"
|
2021-06-09 15:40:03 +02:00
|
|
|
|
2021-07-14 17:11:44 +01:00
|
|
|
#include "aco_builder.h"
|
2025-11-27 10:04:54 +01:00
|
|
|
#include "aco_shader_info.h"
|
2021-07-14 17:11:44 +01:00
|
|
|
|
2022-09-13 12:49:56 +03:00
|
|
|
#include "util/u_debug.h"
|
2020-06-03 11:27:55 +01:00
|
|
|
|
2021-06-09 15:40:03 +02:00
|
|
|
#include "c11/threads.h"
|
|
|
|
|
|
2025-02-17 18:42:48 +01:00
|
|
|
#include "ac_descriptors.h"
|
|
|
|
|
#include "amdgfxregs.h"
|
|
|
|
|
|
2020-06-03 11:27:55 +01:00
|
|
|
namespace aco {
|
|
|
|
|
|
2022-10-21 11:09:46 +02:00
|
|
|
thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;
|
2022-08-15 21:42:19 +02:00
|
|
|
|
2020-01-22 19:57:20 +00:00
|
|
|
uint64_t debug_flags = 0;
|
|
|
|
|
|
2023-10-12 10:52:45 +02:00
|
|
|
static const struct debug_control aco_debug_options[] = {
|
|
|
|
|
{"validateir", DEBUG_VALIDATE_IR},
|
|
|
|
|
{"validatera", DEBUG_VALIDATE_RA},
|
2024-07-04 16:07:39 +02:00
|
|
|
{"validate-livevars", DEBUG_VALIDATE_LIVE_VARS},
|
2024-07-30 10:45:13 +02:00
|
|
|
{"validateopt", DEBUG_VALIDATE_OPT},
|
2024-08-09 15:20:16 +02:00
|
|
|
{"novalidate", DEBUG_NO_VALIDATE},
|
2023-10-12 10:52:45 +02:00
|
|
|
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
|
|
|
|
|
{"force-waitdeps", DEBUG_FORCE_WAITDEPS},
|
|
|
|
|
{"novn", DEBUG_NO_VN},
|
|
|
|
|
{"noopt", DEBUG_NO_OPT},
|
2024-01-18 11:54:53 +00:00
|
|
|
{"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
|
2023-10-12 10:52:45 +02:00
|
|
|
{"nosched-ilp", DEBUG_NO_SCHED_ILP},
|
2024-01-18 11:54:53 +00:00
|
|
|
{"nosched-vopd", DEBUG_NO_SCHED_VOPD},
|
2023-10-12 10:52:45 +02:00
|
|
|
{"perfinfo", DEBUG_PERF_INFO},
|
|
|
|
|
{"liveinfo", DEBUG_LIVE_INFO},
|
|
|
|
|
{NULL, 0}};
|
2020-01-22 19:57:20 +00:00
|
|
|
|
|
|
|
|
static once_flag init_once_flag = ONCE_FLAG_INIT;
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
init_once()
|
|
|
|
|
{
|
mesa: replace most occurrences of getenv() with os_get_option()
The standard way to query options in mesa is `os_get_option()` which
abstracts platform-specific mechanisms to get config variables.
However in quite a few places `getenv()` is still used and this may
preclude controlling some options on some systems.
For instance it is not generally possible to use `MESA_DEBUG` on
Android.
So replace most `getenv()` occurrences with `os_get_option()` to
support configuration options more consistently across different
platforms.
Do the same with `secure_getenv()` replacing it with
`os_get_option_secure()`.
The bulk of the proposed changes are mechanically performed by the
following script:
-----------------------------------------------------------------------
#!/bin/sh
set -e
replace() {
# Don't replace in some files, for example where `os_get_option` is defined,
# or in external files
EXCLUDE_FILES_PATTERN='(src/util/os_misc.c|src/util/u_debug.h|src/gtest/include/gtest/internal/gtest-port.h)'
# Don't replace some "system" variables
EXCLUDE_VARS_PATTERN='("XDG|"DISPLAY|"HOME|"TMPDIR|"POSIXLY_CORRECT)'
git grep "[=!( ]$1(" -- src/ | cut -d ':' -f 1 | sort | uniq | \
grep -v -E "$EXCLUDE_FILES_PATTERN" | \
while read -r file;
do
# Don't replace usages of XDG_* variables or HOME
sed -E -e "/$EXCLUDE_VARS_PATTERN/!s/([=!\( ])$1\(/\1$2\(/g" -i "$file";
done
}
# Add const to os_get_option results, to avoid warning about discarded qualifier:
# warning: initialization discards ‘const’ qualifier from pointer target type [-Wdiscarded-qualifiers]
# but also errors in some cases:
# error: invalid conversion from ‘const char*’ to ‘char*’ [-fpermissive]
add_const_results() {
git grep -l -P '(?<!const )char.*os_get_option' | \
while read -r file;
do
sed -e '/^\s*const/! s/\(char.*os_get_option\)/const \1/g' -i "$file"
done
}
replace 'secure_getenv' 'os_get_option_secure'
replace 'getenv' 'os_get_option'
add_const_results
-----------------------------------------------------------------------
After this, the `#include "util/os_misc.h"` is also added in files where
`os_get_option()` was not used before.
And since the replacements from the script above generated some new
`-Wdiscarded-qualifiers` warnings, those have been addressed as well,
generally by declaring `os_get_option()` results as `const char *` and
adjusting some function declarations.
Finally some replacements caused new errors like:
-----------------------------------------------------------------------
../src/gallium/auxiliary/gallivm/lp_bld_misc.cpp:127:31: error: no matching function for call to 'strtok'
127 | for (n = 0, option = strtok(env_llc_options, " "); option; n++, option = strtok(NULL, " ")) {
| ^~~~~~
/android-ndk-r27c/toolchains/llvm/prebuilt/linux-x86_64/bin/../sysroot/usr/include/string.h:124:17: note: candidate function not viable: 1st argument ('const char *') would lose const qualifier
124 | char* _Nullable strtok(char* _Nullable __s, const char* _Nonnull __delimiter);
| ^ ~~~~~~~~~~~~~~~~~~~
-----------------------------------------------------------------------
Those have been addressed too, copying the const string returned by
`os_get_option()` so that it could be modified.
In particular, the error above has been fixed by copying the `const
char *env_llc_options` variable in
`src/gallium/auxiliary/gallivm/lp_bld_misc.cpp` to a `char *` which can
be tokenized using `strtok()`.
Reviewed-by: Eric Engestrom <eric@igalia.com>
Reviewed-by: Yonggang Luo <luoyonggang@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38128>
2025-10-20 20:02:49 +08:00
|
|
|
debug_flags = parse_debug_string(os_get_option("ACO_DEBUG"), aco_debug_options);
|
2020-01-22 19:57:20 +00:00
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
|
/* enable some flags by default on debug builds */
|
2024-08-09 15:20:16 +02:00
|
|
|
if (!(debug_flags & aco::DEBUG_NO_VALIDATE)) {
|
2024-07-30 10:45:13 +02:00
|
|
|
debug_flags |= aco::DEBUG_VALIDATE_IR | DEBUG_VALIDATE_OPT;
|
2024-08-09 15:20:16 +02:00
|
|
|
}
|
2020-01-22 19:57:20 +00:00
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
init()
|
|
|
|
|
{
|
|
|
|
|
call_once(&init_once_flag, init_once);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2022-05-05 13:34:41 +10:00
|
|
|
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
|
2025-11-27 10:04:54 +01:00
|
|
|
const aco_compiler_options* options, ac_shader_config* config)
|
2020-01-22 19:57:20 +00:00
|
|
|
{
|
2025-11-27 10:04:54 +01:00
|
|
|
assert(options->family != CHIP_UNKNOWN);
|
2022-10-21 11:09:46 +02:00
|
|
|
instruction_buffer = &program->m;
|
2020-01-22 19:57:20 +00:00
|
|
|
program->stage = stage;
|
|
|
|
|
program->config = config;
|
2022-05-05 13:34:41 +10:00
|
|
|
program->info = *info;
|
2025-11-27 10:04:54 +01:00
|
|
|
program->gfx_level = options->gfx_level;
|
2020-01-22 19:57:20 +00:00
|
|
|
program->wave_size = info->wave_size;
|
|
|
|
|
program->lane_mask = program->wave_size == 32 ? s1 : s2;
|
|
|
|
|
|
2023-03-15 11:59:41 -07:00
|
|
|
/* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
|
2025-11-27 10:04:54 +01:00
|
|
|
program->dev.lds_limit = program->gfx_level >= GFX7 ? 65536 : 32768;
|
2025-11-27 11:53:39 +01:00
|
|
|
program->dev.has_16bank_lds = options->cu_info->has_lds_bank_count_16;
|
2020-01-22 19:57:20 +00:00
|
|
|
|
2025-11-27 10:05:34 +01:00
|
|
|
program->dev.max_waves_per_simd = options->cu_info->max_waves_per_simd;
|
|
|
|
|
program->dev.simd_per_cu = options->cu_info->num_simd_per_compute_unit;
|
|
|
|
|
program->dev.physical_sgprs = options->cu_info->num_physical_sgprs_per_simd;
|
|
|
|
|
program->dev.sgpr_alloc_granule = options->cu_info->sgpr_alloc_granularity;
|
|
|
|
|
program->dev.sgpr_limit = options->cu_info->max_sgpr_alloc;
|
|
|
|
|
program->dev.physical_vgprs = options->cu_info->num_physical_wave64_vgprs_per_simd;
|
|
|
|
|
program->dev.vgpr_alloc_granule = options->cu_info->wave64_vgpr_alloc_granularity;
|
|
|
|
|
program->dev.vgpr_limit = options->cu_info->max_vgpr_alloc;
|
|
|
|
|
|
|
|
|
|
if (program->wave_size == 32) {
|
|
|
|
|
program->dev.physical_vgprs *= 2;
|
|
|
|
|
program->dev.vgpr_alloc_granule *= 2;
|
2020-01-22 19:57:20 +00:00
|
|
|
}
|
|
|
|
|
|
2025-09-24 15:59:33 +02:00
|
|
|
if (program->stage == raytracing_cs) {
|
|
|
|
|
unsigned vgpr_limit = util_align_npot(128, program->dev.vgpr_alloc_granule);
|
|
|
|
|
unsigned min_waves = program->dev.physical_vgprs / vgpr_limit;
|
|
|
|
|
vgpr_limit = program->dev.physical_vgprs / min_waves;
|
|
|
|
|
program->dev.vgpr_limit = util_round_down_npot(vgpr_limit, program->dev.vgpr_alloc_granule);
|
|
|
|
|
}
|
2025-02-17 18:42:47 +01:00
|
|
|
|
2025-11-27 10:04:54 +01:00
|
|
|
program->dev.scratch_alloc_granule = program->gfx_level >= GFX11 ? 256 : 1024;
|
2023-01-05 13:58:02 +00:00
|
|
|
|
2025-11-27 11:58:08 +01:00
|
|
|
/* XNACK replay can be used for demand paging and page migration.
|
|
|
|
|
* This is only relevant to GPGPU programming with unified shared memory.
|
|
|
|
|
*/
|
|
|
|
|
program->dev.xnack_enabled = false;
|
2021-01-28 13:07:11 +00:00
|
|
|
|
2025-11-27 11:53:39 +01:00
|
|
|
program->dev.sram_ecc_enabled = options->cu_info->has_sram_ecc_enabled;
|
2025-11-27 13:27:00 +01:00
|
|
|
program->dev.has_point_sample_accel = options->cu_info->has_point_sample_accel;
|
2025-11-27 14:41:22 +01:00
|
|
|
program->dev.has_gfx6_mrt_export_bug = options->cu_info->has_gfx6_mrt_export_bug;
|
2025-11-27 11:53:39 +01:00
|
|
|
|
|
|
|
|
program->dev.has_fast_fma32 = options->cu_info->has_fast_fma32;
|
2024-05-17 19:14:37 +01:00
|
|
|
program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level == GFX10;
|
|
|
|
|
program->dev.has_fmac_legacy32 = program->gfx_level >= GFX10_3 && program->gfx_level < GFX12;
|
2025-11-27 11:53:39 +01:00
|
|
|
program->dev.fused_mad_mix = options->cu_info->has_fma_mix;
|
2025-11-27 12:53:52 +01:00
|
|
|
program->dev.has_mad32 = options->cu_info->has_mad32;
|
2022-01-27 14:00:38 +00:00
|
|
|
|
2025-04-23 17:06:31 +01:00
|
|
|
if (program->gfx_level >= GFX12) {
|
|
|
|
|
program->dev.scratch_global_offset_min = -8388608;
|
|
|
|
|
program->dev.scratch_global_offset_max = 8388607;
|
|
|
|
|
} else if (program->gfx_level >= GFX11) {
|
2022-05-19 15:18:36 +01:00
|
|
|
program->dev.scratch_global_offset_min = -4096;
|
|
|
|
|
program->dev.scratch_global_offset_max = 4095;
|
|
|
|
|
} else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
|
|
|
|
|
program->dev.scratch_global_offset_min = -2048;
|
|
|
|
|
program->dev.scratch_global_offset_max = 2047;
|
|
|
|
|
} else if (program->gfx_level == GFX9) {
|
|
|
|
|
/* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
|
|
|
|
|
program->dev.scratch_global_offset_min = 0;
|
|
|
|
|
program->dev.scratch_global_offset_max = 4095;
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-23 17:01:48 +01:00
|
|
|
if (program->gfx_level >= GFX12)
|
|
|
|
|
program->dev.buf_offset_max = 0x7fffff;
|
|
|
|
|
else
|
|
|
|
|
program->dev.buf_offset_max = 0xfff;
|
|
|
|
|
|
2025-04-28 12:10:26 +01:00
|
|
|
if (program->gfx_level >= GFX12)
|
|
|
|
|
program->dev.smem_offset_max = 0x7fffff;
|
|
|
|
|
else if (program->gfx_level >= GFX8)
|
aco: increase max_const_offset_plus_one for SMEM load_global
fossil-db (gfx1201):
Totals from 1115 (1.40% of 79377) affected shaders:
Instrs: 1473805 -> 1467571 (-0.42%); split: -0.43%, +0.01%
CodeSize: 7852972 -> 7819656 (-0.42%); split: -0.44%, +0.02%
SpillSGPRs: 1632 -> 1460 (-10.54%); split: -11.27%, +0.74%
Latency: 11975762 -> 11971915 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 2496961 -> 2496448 (-0.02%); split: -0.03%, +0.01%
VClause: 25213 -> 25218 (+0.02%); split: -0.00%, +0.02%
SClause: 28822 -> 28565 (-0.89%); split: -1.41%, +0.52%
Copies: 106377 -> 105715 (-0.62%); split: -1.23%, +0.61%
Branches: 27497 -> 27473 (-0.09%)
PreSGPRs: 52071 -> 51310 (-1.46%)
VALU: 871051 -> 870694 (-0.04%); split: -0.04%, +0.00%
SALU: 186090 -> 181811 (-2.30%); split: -2.32%, +0.02%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34730>
2025-04-23 16:41:53 +01:00
|
|
|
program->dev.smem_offset_max = 0xfffff;
|
|
|
|
|
else if (program->gfx_level >= GFX7)
|
|
|
|
|
program->dev.smem_offset_max = 0xffffffff;
|
|
|
|
|
else if (program->gfx_level >= GFX6)
|
|
|
|
|
program->dev.smem_offset_max = 0x3ff;
|
|
|
|
|
|
2024-05-23 11:43:06 +01:00
|
|
|
if (program->gfx_level >= GFX12) {
|
|
|
|
|
/* Same as GFX11, except one less for VSAMPLE. */
|
|
|
|
|
program->dev.max_nsa_vgprs = 3;
|
|
|
|
|
} else if (program->gfx_level >= GFX11) {
|
2023-03-21 14:44:09 +00:00
|
|
|
/* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
|
|
|
|
|
* rest of the address.
|
|
|
|
|
*/
|
|
|
|
|
program->dev.max_nsa_vgprs = 4;
|
|
|
|
|
} else if (program->gfx_level >= GFX10_3) {
|
|
|
|
|
/* GFX10.3 can have up to 3 NSA dwords. */
|
|
|
|
|
program->dev.max_nsa_vgprs = 13;
|
|
|
|
|
} else if (program->gfx_level >= GFX10) {
|
|
|
|
|
/* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
|
|
|
|
|
program->dev.max_nsa_vgprs = 5;
|
|
|
|
|
} else {
|
|
|
|
|
program->dev.max_nsa_vgprs = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-27 10:04:54 +01:00
|
|
|
program->wgp_mode = options->wgp_mode;
|
2021-01-28 11:07:26 +00:00
|
|
|
|
2021-04-20 17:35:41 +01:00
|
|
|
program->progress = CompilationProgress::after_isel;
|
|
|
|
|
|
2020-01-22 19:57:20 +00:00
|
|
|
program->next_fp_mode.must_flush_denorms32 = false;
|
|
|
|
|
program->next_fp_mode.must_flush_denorms16_64 = false;
|
|
|
|
|
program->next_fp_mode.care_about_round32 = false;
|
|
|
|
|
program->next_fp_mode.care_about_round16_64 = false;
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
program->next_fp_mode.denorm32 = 0;
|
|
|
|
|
program->next_fp_mode.round16_64 = fp_round_ne;
|
|
|
|
|
program->next_fp_mode.round32 = fp_round_ne;
|
2025-06-30 16:11:42 +02:00
|
|
|
program->needs_fp_mode_insertion = false;
|
2020-01-22 19:57:20 +00:00
|
|
|
}
|
|
|
|
|
|
2024-05-27 14:23:38 +01:00
|
|
|
bool
|
|
|
|
|
is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
return instr->opcode == aco_opcode::s_wait_event &&
|
|
|
|
|
(gfx_level >= GFX12 ? (instr->salu().imm & wait_event_imm_wait_export_ready_gfx12)
|
|
|
|
|
: !(instr->salu().imm & wait_event_imm_dont_wait_export_ready_gfx11));
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-22 15:11:29 +01:00
|
|
|
static bool
|
|
|
|
|
is_done_sendmsg(amd_gfx_level gfx_level, const Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (gfx_level <= GFX10_3 && instr->opcode == aco_opcode::s_sendmsg)
|
|
|
|
|
return (instr->salu().imm & sendmsg_id_mask) == sendmsg_gs_done;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static bool
|
|
|
|
|
is_pos_prim_export(amd_gfx_level gfx_level, const Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
/* Because of NO_PC_EXPORT=1, a done=1 position or primitive export can launch PS waves before
|
|
|
|
|
* the NGG/VS wave finishes if there are no parameter exports.
|
|
|
|
|
*/
|
|
|
|
|
return gfx_level >= GFX10 && instr->opcode == aco_opcode::exp &&
|
|
|
|
|
instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest <= V_008DFC_SQ_EXP_PRIM;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-09 10:03:59 +01:00
|
|
|
static bool
|
|
|
|
|
is_pops_end_export(Program* program, const Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
return program->gfx_level >= GFX11 && instr->opcode == aco_opcode::exp &&
|
|
|
|
|
instr->exp().dest <= V_008DFC_SQ_EXP_NULL && program->has_pops_overlapped_waves_wait;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static bool
|
|
|
|
|
is_ordered_ps_done_sendmsg(const Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
return instr->opcode == aco_opcode::s_sendmsg &&
|
|
|
|
|
(instr->salu().imm & sendmsg_id_mask) == sendmsg_ordered_ps_done;
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-22 15:11:29 +01:00
|
|
|
uint16_t
|
2025-09-09 10:03:59 +01:00
|
|
|
is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sync_info sync,
|
2025-07-22 15:11:29 +01:00
|
|
|
unsigned semantic)
|
|
|
|
|
{
|
|
|
|
|
bool is_acquire = semantic & semantic_acquire;
|
|
|
|
|
bool is_release = semantic & semantic_release;
|
|
|
|
|
|
|
|
|
|
bool is_atomic = sync.semantics & semantic_atomic;
|
|
|
|
|
// TODO: NIR doesn't have any atomic load/store, so we assume any load/store is atomic
|
|
|
|
|
is_atomic |= !(sync.semantics & semantic_private) && sync.storage;
|
|
|
|
|
if (is_atomic) {
|
|
|
|
|
bool is_load = !instr->definitions.empty() || (sync.semantics & semantic_rmw);
|
|
|
|
|
bool is_store = instr->definitions.empty() || (sync.semantics & semantic_rmw);
|
|
|
|
|
return ((is_release && is_store) || (is_acquire && is_load)) ? sync.storage : 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint16_t cls = BITFIELD_MASK(storage_count);
|
2025-09-03 11:20:33 +01:00
|
|
|
if (is_acquire) {
|
|
|
|
|
if (is_wait_export_ready(program->gfx_level, instr) ||
|
|
|
|
|
instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id)
|
|
|
|
|
return cls & ~storage_shared;
|
|
|
|
|
}
|
2025-09-09 10:03:59 +01:00
|
|
|
if (is_release) {
|
|
|
|
|
if (is_done_sendmsg(program->gfx_level, instr) ||
|
|
|
|
|
is_pos_prim_export(program->gfx_level, instr))
|
|
|
|
|
return cls & ~storage_shared;
|
|
|
|
|
|
|
|
|
|
if (is_pops_end_export(program, instr) || is_ordered_ps_done_sendmsg(instr) ||
|
|
|
|
|
instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done)
|
|
|
|
|
return cls & ~storage_shared;
|
|
|
|
|
}
|
2025-07-22 15:11:29 +01:00
|
|
|
return (instr->isBarrier() && instr->barrier().exec_scope > scope_invocation) ? cls : 0;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-26 15:54:22 +01:00
|
|
|
/* Returns the memory synchronization info attached to "instr". A few opcodes get a fixed
 * value; otherwise the "sync" member of the format-specific instruction is returned, or a
 * default-constructed memory_sync_info for formats that are not listed.
 */
memory_sync_info
get_sync_info(const Instruction* instr)
{
   /* Primitive Ordered Pixel Shading barriers necessary for accesses to memory shared between
    * overlapping waves in the queue family.
    */
   switch (instr->opcode) {
   case aco_opcode::p_pops_gfx9_overlapped_wave_wait_done:
   case aco_opcode::s_wait_event:
      return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);
   case aco_opcode::p_pops_gfx9_ordered_section_done:
      return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);
   default: break;
   }

   const Format fmt = instr->format;

   if (fmt == Format::SMEM)
      return instr->smem().sync;
   if (fmt == Format::MUBUF)
      return instr->mubuf().sync;
   if (fmt == Format::MIMG)
      return instr->mimg().sync;
   if (fmt == Format::MTBUF)
      return instr->mtbuf().sync;
   if (fmt == Format::FLAT || fmt == Format::GLOBAL || fmt == Format::SCRATCH)
      return instr->flatlike().sync;
   if (fmt == Format::DS)
      return instr->ds().sync;
   if (fmt == Format::LDSDIR)
      return instr->ldsdir().sync;

   return memory_sync_info();
}
|
|
|
|
|
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
bool
|
2022-05-12 02:50:17 -04:00
|
|
|
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
|
2020-05-11 17:49:40 +01:00
|
|
|
{
|
|
|
|
|
if (!instr->isVALU())
|
|
|
|
|
return false;
|
|
|
|
|
|
2022-05-13 12:01:03 +01:00
|
|
|
if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->isSDWA())
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (instr->isVOP3()) {
|
2023-02-21 20:08:42 +01:00
|
|
|
VALU_instruction& vop3 = instr->valu();
|
2020-05-11 17:49:40 +01:00
|
|
|
if (instr->format == Format::VOP3)
|
|
|
|
|
return false;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (vop3.omod && gfx_level < GFX9)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
// TODO: return true if we know we will use vcc
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (!pre_ra && instr->definitions.size() >= 2)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 1; i < instr->operands.size(); i++) {
|
|
|
|
|
if (instr->operands[i].isLiteral())
|
|
|
|
|
return false;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
aco: use VOPC_SDWA on GFX9+
Totals from 5138 (3.42% of 150170) affected shaders: (GFX10.3)
VGPRs: 409520 -> 409416 (-0.03%); split: -0.03%, +0.00%
CodeSize: 43056360 -> 43035696 (-0.05%); split: -0.06%, +0.02%
MaxWaves: 69296 -> 69310 (+0.02%)
Instrs: 8161016 -> 8153365 (-0.09%); split: -0.10%, +0.01%
Latency: 109397002 -> 109756208 (+0.33%); split: -0.05%, +0.38%
InvThroughput: 23238920 -> 23310761 (+0.31%); split: -0.11%, +0.42%
VClause: 135141 -> 135100 (-0.03%); split: -0.05%, +0.02%
SClause: 349511 -> 349489 (-0.01%); split: -0.01%, +0.00%
Copies: 388107 -> 387754 (-0.09%); split: -0.48%, +0.38%
Branches: 184629 -> 184503 (-0.07%); split: -0.08%, +0.01%
PreSGPRs: 258807 -> 258839 (+0.01%)
PreVGPRs: 372561 -> 372184 (-0.10%); split: -0.10%, +0.00%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12364>
2021-07-07 11:37:49 +02:00
|
|
|
if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
|
2021-06-07 16:56:45 +01:00
|
|
|
return false;
|
|
|
|
|
|
2020-05-11 17:49:40 +01:00
|
|
|
if (!instr->operands.empty()) {
|
|
|
|
|
if (instr->operands[0].isLiteral())
|
|
|
|
|
return false;
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
2021-06-07 16:56:45 +01:00
|
|
|
if (instr->operands[0].bytes() > 4)
|
|
|
|
|
return false;
|
|
|
|
|
if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
|
|
|
|
|
return false;
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
|
|
|
|
|
instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level != GFX8 && is_mac)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
// TODO: return true if we know we will use vcc
|
2022-05-12 02:50:17 -04:00
|
|
|
if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
aco: optimize 32-bit extracts and inserts using SDWA
Still need to use dst_u=preserve field to optimize packs
fossil-db (Sienna Cichlid):
Totals from 15974 (10.66% of 149839) affected shaders:
VGPRs: 1009064 -> 1008968 (-0.01%); split: -0.03%, +0.02%
SpillSGPRs: 7959 -> 7964 (+0.06%)
CodeSize: 101716436 -> 101159568 (-0.55%); split: -0.55%, +0.01%
MaxWaves: 284464 -> 284490 (+0.01%); split: +0.02%, -0.01%
Instrs: 19334216 -> 19224241 (-0.57%); split: -0.57%, +0.00%
Latency: 375465295 -> 375230478 (-0.06%); split: -0.14%, +0.08%
InvThroughput: 79006105 -> 78860705 (-0.18%); split: -0.25%, +0.07%
fossil-db (Polaris):
Totals from 11369 (7.51% of 151365) affected shaders:
SGPRs: 787920 -> 787680 (-0.03%); split: -0.04%, +0.01%
VGPRs: 681056 -> 681040 (-0.00%); split: -0.01%, +0.00%
CodeSize: 68127288 -> 67664120 (-0.68%); split: -0.69%, +0.01%
MaxWaves: 54370 -> 54371 (+0.00%)
Instrs: 13294638 -> 13214109 (-0.61%); split: -0.62%, +0.01%
Latency: 373515759 -> 373214571 (-0.08%); split: -0.11%, +0.03%
InvThroughput: 166529524 -> 166275291 (-0.15%); split: -0.20%, +0.05%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3151>
2020-08-12 14:23:56 +01:00
|
|
|
if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
|
2020-05-11 17:49:40 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
|
2022-10-23 00:37:43 +02:00
|
|
|
instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
|
2020-05-11 17:49:40 +01:00
|
|
|
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
|
|
|
|
|
aco_ptr<Instruction>
|
2022-05-12 02:50:17 -04:00
|
|
|
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
|
2020-05-11 17:49:40 +01:00
|
|
|
{
|
|
|
|
|
if (instr->isSDWA())
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
aco_ptr<Instruction> tmp = std::move(instr);
|
2023-05-13 11:40:35 +02:00
|
|
|
Format format = asSDWA(withoutVOP3(tmp->format));
|
2024-03-25 15:55:27 +01:00
|
|
|
instr.reset(
|
|
|
|
|
create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
|
2020-05-11 17:49:40 +01:00
|
|
|
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
|
|
|
|
|
std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
|
|
|
|
|
|
2021-01-21 16:13:34 +00:00
|
|
|
SDWA_instruction& sdwa = instr->sdwa();
|
2020-05-11 17:49:40 +01:00
|
|
|
|
|
|
|
|
if (tmp->isVOP3()) {
|
2023-02-21 20:08:42 +01:00
|
|
|
VALU_instruction& vop3 = tmp->valu();
|
2023-03-07 13:53:07 +01:00
|
|
|
sdwa.neg = vop3.neg;
|
2023-03-27 23:32:54 +02:00
|
|
|
sdwa.abs = vop3.abs;
|
2021-01-21 16:13:34 +00:00
|
|
|
sdwa.omod = vop3.omod;
|
|
|
|
|
sdwa.clamp = vop3.clamp;
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
2020-08-22 20:45:54 +02:00
|
|
|
/* SDWA only uses operands 0 and 1. */
|
|
|
|
|
if (i >= 2)
|
|
|
|
|
break;
|
|
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
|
2020-05-11 17:49:40 +01:00
|
|
|
}
|
|
|
|
|
|
2021-08-30 17:58:36 +02:00
|
|
|
sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);
|
|
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
|
2024-09-11 11:11:42 +02:00
|
|
|
instr->definitions[0].setPrecolored(vcc);
|
2020-05-11 17:49:40 +01:00
|
|
|
if (instr->definitions.size() >= 2)
|
2024-09-11 11:11:42 +02:00
|
|
|
instr->definitions[1].setPrecolored(vcc);
|
2020-05-11 17:49:40 +01:00
|
|
|
if (instr->operands.size() >= 3)
|
2024-09-11 11:11:42 +02:00
|
|
|
instr->operands[2].setPrecolored(vcc);
|
2020-05-11 17:49:40 +01:00
|
|
|
|
2022-01-28 14:49:50 +00:00
|
|
|
instr->pass_flags = tmp->pass_flags;
|
|
|
|
|
|
2020-05-11 17:49:40 +01:00
|
|
|
return tmp;
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-27 15:12:52 +01:00
|
|
|
bool
|
|
|
|
|
opcode_supports_dpp(amd_gfx_level gfx_level, aco_opcode opcode, bool vop3p)
|
|
|
|
|
{
|
|
|
|
|
switch (opcode) {
|
2026-01-27 15:27:10 +01:00
|
|
|
/* reverse integer subtract and shift seem to apply dpp to src1 instead of src0 */
|
|
|
|
|
case aco_opcode::v_subrev_co_u32:
|
|
|
|
|
case aco_opcode::v_subrev_co_u32_e64:
|
|
|
|
|
case aco_opcode::v_subbrev_co_u32:
|
|
|
|
|
case aco_opcode::v_subrev_u16:
|
|
|
|
|
case aco_opcode::v_subrev_u32:
|
|
|
|
|
case aco_opcode::v_ashrrev_i32:
|
|
|
|
|
case aco_opcode::v_lshrrev_b32:
|
|
|
|
|
case aco_opcode::v_lshlrev_b32:
|
|
|
|
|
case aco_opcode::v_ashrrev_i16:
|
|
|
|
|
case aco_opcode::v_lshrrev_b16:
|
|
|
|
|
case aco_opcode::v_lshlrev_b16:
|
|
|
|
|
case aco_opcode::v_ashrrev_i16_e64:
|
|
|
|
|
case aco_opcode::v_lshrrev_b16_e64:
|
|
|
|
|
case aco_opcode::v_lshlrev_b16_e64: return false;
|
2026-01-27 15:12:52 +01:00
|
|
|
case aco_opcode::v_pk_fmac_f16: return gfx_level < GFX11;
|
|
|
|
|
/* there are more cases but those all take 64-bit inputs */
|
|
|
|
|
case aco_opcode::v_madmk_f32:
|
|
|
|
|
case aco_opcode::v_madak_f32:
|
|
|
|
|
case aco_opcode::v_madmk_f16:
|
|
|
|
|
case aco_opcode::v_madak_f16:
|
|
|
|
|
case aco_opcode::v_fmamk_f32:
|
|
|
|
|
case aco_opcode::v_fmaak_f32:
|
|
|
|
|
case aco_opcode::v_fmamk_f16:
|
|
|
|
|
case aco_opcode::v_fmaak_f16:
|
|
|
|
|
case aco_opcode::v_readfirstlane_b32:
|
|
|
|
|
case aco_opcode::v_cvt_f64_i32:
|
|
|
|
|
case aco_opcode::v_cvt_f64_f32:
|
|
|
|
|
case aco_opcode::v_cvt_f64_u32:
|
|
|
|
|
case aco_opcode::v_mul_lo_u32:
|
|
|
|
|
case aco_opcode::v_mul_lo_i32:
|
|
|
|
|
case aco_opcode::v_mul_hi_u32:
|
|
|
|
|
case aco_opcode::v_mul_hi_i32:
|
|
|
|
|
case aco_opcode::v_qsad_pk_u16_u8:
|
|
|
|
|
case aco_opcode::v_mqsad_pk_u16_u8:
|
|
|
|
|
case aco_opcode::v_mqsad_u32_u8:
|
|
|
|
|
case aco_opcode::v_mad_u64_u32:
|
|
|
|
|
case aco_opcode::v_mad_i64_i32:
|
|
|
|
|
case aco_opcode::v_permlane16_b32:
|
|
|
|
|
case aco_opcode::v_permlanex16_b32:
|
|
|
|
|
case aco_opcode::v_permlane64_b32:
|
|
|
|
|
case aco_opcode::v_readlane_b32_e64:
|
|
|
|
|
case aco_opcode::v_writelane_b32_e64: return false;
|
|
|
|
|
/* simpler than listing all VOP3P opcodes which do not support DPP */
|
|
|
|
|
case aco_opcode::v_fma_mix_f32:
|
|
|
|
|
case aco_opcode::v_fma_mixlo_f16:
|
|
|
|
|
case aco_opcode::v_fma_mixhi_f16:
|
|
|
|
|
case aco_opcode::p_v_fma_mixlo_f16_rtz:
|
|
|
|
|
case aco_opcode::p_v_fma_mixhi_f16_rtz:
|
|
|
|
|
case aco_opcode::v_dot2_f32_f16:
|
|
|
|
|
case aco_opcode::v_dot2_f32_bf16: return gfx_level >= GFX11;
|
|
|
|
|
default: return !vop3p;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 17:11:44 +01:00
|
|
|
bool
|
2023-04-23 14:55:17 +02:00
|
|
|
can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
|
2021-07-14 17:11:44 +01:00
|
|
|
{
|
|
|
|
|
assert(instr->isVALU() && !instr->operands.empty());
|
|
|
|
|
|
|
|
|
|
if (instr->isDPP())
|
2021-11-29 00:12:04 +09:00
|
|
|
return instr->isDPP8() == dpp8;
|
2021-07-14 17:11:44 +01:00
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if (instr->isSDWA() || instr->isVINTERP_INREG())
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
|
|
|
|
|
instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
|
|
|
|
|
instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
|
|
|
|
|
gfx_level < GFX11)
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
|
|
|
|
|
2023-04-23 14:55:17 +02:00
|
|
|
if (instr->isVOP3() && gfx_level < GFX11) {
|
2023-02-21 20:08:42 +01:00
|
|
|
const VALU_instruction* vop3 = &instr->valu();
|
2023-04-23 14:55:17 +02:00
|
|
|
if (vop3->clamp || vop3->omod)
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
2021-11-29 00:12:04 +09:00
|
|
|
if (dpp8)
|
|
|
|
|
return false;
|
2023-04-23 14:55:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
|
|
|
if (instr->operands[i].isLiteral())
|
2021-07-14 17:11:44 +01:00
|
|
|
return false;
|
aco: apply DPP with scalar src1 on gfx11.5+
Foz-DB Navi48:
Totals from 6261 (7.62% of 82179) affected shaders:
MaxWaves: 176284 -> 176236 (-0.03%); split: +0.01%, -0.03%
Instrs: 5850185 -> 5828451 (-0.37%); split: -0.41%, +0.04%
CodeSize: 31363324 -> 31419904 (+0.18%); split: -0.08%, +0.26%
VGPRs: 328284 -> 328200 (-0.03%); split: -0.07%, +0.05%
SpillSGPRs: 2268 -> 2256 (-0.53%)
Latency: 50235516 -> 50218816 (-0.03%); split: -0.06%, +0.03%
InvThroughput: 8256243 -> 8242036 (-0.17%); split: -0.22%, +0.05%
VClause: 81000 -> 80975 (-0.03%); split: -0.11%, +0.08%
SClause: 136376 -> 136387 (+0.01%); split: -0.11%, +0.11%
Copies: 414021 -> 417894 (+0.94%); split: -0.13%, +1.07%
Branches: 105301 -> 105298 (-0.00%); split: -0.00%, +0.00%
PreSGPRs: 291360 -> 291432 (+0.02%)
PreVGPRs: 238593 -> 238729 (+0.06%); split: -0.02%, +0.08%
VALU: 3425446 -> 3403463 (-0.64%); split: -0.65%, +0.01%
SALU: 815505 -> 819372 (+0.47%); split: -0.02%, +0.50%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39516>
2026-01-25 16:24:06 +01:00
|
|
|
if (!instr->operands[i].isOfType(RegType::vgpr) &&
|
|
|
|
|
(i == 0 || (i == 1 && gfx_level < GFX11_5)))
|
2021-11-29 16:34:15 +00:00
|
|
|
return false;
|
2021-07-14 17:11:44 +01:00
|
|
|
}
|
|
|
|
|
|
2026-01-27 15:12:52 +01:00
|
|
|
return opcode_supports_dpp(gfx_level, instr->opcode, instr->isVOP3P());
|
2021-07-14 17:11:44 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Replace \p instr with a DPP-encoded copy of itself.
 *
 * The new instruction keeps the opcode, operands, definitions, VALU modifiers
 * and pass_flags of the old one. Its DPP controls are initialised to an
 * identity lane selection with all rows/banks enabled, and fetch_inactive is
 * set on GFX10 and newer.
 *
 * \param gfx_level hardware generation being compiled for
 * \param instr     instruction to convert; on success it points at the new
 *                  DPP instruction
 * \param dpp8      true to create a DPP8 instruction, false for DPP16
 * \return the original (pre-conversion) instruction, or a null pointer if
 *         \p instr was already DPP, in which case it is left untouched.
 */
aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);

   /* Both DPP flavours are created the same way; only the format bit differs. */
   const Format dpp_bit = dpp8 ? Format::DPP8 : Format::DPP16;
   const Format format = (Format)((uint32_t)tmp->format | (uint32_t)dpp_bit);
   instr.reset(
      create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));

   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   const bool fetch_inactive = gfx_level >= GFX10;
   if (dpp8) {
      DPP8_instruction* dpp = &instr->dpp8();
      dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
      dpp->fetch_inactive = fetch_inactive;
   } else {
      DPP16_instruction* dpp = &instr->dpp16();
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp->row_mask = 0xf;
      dpp->bank_mask = 0xf;
      dpp->fetch_inactive = fetch_inactive;
   }

   /* Carry over every VALU modifier from the old instruction. */
   VALU_instruction& new_valu = instr->valu();
   VALU_instruction& old_valu = tmp->valu();
   new_valu.neg = old_valu.neg;
   new_valu.abs = old_valu.abs;
   new_valu.omod = old_valu.omod;
   new_valu.clamp = old_valu.clamp;
   new_valu.opsel = old_valu.opsel;
   new_valu.opsel_lo = old_valu.opsel_lo;
   new_valu.opsel_hi = old_valu.opsel_hi;

   /* Before GFX11 the last definition of VOPC / multi-definition instructions
    * and a scalar third operand are precolored to VCC.
    */
   if (gfx_level < GFX11) {
      if (instr->isVOPC() || instr->definitions.size() > 1)
         instr->definitions.back().setPrecolored(vcc);

      if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr))
         instr->operands[2].setPrecolored(vcc);
   }

   instr->pass_flags = tmp->pass_flags;

   /* DPP16 supports input modifiers, so we might no longer need VOP3. */
   bool remove_vop3 = !dpp8 && !new_valu.omod && !new_valu.clamp &&
                      (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());

   /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
   remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
                  !instr->definitions.back().isFixed() ||
                  instr->definitions.back().physReg() == vcc;

   /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
   remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
                  instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;

   /* scalar src1 needs VOP3. */
   remove_vop3 &= instr->operands.size() < 2 || instr->operands[1].isOfType(RegType::vgpr);

   if (remove_vop3)
      instr->format = withoutVOP3(instr->format);

   return tmp;
}
|
|
|
|
|
|
2023-05-16 17:02:45 +02:00
|
|
|
bool
|
|
|
|
|
can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
|
|
|
|
|
{
|
|
|
|
|
if (op == aco_opcode::v_mov_b32)
|
|
|
|
|
return gfx_level >= GFX10;
|
|
|
|
|
|
2025-04-29 15:55:47 +02:00
|
|
|
return instr_info.alu_opcode_infos[(int)op].input_modifiers & BITFIELD_BIT(idx);
|
2023-05-16 17:02:45 +02:00
|
|
|
}
|
|
|
|
|
|
2020-06-03 11:27:55 +01:00
|
|
|
bool
|
2022-05-12 02:50:17 -04:00
|
|
|
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
|
2020-06-03 11:27:55 +01:00
|
|
|
{
|
|
|
|
|
/* opsel is only GFX9+ */
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9)
|
2020-06-03 11:27:55 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
case aco_opcode::v_div_fixup_f16:
|
|
|
|
|
case aco_opcode::v_fma_f16:
|
|
|
|
|
case aco_opcode::v_mad_f16:
|
|
|
|
|
case aco_opcode::v_mad_u16:
|
|
|
|
|
case aco_opcode::v_mad_i16:
|
|
|
|
|
case aco_opcode::v_med3_f16:
|
|
|
|
|
case aco_opcode::v_med3_i16:
|
|
|
|
|
case aco_opcode::v_med3_u16:
|
|
|
|
|
case aco_opcode::v_min3_f16:
|
|
|
|
|
case aco_opcode::v_min3_i16:
|
|
|
|
|
case aco_opcode::v_min3_u16:
|
|
|
|
|
case aco_opcode::v_max3_f16:
|
|
|
|
|
case aco_opcode::v_max3_i16:
|
|
|
|
|
case aco_opcode::v_max3_u16:
|
2023-01-14 11:38:25 +01:00
|
|
|
case aco_opcode::v_minmax_f16:
|
|
|
|
|
case aco_opcode::v_maxmin_f16:
|
2020-06-03 11:27:55 +01:00
|
|
|
case aco_opcode::v_max_u16_e64:
|
|
|
|
|
case aco_opcode::v_max_i16_e64:
|
|
|
|
|
case aco_opcode::v_min_u16_e64:
|
|
|
|
|
case aco_opcode::v_min_i16_e64:
|
|
|
|
|
case aco_opcode::v_add_i16:
|
|
|
|
|
case aco_opcode::v_sub_i16:
|
|
|
|
|
case aco_opcode::v_add_u16_e64:
|
|
|
|
|
case aco_opcode::v_sub_u16_e64:
|
|
|
|
|
case aco_opcode::v_lshlrev_b16_e64:
|
|
|
|
|
case aco_opcode::v_lshrrev_b16_e64:
|
|
|
|
|
case aco_opcode::v_ashrrev_i16_e64:
|
2023-01-14 11:38:25 +01:00
|
|
|
case aco_opcode::v_and_b16:
|
|
|
|
|
case aco_opcode::v_or_b16:
|
|
|
|
|
case aco_opcode::v_xor_b16:
|
2020-06-03 11:27:55 +01:00
|
|
|
case aco_opcode::v_mul_lo_u16_e64: return true;
|
|
|
|
|
case aco_opcode::v_pack_b32_f16:
|
2020-08-17 11:36:24 +01:00
|
|
|
case aco_opcode::v_cvt_pknorm_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
|
2020-06-03 11:27:55 +01:00
|
|
|
case aco_opcode::v_mad_u32_u16:
|
|
|
|
|
case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
|
2022-06-16 18:15:16 +01:00
|
|
|
case aco_opcode::v_dot2_f16_f16:
|
|
|
|
|
case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
|
2023-01-14 11:38:25 +01:00
|
|
|
case aco_opcode::v_cndmask_b16: return idx != 2;
|
2022-06-17 13:53:08 +01:00
|
|
|
case aco_opcode::v_interp_p10_f16_f32_inreg:
|
|
|
|
|
case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
|
|
|
|
|
case aco_opcode::v_interp_p2_f16_f32_inreg:
|
|
|
|
|
case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
|
2025-04-09 14:17:47 +02:00
|
|
|
case aco_opcode::v_cvt_pk_fp8_f32:
|
|
|
|
|
case aco_opcode::p_v_cvt_pk_fp8_f32_ovfl:
|
|
|
|
|
case aco_opcode::v_cvt_pk_bf8_f32: return idx == -1;
|
2025-06-01 18:55:42 +02:00
|
|
|
case aco_opcode::v_alignbyte_b32:
|
|
|
|
|
case aco_opcode::v_alignbit_b32: return idx == 2;
|
2023-03-21 13:26:19 +01:00
|
|
|
default:
|
|
|
|
|
return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
|
2020-06-03 11:27:55 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-27 16:22:52 +01:00
|
|
|
bool
|
2023-05-06 17:03:22 +02:00
|
|
|
can_write_m0(const aco_ptr<Instruction>& instr)
|
2023-04-27 16:22:52 +01:00
|
|
|
{
|
|
|
|
|
if (instr->isSALU())
|
|
|
|
|
return true;
|
|
|
|
|
|
2023-05-06 17:03:22 +02:00
|
|
|
/* VALU can't write m0 on any GPU generations. */
|
2023-04-27 16:22:52 +01:00
|
|
|
if (instr->isVALU())
|
2023-05-06 17:03:22 +02:00
|
|
|
return false;
|
2023-04-27 16:22:52 +01:00
|
|
|
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::p_parallelcopy:
|
|
|
|
|
case aco_opcode::p_extract:
|
|
|
|
|
case aco_opcode::p_insert:
|
2023-05-06 17:03:22 +02:00
|
|
|
/* These pseudo instructions are implemented with SALU when writing m0. */
|
2023-04-27 16:22:52 +01:00
|
|
|
return true;
|
|
|
|
|
default:
|
2023-05-06 17:03:22 +02:00
|
|
|
/* Assume that no other instructions can write m0. */
|
2023-04-27 16:22:52 +01:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-18 18:56:59 +02:00
|
|
|
bool
|
2022-05-12 02:50:17 -04:00
|
|
|
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
|
2021-08-18 18:56:59 +02:00
|
|
|
{
|
|
|
|
|
/* partial register writes are GFX9+, only */
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9)
|
2021-08-18 18:56:59 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
/* VOP3 */
|
2024-04-03 09:36:04 +02:00
|
|
|
case aco_opcode::v_mad_legacy_f16:
|
|
|
|
|
case aco_opcode::v_mad_legacy_u16:
|
|
|
|
|
case aco_opcode::v_mad_legacy_i16:
|
|
|
|
|
case aco_opcode::v_fma_legacy_f16:
|
|
|
|
|
case aco_opcode::v_div_fixup_legacy_f16: return false;
|
2021-08-18 18:56:59 +02:00
|
|
|
case aco_opcode::v_interp_p2_f16:
|
2024-04-06 11:07:37 +02:00
|
|
|
case aco_opcode::v_interp_p2_hi_f16:
|
2021-08-18 18:56:59 +02:00
|
|
|
case aco_opcode::v_fma_mixlo_f16:
|
2022-06-01 17:16:55 +02:00
|
|
|
case aco_opcode::v_fma_mixhi_f16:
|
2025-10-19 16:40:56 +02:00
|
|
|
case aco_opcode::p_v_fma_mixlo_f16_rtz:
|
|
|
|
|
case aco_opcode::p_v_fma_mixhi_f16_rtz:
|
2021-08-18 18:56:59 +02:00
|
|
|
/* VOP2 */
|
|
|
|
|
case aco_opcode::v_mac_f16:
|
|
|
|
|
case aco_opcode::v_madak_f16:
|
2022-05-12 02:50:17 -04:00
|
|
|
case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
|
2021-08-18 18:56:59 +02:00
|
|
|
case aco_opcode::v_add_f16:
|
|
|
|
|
case aco_opcode::v_sub_f16:
|
|
|
|
|
case aco_opcode::v_subrev_f16:
|
|
|
|
|
case aco_opcode::v_mul_f16:
|
|
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_ldexp_f16:
|
|
|
|
|
case aco_opcode::v_fmac_f16:
|
|
|
|
|
case aco_opcode::v_fmamk_f16:
|
|
|
|
|
case aco_opcode::v_fmaak_f16:
|
|
|
|
|
/* VOP1 */
|
|
|
|
|
case aco_opcode::v_cvt_f16_f32:
|
2023-09-21 20:04:31 +02:00
|
|
|
case aco_opcode::p_v_cvt_f16_f32_rtne:
|
2025-10-14 20:48:03 +02:00
|
|
|
case aco_opcode::p_v_cvt_f16_f32_rtpi:
|
|
|
|
|
case aco_opcode::p_v_cvt_f16_f32_rtni:
|
2021-08-18 18:56:59 +02:00
|
|
|
case aco_opcode::v_cvt_f16_u16:
|
|
|
|
|
case aco_opcode::v_cvt_f16_i16:
|
|
|
|
|
case aco_opcode::v_rcp_f16:
|
|
|
|
|
case aco_opcode::v_sqrt_f16:
|
|
|
|
|
case aco_opcode::v_rsq_f16:
|
|
|
|
|
case aco_opcode::v_log_f16:
|
|
|
|
|
case aco_opcode::v_exp_f16:
|
|
|
|
|
case aco_opcode::v_frexp_mant_f16:
|
|
|
|
|
case aco_opcode::v_frexp_exp_i16_f16:
|
|
|
|
|
case aco_opcode::v_floor_f16:
|
|
|
|
|
case aco_opcode::v_ceil_f16:
|
|
|
|
|
case aco_opcode::v_trunc_f16:
|
|
|
|
|
case aco_opcode::v_rndne_f16:
|
|
|
|
|
case aco_opcode::v_fract_f16:
|
|
|
|
|
case aco_opcode::v_sin_f16:
|
2023-01-08 16:54:56 +01:00
|
|
|
case aco_opcode::v_cos_f16:
|
|
|
|
|
case aco_opcode::v_cvt_u16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_norm_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
|
2024-04-03 09:36:04 +02:00
|
|
|
/* all non legacy opsel instructions preserve the high bits */
|
|
|
|
|
default: return can_use_opsel(gfx_level, op, -1);
|
2021-08-18 18:56:59 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-11 19:58:45 +00:00
|
|
|
/* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
|
|
|
|
|
* only supports v0-v127.
|
2023-07-24 13:57:16 +02:00
|
|
|
* The first three bits are used for operands 0-2, and the 4th bit is used for the destination.
|
2022-11-11 19:58:45 +00:00
|
|
|
*/
|
|
|
|
|
uint8_t
|
|
|
|
|
get_gfx11_true16_mask(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
switch (op) {
|
|
|
|
|
case aco_opcode::v_ceil_f16:
|
|
|
|
|
case aco_opcode::v_cos_f16:
|
|
|
|
|
case aco_opcode::v_cvt_f16_i16:
|
|
|
|
|
case aco_opcode::v_cvt_f16_u16:
|
|
|
|
|
case aco_opcode::v_cvt_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_u16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_norm_i16_f16:
|
|
|
|
|
case aco_opcode::v_cvt_norm_u16_f16:
|
|
|
|
|
case aco_opcode::v_exp_f16:
|
|
|
|
|
case aco_opcode::v_floor_f16:
|
|
|
|
|
case aco_opcode::v_fract_f16:
|
|
|
|
|
case aco_opcode::v_frexp_exp_i16_f16:
|
|
|
|
|
case aco_opcode::v_frexp_mant_f16:
|
|
|
|
|
case aco_opcode::v_log_f16:
|
|
|
|
|
case aco_opcode::v_not_b16:
|
|
|
|
|
case aco_opcode::v_rcp_f16:
|
|
|
|
|
case aco_opcode::v_rndne_f16:
|
|
|
|
|
case aco_opcode::v_rsq_f16:
|
|
|
|
|
case aco_opcode::v_sin_f16:
|
|
|
|
|
case aco_opcode::v_sqrt_f16:
|
|
|
|
|
case aco_opcode::v_trunc_f16:
|
2024-05-10 21:44:08 +02:00
|
|
|
case aco_opcode::v_swap_b16:
|
2022-11-11 19:58:45 +00:00
|
|
|
case aco_opcode::v_mov_b16: return 0x1 | 0x8;
|
|
|
|
|
case aco_opcode::v_add_f16:
|
|
|
|
|
case aco_opcode::v_fmaak_f16:
|
|
|
|
|
case aco_opcode::v_fmac_f16:
|
|
|
|
|
case aco_opcode::v_fmamk_f16:
|
|
|
|
|
case aco_opcode::v_ldexp_f16:
|
|
|
|
|
case aco_opcode::v_max_f16:
|
|
|
|
|
case aco_opcode::v_min_f16:
|
|
|
|
|
case aco_opcode::v_mul_f16:
|
|
|
|
|
case aco_opcode::v_sub_f16:
|
|
|
|
|
case aco_opcode::v_subrev_f16:
|
|
|
|
|
case aco_opcode::v_and_b16:
|
|
|
|
|
case aco_opcode::v_or_b16:
|
|
|
|
|
case aco_opcode::v_xor_b16: return 0x3 | 0x8;
|
2025-04-09 14:26:16 +02:00
|
|
|
case aco_opcode::v_cvt_pk_f32_fp8:
|
|
|
|
|
case aco_opcode::v_cvt_pk_f32_bf8:
|
2022-11-11 19:58:45 +00:00
|
|
|
case aco_opcode::v_cvt_f32_f16:
|
|
|
|
|
case aco_opcode::v_cvt_i32_i16:
|
|
|
|
|
case aco_opcode::v_cvt_u32_u16: return 0x1;
|
2023-07-22 23:17:45 +02:00
|
|
|
case aco_opcode::v_cmp_class_f16:
|
2022-11-11 19:58:45 +00:00
|
|
|
case aco_opcode::v_cmp_eq_f16:
|
|
|
|
|
case aco_opcode::v_cmp_eq_i16:
|
|
|
|
|
case aco_opcode::v_cmp_eq_u16:
|
|
|
|
|
case aco_opcode::v_cmp_ge_f16:
|
|
|
|
|
case aco_opcode::v_cmp_ge_i16:
|
|
|
|
|
case aco_opcode::v_cmp_ge_u16:
|
|
|
|
|
case aco_opcode::v_cmp_gt_f16:
|
|
|
|
|
case aco_opcode::v_cmp_gt_i16:
|
|
|
|
|
case aco_opcode::v_cmp_gt_u16:
|
|
|
|
|
case aco_opcode::v_cmp_le_f16:
|
|
|
|
|
case aco_opcode::v_cmp_le_i16:
|
|
|
|
|
case aco_opcode::v_cmp_le_u16:
|
|
|
|
|
case aco_opcode::v_cmp_lg_f16:
|
|
|
|
|
case aco_opcode::v_cmp_lg_i16:
|
|
|
|
|
case aco_opcode::v_cmp_lg_u16:
|
|
|
|
|
case aco_opcode::v_cmp_lt_f16:
|
|
|
|
|
case aco_opcode::v_cmp_lt_i16:
|
|
|
|
|
case aco_opcode::v_cmp_lt_u16:
|
|
|
|
|
case aco_opcode::v_cmp_neq_f16:
|
|
|
|
|
case aco_opcode::v_cmp_nge_f16:
|
|
|
|
|
case aco_opcode::v_cmp_ngt_f16:
|
|
|
|
|
case aco_opcode::v_cmp_nle_f16:
|
|
|
|
|
case aco_opcode::v_cmp_nlg_f16:
|
|
|
|
|
case aco_opcode::v_cmp_nlt_f16:
|
|
|
|
|
case aco_opcode::v_cmp_o_f16:
|
|
|
|
|
case aco_opcode::v_cmp_u_f16:
|
2023-07-22 23:17:45 +02:00
|
|
|
case aco_opcode::v_cmpx_class_f16:
|
2022-11-11 19:58:45 +00:00
|
|
|
case aco_opcode::v_cmpx_eq_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_eq_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_eq_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_ge_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_ge_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_ge_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_gt_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_gt_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_gt_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_le_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_le_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_le_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_lg_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_lg_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_lg_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_lt_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_lt_i16:
|
|
|
|
|
case aco_opcode::v_cmpx_lt_u16:
|
|
|
|
|
case aco_opcode::v_cmpx_neq_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_nge_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_ngt_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_nle_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_nlg_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_nlt_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_o_f16:
|
|
|
|
|
case aco_opcode::v_cmpx_u_f16: return 0x3;
|
|
|
|
|
case aco_opcode::v_cvt_f16_f32:
|
|
|
|
|
case aco_opcode::v_sat_pk_u8_i16: return 0x8;
|
|
|
|
|
default: return 0x0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-01 16:30:06 +01:00
|
|
|
uint32_t
|
|
|
|
|
get_reduction_identity(ReduceOp op, unsigned idx)
|
|
|
|
|
{
|
|
|
|
|
switch (op) {
|
|
|
|
|
case iadd8:
|
|
|
|
|
case iadd16:
|
|
|
|
|
case iadd32:
|
|
|
|
|
case iadd64:
|
|
|
|
|
case fadd16:
|
|
|
|
|
case fadd32:
|
|
|
|
|
case fadd64:
|
|
|
|
|
case ior8:
|
|
|
|
|
case ior16:
|
|
|
|
|
case ior32:
|
|
|
|
|
case ior64:
|
|
|
|
|
case ixor8:
|
|
|
|
|
case ixor16:
|
|
|
|
|
case ixor32:
|
|
|
|
|
case ixor64:
|
|
|
|
|
case umax8:
|
|
|
|
|
case umax16:
|
|
|
|
|
case umax32:
|
|
|
|
|
case umax64: return 0;
|
|
|
|
|
case imul8:
|
|
|
|
|
case imul16:
|
|
|
|
|
case imul32:
|
|
|
|
|
case imul64: return idx ? 0 : 1;
|
|
|
|
|
case fmul16: return 0x3c00u; /* 1.0 */
|
|
|
|
|
case fmul32: return 0x3f800000u; /* 1.0 */
|
|
|
|
|
case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
|
|
|
|
|
case imin8: return INT8_MAX;
|
|
|
|
|
case imin16: return INT16_MAX;
|
|
|
|
|
case imin32: return INT32_MAX;
|
|
|
|
|
case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
|
|
|
|
|
case imax8: return INT8_MIN;
|
|
|
|
|
case imax16: return INT16_MIN;
|
|
|
|
|
case imax32: return INT32_MIN;
|
|
|
|
|
case imax64: return idx ? 0x80000000u : 0;
|
|
|
|
|
case umin8:
|
|
|
|
|
case umin16:
|
|
|
|
|
case iand8:
|
|
|
|
|
case iand16: return 0xffffffffu;
|
|
|
|
|
case umin32:
|
|
|
|
|
case umin64:
|
|
|
|
|
case iand32:
|
|
|
|
|
case iand64: return 0xffffffffu;
|
|
|
|
|
case fmin16: return 0x7c00u; /* infinity */
|
|
|
|
|
case fmin32: return 0x7f800000u; /* infinity */
|
|
|
|
|
case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
|
|
|
|
|
case fmax16: return 0xfc00u; /* negative infinity */
|
|
|
|
|
case fmax32: return 0xff800000u; /* negative infinity */
|
|
|
|
|
case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
|
2025-07-23 09:17:35 +02:00
|
|
|
default: UNREACHABLE("Invalid reduction operation"); break;
|
2020-09-01 16:30:06 +01:00
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-16 12:18:18 +02:00
|
|
|
/**
 * Return the type of operand \p index of a VALU or SALU instruction.
 *
 * The type comes from the per-opcode op_types table of instr_info. For the
 * fma_mix family the bit size is overridden from the instruction's opsel_hi
 * bit for that operand: 16 when set, 32 otherwise.
 */
aco_type
get_operand_type(aco_ptr<Instruction>& alu, unsigned index)
{
   assert(alu->isVALU() || alu->isSALU());

   aco_type type = instr_info.alu_opcode_infos[(int)alu->opcode].op_types[index];

   switch (alu->opcode) {
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::p_v_fma_mixlo_f16_rtz:
   case aco_opcode::p_v_fma_mixhi_f16_rtz:
      type.bit_size = alu->valu().opsel_hi[index] ? 16 : 32;
      break;
   default:
      break;
   }

   return type;
}
|
|
|
|
|
|
2020-08-12 16:58:35 +02:00
|
|
|
bool
|
|
|
|
|
needs_exec_mask(const Instruction* instr)
|
|
|
|
|
{
|
2021-11-12 13:46:17 +00:00
|
|
|
if (instr->isVALU()) {
|
|
|
|
|
return instr->opcode != aco_opcode::v_readlane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32 &&
|
|
|
|
|
instr->opcode != aco_opcode::v_writelane_b32_e64;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->isVMEM() || instr->isFlatLike())
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
|
2024-12-18 10:57:34 +01:00
|
|
|
return instr->opcode == aco_opcode::s_cbranch_execz ||
|
2025-03-06 14:05:35 +01:00
|
|
|
instr->opcode == aco_opcode::s_cbranch_execnz ||
|
|
|
|
|
instr->opcode == aco_opcode::s_setpc_b64 || instr->reads_exec();
|
2020-08-12 16:58:35 +02:00
|
|
|
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isPseudo()) {
|
2020-08-12 16:58:35 +02:00
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::p_create_vector:
|
|
|
|
|
case aco_opcode::p_extract_vector:
|
|
|
|
|
case aco_opcode::p_split_vector:
|
2021-05-07 09:37:59 +02:00
|
|
|
case aco_opcode::p_phi:
|
|
|
|
|
case aco_opcode::p_parallelcopy:
|
2020-08-12 16:58:35 +02:00
|
|
|
for (Definition def : instr->definitions) {
|
|
|
|
|
if (def.getTemp().type() == RegType::vgpr)
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2021-11-12 13:46:17 +00:00
|
|
|
return instr->reads_exec();
|
2020-08-12 16:58:35 +02:00
|
|
|
case aco_opcode::p_spill:
|
|
|
|
|
case aco_opcode::p_reload:
|
aco/insert_exec_mask: stay in WQM while helper lanes are still needed
This patch flags all instructions WQM which don't require
Exact mode, but depend on the exec mask as long as WQM
is needed on any control flow path afterwards.
This will mostly prevent accidental copies of WQM values
within Exact mode, and also makes a lot of other workarounds
unnecessary.
Totals from 17374 (12.88% of 134913) affected shaders: (GFX10.3)
VGPRs: 526952 -> 527384 (+0.08%); split: -0.01%, +0.09%
CodeSize: 33740512 -> 33766636 (+0.08%); split: -0.06%, +0.14%
MaxWaves: 488166 -> 488108 (-0.01%); split: +0.00%, -0.02%
Instrs: 6254240 -> 6260557 (+0.10%); split: -0.08%, +0.18%
Latency: 66497580 -> 66463472 (-0.05%); split: -0.15%, +0.10%
InvThroughput: 13265741 -> 13264036 (-0.01%); split: -0.03%, +0.01%
VClause: 122962 -> 122975 (+0.01%); split: -0.01%, +0.02%
SClause: 334805 -> 334405 (-0.12%); split: -0.51%, +0.39%
Copies: 275728 -> 282341 (+2.40%); split: -0.91%, +3.31%
Branches: 92546 -> 90990 (-1.68%); split: -1.68%, +0.00%
PreSGPRs: 504119 -> 504352 (+0.05%); split: -0.00%, +0.05%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14951>
2022-01-24 19:43:49 +01:00
|
|
|
case aco_opcode::p_end_linear_vgpr:
|
2021-05-07 09:37:59 +02:00
|
|
|
case aco_opcode::p_logical_start:
|
|
|
|
|
case aco_opcode::p_logical_end:
|
2022-05-19 14:12:08 +01:00
|
|
|
case aco_opcode::p_startpgm:
|
aco: insert a single p_end_wqm after the last derivative calculation
This new instruction replaces p_wqm.
Totals from 28065 (36.65% of 76572) affected shaders: (GFX11)
MaxWaves: 823922 -> 823952 (+0.00%); split: +0.01%, -0.01%
Instrs: 22221375 -> 22180465 (-0.18%); split: -0.26%, +0.08%
CodeSize: 117310676 -> 117040684 (-0.23%); split: -0.30%, +0.07%
VGPRs: 1183476 -> 1186656 (+0.27%); split: -0.19%, +0.46%
SpillSGPRs: 2305 -> 2302 (-0.13%)
Latency: 176559310 -> 176427793 (-0.07%); split: -0.21%, +0.14%
InvThroughput: 26245204 -> 26195550 (-0.19%); split: -0.26%, +0.07%
VClause: 368058 -> 369460 (+0.38%); split: -0.21%, +0.59%
SClause: 857077 -> 842588 (-1.69%); split: -2.06%, +0.37%
Copies: 1245650 -> 1249434 (+0.30%); split: -0.33%, +0.63%
Branches: 394837 -> 396070 (+0.31%); split: -0.01%, +0.32%
PreSGPRs: 1019139 -> 1019567 (+0.04%); split: -0.02%, +0.06%
PreVGPRs: 925739 -> 931860 (+0.66%); split: -0.00%, +0.66%
Changes are due to scheduling and re-enabling cross-lane optimizations.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25038>
2023-09-02 11:14:33 +02:00
|
|
|
case aco_opcode::p_end_wqm:
|
2022-05-19 14:12:08 +01:00
|
|
|
case aco_opcode::p_init_scratch: return instr->reads_exec();
|
2023-05-29 16:39:39 +02:00
|
|
|
case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
|
2020-08-12 16:58:35 +02:00
|
|
|
default: break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-14 17:22:02 +01:00
|
|
|
struct CmpInfo {
|
2022-07-27 11:43:03 +02:00
|
|
|
aco_opcode swapped;
|
2021-07-14 17:22:02 +01:00
|
|
|
aco_opcode inverse;
|
2022-07-27 12:27:07 +02:00
|
|
|
aco_opcode vcmpx;
|
2021-07-14 17:22:02 +01:00
|
|
|
};
|
|
|
|
|
|
2024-06-15 16:18:54 +02:00
|
|
|
static ALWAYS_INLINE bool
|
2021-07-14 17:22:02 +01:00
|
|
|
get_cmp_info(aco_opcode op, CmpInfo* info)
|
|
|
|
|
{
|
2022-07-27 11:43:03 +02:00
|
|
|
info->swapped = aco_opcode::num_opcodes;
|
2022-07-27 12:34:27 +02:00
|
|
|
info->inverse = aco_opcode::num_opcodes;
|
2023-05-06 18:00:00 +02:00
|
|
|
info->vcmpx = aco_opcode::num_opcodes;
|
2021-07-14 17:22:02 +01:00
|
|
|
switch (op) {
|
|
|
|
|
// clang-format off
|
|
|
|
|
#define CMP2(ord, unord, ord_swap, unord_swap, sz) \
|
|
|
|
|
case aco_opcode::v_cmp_##ord##_f##sz: \
|
|
|
|
|
case aco_opcode::v_cmp_n##unord##_f##sz: \
|
2022-07-27 11:43:03 +02:00
|
|
|
info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
|
|
|
|
|
: aco_opcode::v_cmp_n##unord_swap##_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
|
|
|
|
|
: aco_opcode::v_cmp_n##ord##_f##sz; \
|
2022-07-27 12:27:07 +02:00
|
|
|
info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz \
|
|
|
|
|
: aco_opcode::v_cmpx_n##unord##_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
return true;
|
|
|
|
|
#define CMP(ord, unord, ord_swap, unord_swap) \
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 16) \
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 32) \
|
|
|
|
|
CMP2(ord, unord, ord_swap, unord_swap, 64)
|
|
|
|
|
CMP(lt, /*n*/ge, gt, /*n*/le)
|
|
|
|
|
CMP(eq, /*n*/lg, eq, /*n*/lg)
|
|
|
|
|
CMP(le, /*n*/gt, ge, /*n*/lt)
|
2022-07-27 12:04:58 +02:00
|
|
|
CMP(gt, /*n*/le, lt, /*n*/ge)
|
2021-07-14 17:22:02 +01:00
|
|
|
CMP(lg, /*n*/eq, lg, /*n*/eq)
|
|
|
|
|
CMP(ge, /*n*/lt, le, /*n*/gt)
|
|
|
|
|
#undef CMP
|
|
|
|
|
#undef CMP2
|
|
|
|
|
#define ORD_TEST(sz) \
|
|
|
|
|
case aco_opcode::v_cmp_u_f##sz: \
|
2022-07-27 12:43:31 +02:00
|
|
|
info->swapped = aco_opcode::v_cmp_u_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->inverse = aco_opcode::v_cmp_o_f##sz; \
|
2022-07-27 12:27:07 +02:00
|
|
|
info->vcmpx = aco_opcode::v_cmpx_u_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
return true; \
|
|
|
|
|
case aco_opcode::v_cmp_o_f##sz: \
|
2022-07-27 12:43:31 +02:00
|
|
|
info->swapped = aco_opcode::v_cmp_o_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
info->inverse = aco_opcode::v_cmp_u_f##sz; \
|
2022-07-27 12:27:07 +02:00
|
|
|
info->vcmpx = aco_opcode::v_cmpx_o_f##sz; \
|
2021-07-14 17:22:02 +01:00
|
|
|
return true;
|
|
|
|
|
ORD_TEST(16)
|
|
|
|
|
ORD_TEST(32)
|
|
|
|
|
ORD_TEST(64)
|
|
|
|
|
#undef ORD_TEST
|
2022-07-27 12:03:22 +02:00
|
|
|
#define CMPI2(op, swap, inv, type, sz) \
|
|
|
|
|
case aco_opcode::v_cmp_##op##_##type##sz: \
|
|
|
|
|
info->swapped = aco_opcode::v_cmp_##swap##_##type##sz; \
|
|
|
|
|
info->inverse = aco_opcode::v_cmp_##inv##_##type##sz; \
|
2022-07-27 12:27:07 +02:00
|
|
|
info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz; \
|
2022-07-27 12:03:22 +02:00
|
|
|
return true;
|
|
|
|
|
#define CMPI(op, swap, inv) \
|
|
|
|
|
CMPI2(op, swap, inv, i, 16) \
|
|
|
|
|
CMPI2(op, swap, inv, u, 16) \
|
|
|
|
|
CMPI2(op, swap, inv, i, 32) \
|
|
|
|
|
CMPI2(op, swap, inv, u, 32) \
|
|
|
|
|
CMPI2(op, swap, inv, i, 64) \
|
|
|
|
|
CMPI2(op, swap, inv, u, 64)
|
|
|
|
|
CMPI(lt, gt, ge)
|
|
|
|
|
CMPI(eq, eq, lg)
|
|
|
|
|
CMPI(le, ge, gt)
|
|
|
|
|
CMPI(gt, lt, le)
|
|
|
|
|
CMPI(lg, lg, eq)
|
|
|
|
|
CMPI(ge, le, lt)
|
|
|
|
|
#undef CMPI
|
|
|
|
|
#undef CMPI2
|
2022-07-27 12:34:27 +02:00
|
|
|
#define CMPCLASS(sz) \
|
|
|
|
|
case aco_opcode::v_cmp_class_f##sz: \
|
|
|
|
|
info->vcmpx = aco_opcode::v_cmpx_class_f##sz; \
|
|
|
|
|
return true;
|
|
|
|
|
CMPCLASS(16)
|
|
|
|
|
CMPCLASS(32)
|
|
|
|
|
CMPCLASS(64)
|
|
|
|
|
#undef CMPCLASS
|
2021-07-14 17:22:02 +01:00
|
|
|
// clang-format on
|
|
|
|
|
default: return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Looks up the "inverse" opcode that get_cmp_info() records for op.
 * Returns aco_opcode::num_opcodes when get_cmp_info() rejects op.
 */
aco_opcode
get_vcmp_inverse(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.inverse;
}
|
|
|
|
|
|
2023-01-22 19:50:46 +01:00
|
|
|
/* Looks up the "swapped" opcode that get_cmp_info() records for op.
 * Returns aco_opcode::num_opcodes when get_cmp_info() rejects op.
 */
aco_opcode
get_vcmp_swapped(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.swapped;
}
|
|
|
|
|
|
2022-07-27 12:27:07 +02:00
|
|
|
/* Looks up the v_cmpx_* opcode that get_cmp_info() records for op.
 * Returns aco_opcode::num_opcodes when get_cmp_info() rejects op.
 */
aco_opcode
get_vcmpx(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.vcmpx;
}
|
|
|
|
|
|
2022-08-15 17:01:52 +01:00
|
|
|
bool
|
|
|
|
|
is_cmpx(aco_opcode op)
|
|
|
|
|
{
|
|
|
|
|
CmpInfo info;
|
|
|
|
|
return !get_cmp_info(op, &info);
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-13 18:37:37 +02:00
|
|
|
/* Returns the opcode to use once operands idx0 and idx1 of an instruction
 * with the given opcode have been exchanged, or aco_opcode::num_opcodes when
 * this table does not permit the exchange.
 */
aco_opcode
get_swapped_opcode(aco_opcode opcode, unsigned idx0, unsigned idx1)
{
   /* Exchanging an operand with itself changes nothing. */
   if (idx0 == idx1)
      return opcode;

   /* Only the larger of the two indices is consulted below. */
   const unsigned upper_idx = idx0 > idx1 ? idx0 : idx1;

   /* Comparisons: use the swapped form recorded by get_cmp_info(), if any. */
   CmpInfo cmp;
   const bool is_cmp = get_cmp_info(opcode, &cmp);
   if (is_cmp && cmp.swapped != aco_opcode::num_opcodes)
      return cmp.swapped;

   /* opcodes not relevant for DPP or SGPRs optimizations are not included. */
   switch (opcode) {
   /* Any pair of operands may be exchanged without changing the opcode. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_add3_u32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_i32_i24:
   case aco_opcode::v_mul_hi_i32_i24:
   case aco_opcode::v_mul_u32_u24:
   case aco_opcode::v_mul_hi_u32_u24:
   case aco_opcode::v_mul_lo_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_xnor_b32:
   case aco_opcode::v_xor3_b32:
   case aco_opcode::v_or3_b32:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_max3_f32:
   case aco_opcode::v_min3_f32:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_max3_u32:
   case aco_opcode::v_min3_u32:
   case aco_opcode::v_med3_u32:
   case aco_opcode::v_max3_i32:
   case aco_opcode::v_min3_i32:
   case aco_opcode::v_med3_i32:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64:
      return opcode;

   /* Subtractions map to their "rev" form ... */
   case aco_opcode::v_sub_f16:
      return aco_opcode::v_subrev_f16;
   case aco_opcode::v_sub_f32:
      return aco_opcode::v_subrev_f32;
   case aco_opcode::v_sub_co_u32:
      return aco_opcode::v_subrev_co_u32;
   case aco_opcode::v_sub_u16:
      return aco_opcode::v_subrev_u16;
   case aco_opcode::v_sub_u32:
      return aco_opcode::v_subrev_u32;
   case aco_opcode::v_sub_co_u32_e64:
      return aco_opcode::v_subrev_co_u32_e64;

   /* ... and the "rev" forms map back to the plain subtraction. */
   case aco_opcode::v_subrev_f16:
      return aco_opcode::v_sub_f16;
   case aco_opcode::v_subrev_f32:
      return aco_opcode::v_sub_f32;
   case aco_opcode::v_subrev_co_u32:
      return aco_opcode::v_sub_co_u32;
   case aco_opcode::v_subrev_u16:
      return aco_opcode::v_sub_u16;
   case aco_opcode::v_subrev_u32:
      return aco_opcode::v_sub_u32;
   case aco_opcode::v_subrev_co_u32_e64:
      return aco_opcode::v_sub_co_u32_e64;

   /* Opcode is unchanged, but operand 2 must not take part in the exchange. */
   case aco_opcode::v_addc_co_u32:
   case aco_opcode::v_mad_i32_i24:
   case aco_opcode::v_mad_u32_u24:
   case aco_opcode::v_lerp_u8:
   case aco_opcode::v_sad_u8:
   case aco_opcode::v_sad_hi_u8:
   case aco_opcode::v_sad_u16:
   case aco_opcode::v_sad_u32:
   case aco_opcode::v_xad_u32:
   case aco_opcode::v_add_lshl_u32:
   case aco_opcode::v_and_or_b32:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16:
   case aco_opcode::v_maxmin_f32:
   case aco_opcode::v_minmax_f32:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_u32:
   case aco_opcode::v_minmax_u32:
   case aco_opcode::v_maxmin_i32:
   case aco_opcode::v_minmax_i32:
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::v_dot2c_f32_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::p_v_fma_mixlo_f16_rtz:
   case aco_opcode::p_v_fma_mixhi_f16_rtz:
   case aco_opcode::v_pk_fmac_f16:
      return upper_idx == 2 ? aco_opcode::num_opcodes : opcode;

   /* Subtract-with-borrow: "rev" form, again only when operand 2 stays put. */
   case aco_opcode::v_subb_co_u32:
      return upper_idx == 2 ? aco_opcode::num_opcodes : aco_opcode::v_subbrev_co_u32;
   case aco_opcode::v_subbrev_co_u32:
      return upper_idx == 2 ? aco_opcode::num_opcodes : aco_opcode::v_subb_co_u32;

   case aco_opcode::v_med3_f32: /* order matters for clamp+GFX8+denorm ftz. */
   default:
      return aco_opcode::num_opcodes;
   }
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
|
|
|
|
|
{
|
|
|
|
|
if (idx0 == idx1) {
|
|
|
|
|
*new_op = instr->opcode;
|
|
|
|
|
return true;
|
2021-07-14 17:22:02 +01:00
|
|
|
}
|
2024-08-13 18:37:37 +02:00
|
|
|
|
|
|
|
|
if (instr->isDPP())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
aco_opcode candidate = get_swapped_opcode(instr->opcode, idx0, idx1);
|
|
|
|
|
if (candidate == aco_opcode::num_opcodes)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
*new_op = candidate;
|
|
|
|
|
return true;
|
2021-07-14 17:22:02 +01:00
|
|
|
}
|
|
|
|
|
|
2024-05-03 12:02:21 +01:00
|
|
|
/* Default construction: every counter starts out as unset_counter. */
wait_imm::wait_imm()
    : exp(unset_counter),
      lgkm(unset_counter),
      vm(unset_counter),
      vs(unset_counter),
      sample(unset_counter),
      bvh(unset_counter),
      km(unset_counter)
{
}
|
|
|
|
|
/* Construction from explicit vm/exp/lgkm/vs values; the remaining counters
 * (sample, bvh, km) start out as unset_counter.
 */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : exp(exp_),
      lgkm(lgkm_),
      vm(vm_),
      vs(vs_),
      sample(unset_counter),
      bvh(unset_counter),
      km(unset_counter)
{
}
|
|
|
|
|
|
|
|
|
|
uint16_t
|
2022-05-12 02:50:17 -04:00
|
|
|
wait_imm::pack(enum amd_gfx_level gfx_level) const
|
2021-01-27 16:27:38 +00:00
|
|
|
{
|
|
|
|
|
uint16_t imm = 0;
|
|
|
|
|
assert(exp == unset_counter || exp <= 0x7);
|
2023-10-03 11:41:44 +02:00
|
|
|
if (gfx_level >= GFX11) {
|
2022-05-06 11:38:43 +02:00
|
|
|
assert(lgkm == unset_counter || lgkm <= 0x3f);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0x3f);
|
|
|
|
|
imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
|
2023-10-03 11:41:44 +02:00
|
|
|
} else if (gfx_level >= GFX10) {
|
2021-01-27 16:27:38 +00:00
|
|
|
assert(lgkm == unset_counter || lgkm <= 0x3f);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0x3f);
|
|
|
|
|
imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
2023-10-03 11:41:44 +02:00
|
|
|
} else if (gfx_level >= GFX9) {
|
2021-01-27 16:27:38 +00:00
|
|
|
assert(lgkm == unset_counter || lgkm <= 0xf);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0x3f);
|
|
|
|
|
imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
2023-10-03 11:41:44 +02:00
|
|
|
} else {
|
2021-01-27 16:27:38 +00:00
|
|
|
assert(lgkm == unset_counter || lgkm <= 0xf);
|
|
|
|
|
assert(vm == unset_counter || vm <= 0xf);
|
|
|
|
|
imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
|
|
|
|
|
}
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
|
2021-01-27 16:27:38 +00:00
|
|
|
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
|
|
|
|
|
architecture when interpreting the immediate */
|
2022-05-12 02:50:17 -04:00
|
|
|
if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
|
2021-01-27 16:27:38 +00:00
|
|
|
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
|
|
|
|
|
architecture when interpreting the immediate */
|
|
|
|
|
return imm;
|
|
|
|
|
}
|
|
|
|
|
|
2024-05-03 11:19:57 +01:00
|
|
|
/* Builds a wait_imm whose counters hold the per-gfx_level limits below.
 * Counters below their generation threshold (vs before GFX10;
 * sample/bvh/km before GFX12) are set to 0.
 */
wait_imm
wait_imm::max(enum amd_gfx_level gfx_level)
{
   const bool gfx9_plus = gfx_level >= GFX9;
   const bool gfx10_plus = gfx_level >= GFX10;
   const bool gfx12_plus = gfx_level >= GFX12;

   wait_imm limits;
   limits.exp = 7;
   limits.vm = gfx9_plus ? 63 : 15;
   limits.lgkm = gfx10_plus ? 63 : 15;
   limits.vs = gfx10_plus ? 63 : 0;

   if (gfx12_plus) {
      limits.sample = 63;
      limits.bvh = 7;
      limits.km = 31;
   } else {
      limits.sample = 0;
      limits.bvh = 0;
      limits.km = 0;
   }

   return limits;
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr)
|
|
|
|
|
{
|
|
|
|
|
if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
aco_opcode op = instr->opcode;
|
|
|
|
|
uint16_t packed = instr->salu().imm;
|
|
|
|
|
|
2024-05-03 12:02:21 +01:00
|
|
|
if (op == aco_opcode::s_wait_loadcnt) {
|
|
|
|
|
vm = std::min<uint8_t>(vm, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_wait_storecnt) {
|
|
|
|
|
vs = std::min<uint8_t>(vs, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_wait_samplecnt) {
|
|
|
|
|
sample = std::min<uint8_t>(sample, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_wait_bvhcnt) {
|
|
|
|
|
bvh = std::min<uint8_t>(bvh, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_wait_expcnt) {
|
|
|
|
|
exp = std::min<uint8_t>(exp, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_wait_dscnt) {
|
|
|
|
|
lgkm = std::min<uint8_t>(lgkm, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_wait_kmcnt) {
|
|
|
|
|
km = std::min<uint8_t>(km, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_wait_loadcnt_dscnt) {
|
|
|
|
|
uint32_t vm2 = (packed >> 8) & 0x3f;
|
|
|
|
|
uint32_t ds = packed & 0x3f;
|
|
|
|
|
vm = std::min<uint8_t>(vm, vm2 == 0x3f ? wait_imm::unset_counter : vm2);
|
|
|
|
|
lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
|
|
|
|
|
} else if (op == aco_opcode::s_wait_storecnt_dscnt) {
|
|
|
|
|
uint32_t vs2 = (packed >> 8) & 0x3f;
|
|
|
|
|
uint32_t ds = packed & 0x3f;
|
|
|
|
|
vs = std::min<uint8_t>(vs, vs2 == 0x3f ? wait_imm::unset_counter : vs2);
|
|
|
|
|
lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
|
|
|
|
|
} else if (op == aco_opcode::s_waitcnt_expcnt) {
|
2024-05-03 11:19:57 +01:00
|
|
|
exp = std::min<uint8_t>(exp, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_waitcnt_lgkmcnt) {
|
|
|
|
|
lgkm = std::min<uint8_t>(lgkm, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_waitcnt_vmcnt) {
|
|
|
|
|
vm = std::min<uint8_t>(vm, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_waitcnt_vscnt) {
|
|
|
|
|
vs = std::min<uint8_t>(vs, packed);
|
|
|
|
|
} else if (op == aco_opcode::s_waitcnt) {
|
|
|
|
|
uint8_t vm2, lgkm2, exp2;
|
|
|
|
|
if (gfx_level >= GFX11) {
|
|
|
|
|
vm2 = (packed >> 10) & 0x3f;
|
|
|
|
|
lgkm2 = (packed >> 4) & 0x3f;
|
|
|
|
|
exp2 = packed & 0x7;
|
|
|
|
|
} else {
|
|
|
|
|
vm2 = packed & 0xf;
|
|
|
|
|
if (gfx_level >= GFX9)
|
|
|
|
|
vm2 |= (packed >> 10) & 0x30;
|
|
|
|
|
|
|
|
|
|
exp2 = (packed >> 4) & 0x7;
|
|
|
|
|
|
|
|
|
|
lgkm2 = (packed >> 8) & 0xf;
|
|
|
|
|
if (gfx_level >= GFX10)
|
|
|
|
|
lgkm2 |= (packed >> 8) & 0x30;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf))
|
|
|
|
|
vm2 = wait_imm::unset_counter;
|
|
|
|
|
if (exp2 == 0x7)
|
|
|
|
|
exp2 = wait_imm::unset_counter;
|
|
|
|
|
if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf))
|
|
|
|
|
lgkm2 = wait_imm::unset_counter;
|
|
|
|
|
|
|
|
|
|
vm = std::min(vm, vm2);
|
|
|
|
|
exp = std::min(exp, exp2);
|
|
|
|
|
lgkm = std::min(lgkm, lgkm2);
|
|
|
|
|
} else {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-27 16:27:38 +00:00
|
|
|
bool
|
|
|
|
|
wait_imm::combine(const wait_imm& other)
|
|
|
|
|
{
|
2024-04-24 11:36:13 +01:00
|
|
|
bool changed = false;
|
|
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
if (other[i] < (*this)[i])
|
|
|
|
|
changed = true;
|
|
|
|
|
(*this)[i] = std::min((*this)[i], other[i]);
|
|
|
|
|
}
|
2021-01-27 16:27:38 +00:00
|
|
|
return changed;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
wait_imm::empty() const
|
|
|
|
|
{
|
2024-04-24 11:36:13 +01:00
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
if ((*this)[i] != unset_counter)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
return true;
|
2021-01-27 16:27:38 +00:00
|
|
|
}
|
|
|
|
|
|
2023-09-25 12:29:26 +01:00
|
|
|
void
|
|
|
|
|
wait_imm::print(FILE* output) const
|
|
|
|
|
{
|
2024-04-24 11:36:13 +01:00
|
|
|
const char* names[wait_type_num];
|
|
|
|
|
names[wait_type_exp] = "exp";
|
|
|
|
|
names[wait_type_vm] = "vm";
|
|
|
|
|
names[wait_type_lgkm] = "lgkm";
|
|
|
|
|
names[wait_type_vs] = "vs";
|
2024-05-03 12:02:21 +01:00
|
|
|
names[wait_type_sample] = "sample";
|
|
|
|
|
names[wait_type_bvh] = "bvh";
|
|
|
|
|
names[wait_type_km] = "km";
|
2024-04-24 11:36:13 +01:00
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
if ((*this)[i] != unset_counter)
|
|
|
|
|
fprintf(output, "%s: %u\n", names[i], (*this)[i]);
|
|
|
|
|
}
|
2023-09-25 12:29:26 +01:00
|
|
|
}
|
|
|
|
|
|
2024-09-19 12:24:39 +01:00
|
|
|
void
|
|
|
|
|
wait_imm::build_waitcnt(Builder& bld)
|
|
|
|
|
{
|
|
|
|
|
enum amd_gfx_level gfx_level = bld.program->gfx_level;
|
|
|
|
|
|
|
|
|
|
if (gfx_level >= GFX12) {
|
|
|
|
|
if (vm != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) {
|
|
|
|
|
bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (vm << 8) | lgkm);
|
|
|
|
|
vm = wait_imm::unset_counter;
|
|
|
|
|
lgkm = wait_imm::unset_counter;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (vs != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) {
|
|
|
|
|
bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (vs << 8) | lgkm);
|
|
|
|
|
vs = wait_imm::unset_counter;
|
|
|
|
|
lgkm = wait_imm::unset_counter;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aco_opcode op[wait_type_num];
|
|
|
|
|
op[wait_type_exp] = aco_opcode::s_wait_expcnt;
|
|
|
|
|
op[wait_type_lgkm] = aco_opcode::s_wait_dscnt;
|
|
|
|
|
op[wait_type_vm] = aco_opcode::s_wait_loadcnt;
|
|
|
|
|
op[wait_type_vs] = aco_opcode::s_wait_storecnt;
|
|
|
|
|
op[wait_type_sample] = aco_opcode::s_wait_samplecnt;
|
|
|
|
|
op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt;
|
|
|
|
|
op[wait_type_km] = aco_opcode::s_wait_kmcnt;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < wait_type_num; i++) {
|
|
|
|
|
if ((*this)[i] != wait_imm::unset_counter)
|
|
|
|
|
bld.sopp(op[i], (*this)[i]);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
if (vs != wait_imm::unset_counter) {
|
|
|
|
|
assert(gfx_level >= GFX10);
|
|
|
|
|
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), vs);
|
|
|
|
|
vs = wait_imm::unset_counter;
|
|
|
|
|
}
|
|
|
|
|
if (!empty())
|
|
|
|
|
bld.sopp(aco_opcode::s_waitcnt, pack(gfx_level));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*this = wait_imm();
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-30 17:53:23 +00:00
|
|
|
bool
|
|
|
|
|
should_form_clause(const Instruction* a, const Instruction* b)
|
|
|
|
|
{
|
2023-06-07 17:00:12 +01:00
|
|
|
if (a->definitions.empty() != b->definitions.empty())
|
|
|
|
|
return false;
|
|
|
|
|
|
aco: form mixed MTBUF/MUBUF clauses
This should be one clause (all of the instructions load from the same vertex buffer)
s_clause 0x2 ; bfa10002
tbuffer_load_format_xyzw v[8:11], v5, s[4:7], 0 format:[BUF_FMT_8_8_8_8_UNORM] idxen offset:36 ; e9c32024 80010805
tbuffer_load_format_xyzw v[12:15], v5, s[4:7], 0 format:[BUF_FMT_8_8_8_8_UNORM] idxen offset:16 ; e9c32010 80010c05
tbuffer_load_format_xyzw v[16:19], v5, s[4:7], 0 format:[BUF_FMT_8_8_8_8_UNORM] idxen offset:12 ; e9c3200c 80011005
s_clause 0x2 ; bfa10002
buffer_load_dwordx3 v[20:22], v5, s[4:7], 0 idxen ; e03c2000 80011405
buffer_load_dwordx3 v[23:25], v5, s[4:7], 0 idxen offset:20 ; e03c2014 80011705
buffer_load_dwordx4 v[28:31], v5, s[4:7], 0 idxen offset:48 ; e0382030 80011c05
tbuffer_load_format_xy v[0:1], v5, s[4:7], 0 format:[BUF_FMT_8_8_UNORM] idxen offset:32 ; e8712020 80010005
Foz-DB Navi21:
Totals from 5624 (7.08% of 79395) affected shaders:
MaxWaves: 149894 -> 149898 (+0.00%)
Instrs: 3032697 -> 3034853 (+0.07%); split: -0.05%, +0.12%
CodeSize: 15907852 -> 15915752 (+0.05%); split: -0.05%, +0.10%
VGPRs: 216248 -> 216144 (-0.05%)
Latency: 10955137 -> 11008760 (+0.49%); split: -0.22%, +0.70%
InvThroughput: 2032857 -> 2033916 (+0.05%); split: -0.03%, +0.08%
VClause: 50120 -> 41778 (-16.64%); split: -16.66%, +0.02%
SClause: 62034 -> 62004 (-0.05%); split: -0.33%, +0.29%
Copies: 253836 -> 254505 (+0.26%); split: -0.17%, +0.43%
VALU: 1621606 -> 1622274 (+0.04%); split: -0.03%, +0.07%
SALU: 653251 -> 653252 (+0.00%)
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34379>
2025-01-21 01:12:58 +01:00
|
|
|
/* MUBUF and MTBUF can appear in the same clause. */
|
|
|
|
|
if ((a->isMTBUF() && b->isMUBUF()) || (a->isMUBUF() && b->isMTBUF())) {
|
|
|
|
|
} else if (a->format != b->format) {
|
2020-11-30 17:53:23 +00:00
|
|
|
return false;
|
aco: form mixed MTBUF/MUBUF clauses
This should be one clause (all of the instructions load from the same vertex buffer)
s_clause 0x2 ; bfa10002
tbuffer_load_format_xyzw v[8:11], v5, s[4:7], 0 format:[BUF_FMT_8_8_8_8_UNORM] idxen offset:36 ; e9c32024 80010805
tbuffer_load_format_xyzw v[12:15], v5, s[4:7], 0 format:[BUF_FMT_8_8_8_8_UNORM] idxen offset:16 ; e9c32010 80010c05
tbuffer_load_format_xyzw v[16:19], v5, s[4:7], 0 format:[BUF_FMT_8_8_8_8_UNORM] idxen offset:12 ; e9c3200c 80011005
s_clause 0x2 ; bfa10002
buffer_load_dwordx3 v[20:22], v5, s[4:7], 0 idxen ; e03c2000 80011405
buffer_load_dwordx3 v[23:25], v5, s[4:7], 0 idxen offset:20 ; e03c2014 80011705
buffer_load_dwordx4 v[28:31], v5, s[4:7], 0 idxen offset:48 ; e0382030 80011c05
tbuffer_load_format_xy v[0:1], v5, s[4:7], 0 format:[BUF_FMT_8_8_UNORM] idxen offset:32 ; e8712020 80010005
Foz-DB Navi21:
Totals from 5624 (7.08% of 79395) affected shaders:
MaxWaves: 149894 -> 149898 (+0.00%)
Instrs: 3032697 -> 3034853 (+0.07%); split: -0.05%, +0.12%
CodeSize: 15907852 -> 15915752 (+0.05%); split: -0.05%, +0.10%
VGPRs: 216248 -> 216144 (-0.05%)
Latency: 10955137 -> 11008760 (+0.49%); split: -0.22%, +0.70%
InvThroughput: 2032857 -> 2033916 (+0.05%); split: -0.03%, +0.08%
VClause: 50120 -> 41778 (-16.64%); split: -16.66%, +0.02%
SClause: 62034 -> 62004 (-0.05%); split: -0.33%, +0.29%
Copies: 253836 -> 254505 (+0.26%); split: -0.17%, +0.43%
VALU: 1621606 -> 1622274 (+0.04%); split: -0.03%, +0.07%
SALU: 653251 -> 653252 (+0.00%)
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34379>
2025-01-21 01:12:58 +01:00
|
|
|
}
|
2020-11-30 17:53:23 +00:00
|
|
|
|
2023-10-30 16:19:20 +01:00
|
|
|
if (a->operands.empty() || b->operands.empty())
|
|
|
|
|
return false;
|
|
|
|
|
|
2020-11-30 17:53:23 +00:00
|
|
|
/* Assume loads which don't use descriptors might load from similar addresses. */
|
2024-01-04 12:58:14 +01:00
|
|
|
if (a->isFlatLike() || a->accessesLDS())
|
2020-11-30 17:53:23 +00:00
|
|
|
return true;
|
|
|
|
|
if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* If they load from the same descriptor, assume they might load from similar
|
|
|
|
|
* addresses.
|
|
|
|
|
*/
|
|
|
|
|
if (a->isVMEM() || a->isSMEM())
|
|
|
|
|
return a->operands[0].tempId() == b->operands[0].tempId();
|
|
|
|
|
|
2024-07-17 17:22:18 +01:00
|
|
|
if (a->isEXP() && b->isEXP())
|
|
|
|
|
return true;
|
|
|
|
|
|
2020-11-30 17:53:23 +00:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-27 18:37:52 +01:00
|
|
|
/* Returns the operand indices this function reports as tied for instr.
 * The result is empty when none of the rules below matches.
 * NOTE(review): "tied" presumably means the operand must share its register
 * with a definition - confirm against callers.
 */
aco::small_vec<uint32_t, 2>
get_tied_defs(Instruction* instr)
{
   aco::small_vec<uint32_t, 2> tied;

   /* Rules that depend on the opcode alone take precedence. */
   switch (instr->opcode) {
   case aco_opcode::v_interp_p2_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_pk_fmac_f16:
   case aco_opcode::v_writelane_b32:
   case aco_opcode::v_writelane_b32_e64:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::s_fmac_f32:
   case aco_opcode::s_fmac_f16:
      tied.push_back(2);
      return tied;
   case aco_opcode::s_addk_i32:
   case aco_opcode::s_mulk_i32:
   case aco_opcode::s_cmovk_i32:
   case aco_opcode::ds_bvh_stack_push4_pop1_rtn_b32:
   case aco_opcode::ds_bvh_stack_push8_pop1_rtn_b32:
   case aco_opcode::ds_bvh_stack_push8_pop2_rtn_b64:
      tied.push_back(0);
      return tied;
   default:
      break;
   }

   const bool single_def = instr->definitions.size() == 1;

   if (instr->isMUBUF() && single_def &&
       (instr_info.is_atomic[(int)instr->opcode] || instr->mubuf().tfe)) {
      /* MUBUF atomics / TFE accesses with a result: operand 3. */
      tied.push_back(3);
   } else if (instr->isMIMG() && single_def && !instr->operands[2].isUndefined()) {
      /* MIMG with a defined operand 2. */
      tied.push_back(2);
   } else if (instr->opcode == aco_opcode::image_bvh8_intersect_ray) {
      /* VADDR starts at 3. */
      tied.push_back(3 + 4);
      tied.push_back(3 + 7);
   }

   return tied;
}
|
|
|
|
|
|
2024-05-03 12:04:59 +01:00
|
|
|
uint8_t
|
2025-11-27 13:27:00 +01:00
|
|
|
get_vmem_type(Instruction* instr, bool has_point_sample_accel)
|
2024-05-03 12:04:59 +01:00
|
|
|
{
|
2025-05-29 11:59:03 +01:00
|
|
|
if (instr->opcode == aco_opcode::image_bvh_intersect_ray ||
|
|
|
|
|
instr->opcode == aco_opcode::image_bvh64_intersect_ray ||
|
|
|
|
|
instr->opcode == aco_opcode::image_bvh_dual_intersect_ray ||
|
2025-05-08 18:17:41 +01:00
|
|
|
instr->opcode == aco_opcode::image_bvh8_intersect_ray) {
|
2024-05-03 12:04:59 +01:00
|
|
|
return vmem_bvh;
|
2025-05-29 11:44:54 +01:00
|
|
|
} else if (instr->opcode == aco_opcode::image_msaa_load) {
|
2024-05-03 12:04:59 +01:00
|
|
|
return vmem_sampler;
|
2025-05-08 18:17:41 +01:00
|
|
|
} else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
|
|
|
|
|
instr->operands[1].regClass() == s4) {
|
2025-11-27 13:27:00 +01:00
|
|
|
bool point_sample_accel =
|
|
|
|
|
has_point_sample_accel && (instr->opcode == aco_opcode::image_sample ||
|
|
|
|
|
instr->opcode == aco_opcode::image_sample_l ||
|
|
|
|
|
instr->opcode == aco_opcode::image_sample_lz);
|
2025-05-08 18:17:41 +01:00
|
|
|
return vmem_sampler | (point_sample_accel ? vmem_nosampler : 0);
|
|
|
|
|
} else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) {
|
2024-05-03 12:04:59 +01:00
|
|
|
return vmem_nosampler;
|
2025-05-08 18:17:41 +01:00
|
|
|
}
|
2024-05-03 12:04:59 +01:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-11 16:03:36 +02:00
|
|
|
/* Parse implicit data dependency resolution:
|
|
|
|
|
* Returns the value of each counter that must be reached
|
|
|
|
|
* before an instruction is issued.
|
|
|
|
|
*
|
|
|
|
|
* (Probably incomplete.)
|
|
|
|
|
*/
|
|
|
|
|
depctr_wait
|
|
|
|
|
parse_depctr_wait(const Instruction* instr)
|
2024-08-05 12:30:06 +01:00
|
|
|
{
|
2024-09-11 16:03:36 +02:00
|
|
|
depctr_wait res;
|
|
|
|
|
if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP()) {
|
|
|
|
|
res.va_vdst = 0;
|
|
|
|
|
res.va_exec = 0;
|
|
|
|
|
res.sa_exec = 0;
|
|
|
|
|
if (instr->isVMEM() || instr->isFlatLike()) {
|
|
|
|
|
res.sa_sdst = 0;
|
|
|
|
|
res.va_sdst = 0;
|
|
|
|
|
res.va_vcc = 0;
|
|
|
|
|
}
|
|
|
|
|
} else if (instr->isSMEM()) {
|
|
|
|
|
res.sa_sdst = 0;
|
|
|
|
|
res.va_sdst = 0;
|
|
|
|
|
res.va_vcc = 0;
|
|
|
|
|
} else if (instr->isLDSDIR()) {
|
|
|
|
|
res.va_vdst = instr->ldsdir().wait_vdst;
|
|
|
|
|
res.va_exec = 0;
|
|
|
|
|
res.sa_exec = 0;
|
|
|
|
|
} else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
|
|
|
|
|
unsigned imm = instr->salu().imm;
|
|
|
|
|
res.va_vdst = (imm >> 12) & 0xf;
|
|
|
|
|
res.va_sdst = (imm >> 9) & 0x7;
|
|
|
|
|
res.va_ssrc = (imm >> 8) & 0x1;
|
|
|
|
|
res.hold_cnt = (imm >> 7) & 0x1;
|
|
|
|
|
res.vm_vsrc = (imm >> 2) & 0x7;
|
|
|
|
|
res.va_vcc = (imm >> 1) & 0x1;
|
|
|
|
|
res.sa_sdst = imm & 0x1;
|
|
|
|
|
} else if (instr->isVALU()) {
|
|
|
|
|
res.sa_exec = 0;
|
|
|
|
|
for (const Definition& def : instr->definitions) {
|
|
|
|
|
if (def.regClass().type() == RegType::sgpr) {
|
|
|
|
|
res.sa_sdst = 0;
|
|
|
|
|
/* Notably, this is the only exception, even VALU that
|
|
|
|
|
* reads exec doesn't implicitly wait for va_exec.
|
|
|
|
|
*/
|
|
|
|
|
if (instr->opcode == aco_opcode::v_readfirstlane_b32)
|
|
|
|
|
res.va_exec = 0;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if (instr_info.classes[(int)instr->opcode] == instr_class::branch ||
|
|
|
|
|
instr_info.classes[(int)instr->opcode] == instr_class::sendmsg) {
|
|
|
|
|
res.sa_exec = 0;
|
|
|
|
|
res.va_exec = 0;
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
case aco_opcode::s_cbranch_vccz:
|
|
|
|
|
case aco_opcode::s_cbranch_vccnz:
|
|
|
|
|
res.va_vcc = 0;
|
|
|
|
|
res.sa_sdst = 0;
|
|
|
|
|
break;
|
|
|
|
|
case aco_opcode::s_cbranch_scc0:
|
|
|
|
|
case aco_opcode::s_cbranch_scc1:
|
|
|
|
|
res.sa_sdst = 0;
|
|
|
|
|
break;
|
|
|
|
|
default: break;
|
|
|
|
|
}
|
|
|
|
|
} else if (instr->isSALU()) {
|
|
|
|
|
for (const Definition& def : instr->definitions) {
|
|
|
|
|
if (def.physReg() < vcc) {
|
|
|
|
|
res.va_sdst = 0;
|
|
|
|
|
} else if (def.physReg() <= vcc_hi) {
|
|
|
|
|
res.va_vcc = 0;
|
|
|
|
|
} else if (def.physReg() == exec || def.physReg() == exec_hi) {
|
|
|
|
|
res.va_exec = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
for (const Operand& op : instr->operands) {
|
|
|
|
|
if (op.physReg() < vcc) {
|
|
|
|
|
res.va_sdst = 0;
|
|
|
|
|
} else if (op.physReg() <= vcc_hi) {
|
|
|
|
|
res.va_vcc = 0;
|
|
|
|
|
} else if (op.physReg() == exec || op.physReg() == exec_hi) {
|
|
|
|
|
res.va_exec = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return res;
|
2024-08-05 12:30:06 +01:00
|
|
|
}
|
|
|
|
|
|
2025-09-02 15:39:19 +01:00
|
|
|
uint16_t
|
|
|
|
|
depctr_wait::pack() const
|
|
|
|
|
{
|
|
|
|
|
uint16_t imm = 0;
|
|
|
|
|
imm |= (va_vdst & 0xf) << 12;
|
|
|
|
|
imm |= (va_sdst & 0x7) << 9;
|
|
|
|
|
imm |= (va_ssrc & 0x1) << 8;
|
|
|
|
|
imm |= (hold_cnt & 0x1) << 7;
|
|
|
|
|
imm |= 0x3 << 5; /* don't know what this is, if anything */
|
|
|
|
|
imm |= (vm_vsrc & 0x7) << 2;
|
|
|
|
|
imm |= (va_vcc & 0x1) << 1;
|
|
|
|
|
imm |= (sa_sdst & 0x1);
|
|
|
|
|
return imm;
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-14 18:57:08 +00:00
|
|
|
bool
|
|
|
|
|
Instruction::isTrans() const noexcept
|
|
|
|
|
{
|
|
|
|
|
return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
|
2024-05-16 19:12:01 +02:00
|
|
|
instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental ||
|
|
|
|
|
instr_info.classes[(int)opcode] == instr_class::valu_pseudo_scalar_trans;
|
2022-11-14 18:57:08 +00:00
|
|
|
}
|
|
|
|
|
|
2024-04-02 16:36:20 +02:00
|
|
|
size_t
|
2024-03-25 15:27:56 +01:00
|
|
|
get_instr_data_size(Format format)
|
|
|
|
|
{
|
|
|
|
|
switch (format) {
|
|
|
|
|
case Format::SOP1:
|
|
|
|
|
case Format::SOP2:
|
|
|
|
|
case Format::SOPC:
|
|
|
|
|
case Format::SOPK:
|
|
|
|
|
case Format::SOPP: return sizeof(SALU_instruction);
|
|
|
|
|
case Format::SMEM: return sizeof(SMEM_instruction);
|
|
|
|
|
case Format::PSEUDO: return sizeof(Pseudo_instruction);
|
|
|
|
|
case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction);
|
|
|
|
|
case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction);
|
|
|
|
|
case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction);
|
2025-02-17 18:42:48 +01:00
|
|
|
case Format::PSEUDO_CALL: return sizeof(Pseudo_call_instruction);
|
2024-03-25 15:27:56 +01:00
|
|
|
case Format::DS: return sizeof(DS_instruction);
|
|
|
|
|
case Format::FLAT:
|
|
|
|
|
case Format::GLOBAL:
|
|
|
|
|
case Format::SCRATCH: return sizeof(FLAT_instruction);
|
|
|
|
|
case Format::LDSDIR: return sizeof(LDSDIR_instruction);
|
|
|
|
|
case Format::MTBUF: return sizeof(MTBUF_instruction);
|
|
|
|
|
case Format::MUBUF: return sizeof(MUBUF_instruction);
|
|
|
|
|
case Format::MIMG: return sizeof(MIMG_instruction);
|
|
|
|
|
case Format::VOPD: return sizeof(VOPD_instruction);
|
|
|
|
|
case Format::VINTERP_INREG: return sizeof(VINTERP_inreg_instruction);
|
|
|
|
|
case Format::VINTRP: return sizeof(VINTRP_instruction);
|
|
|
|
|
case Format::EXP: return sizeof(Export_instruction);
|
|
|
|
|
default:
|
|
|
|
|
if ((uint16_t)format & (uint16_t)Format::DPP16)
|
|
|
|
|
return sizeof(DPP16_instruction);
|
|
|
|
|
else if ((uint16_t)format & (uint16_t)Format::DPP8)
|
|
|
|
|
return sizeof(DPP8_instruction);
|
|
|
|
|
else if ((uint16_t)format & (uint16_t)Format::SDWA)
|
|
|
|
|
return sizeof(SDWA_instruction);
|
|
|
|
|
else
|
|
|
|
|
return sizeof(VALU_instruction);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-27 16:27:21 +01:00
|
|
|
/* Allocates one zero-initialised instruction from instruction_buffer.
 *
 * A single allocation holds the format-specific struct, followed by
 * num_operands Operand slots and then num_definitions Definition slots.
 * Returns the instruction with opcode, format and both spans filled in.
 */
Instruction*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   const size_t header_size = get_instr_data_size(format);
   const size_t operands_size = num_operands * sizeof(Operand);
   const size_t definitions_size = num_definitions * sizeof(Definition);
   const size_t alloc_size = header_size + operands_size + definitions_size;

   void* storage = instruction_buffer->allocate(alloc_size, alignof(uint32_t));
   memset(storage, 0, alloc_size);

   Instruction* instr = static_cast<Instruction*>(storage);
   instr->opcode = opcode;
   instr->format = format;

   /* The span stores an offset measured from the 'operands' member itself;
    * the operands begin right after the format-specific struct.
    */
   const uint16_t operands_offset = header_size - offsetof(Instruction, operands);
   instr->operands = aco::span<Operand>(operands_offset, num_operands);

   /* Must come after the operands span is stored: the definitions begin where
    * the operands end, measured from the 'definitions' member.
    */
   const uint16_t definitions_offset =
      (char*)instr->operands.end() - (char*)&instr->definitions;
   instr->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return instr;
}
|
|
|
|
|
|
2025-02-17 18:42:48 +01:00
|
|
|
/**
 * Emit the instructions that produce the resource used for scratch accesses.
 *
 * When the program has a stack pointer, that temporary is returned as-is.
 * Otherwise the result is an s4 vector made of a 64-bit base address
 * followed by dwords 2 and 3 of a buffer descriptor built with
 * ac_build_buffer_descriptor().
 *
 * \param program               program being compiled
 * \param bld                   builder used to emit the instructions
 * \param resume_idx            index into program->private_segment_buffers
 *                              and program->scratch_offsets
 * \param apply_scratch_offset  if true and the program has scratch offsets,
 *                              add scratch_offsets[resume_idx] to the base
 *                              address
 * \return temporary holding the scratch resource
 */
Temp
load_scratch_resource(Program* program, Builder& bld, unsigned resume_idx,
                      bool apply_scratch_offset)
{
   if (program->stack_ptr != Temp()) {
      /* The stack pointer is used directly as the scratch resource, so there
       * is no place where an offset could be applied.
       */
      assert(!apply_scratch_offset || program->scratch_offsets.empty());
      return program->stack_ptr;
   }

   Temp scratch_base;
   if (!program->private_segment_buffers.empty())
      scratch_base = program->private_segment_buffers[resume_idx];

   if (scratch_base.bytes() == 0) {
      /* No private segment buffer temporary: build the 64-bit address from
       * the two scratch address symbols.
       */
      Temp sym_lo =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
      Temp sym_hi =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
      scratch_base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), sym_lo, sym_hi);
   } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
      /* For every HW stage except compute, the temporary is used as the
       * address of a two-dword load that yields the base address.
       */
      scratch_base =
         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_base, Operand::zero());
   }

   if (apply_scratch_offset && !program->scratch_offsets.empty()) {
      /* 64-bit add of the scratch offset: split the base into halves, add
       * the offset to the low half and propagate the carry through SCC into
       * the high half.
       */
      Temp base_lo = bld.tmp(s1);
      Temp base_hi = bld.tmp(s1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(base_lo), Definition(base_hi),
                 scratch_base);

      Temp carry_out = bld.tmp(s1);
      Temp offset = program->scratch_offsets[resume_idx];
      Temp sum_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry_out)),
                             base_lo, offset);
      Temp sum_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), base_hi,
                             Operand::c32(0), bld.scc(carry_out));

      scratch_base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), sum_lo, sum_hi);
   }

   /* Describe the buffer; only descriptor dwords 2 and 3 are consumed below,
    * the first two dwords of the result come from scratch_base.
    */
   struct ac_buffer_state buf_state = {0};
   buf_state.size = 0xffffffff;
   buf_state.format = PIPE_FORMAT_R32_FLOAT;
   for (unsigned chan = 0; chan < 4; ++chan)
      buf_state.swizzle[chan] = PIPE_SWIZZLE_0;
   /* Up to GFX8 the element size must be 4 bytes; the field was removed in
    * GFX9.
    */
   buf_state.element_size = program->gfx_level <= GFX8 ? 1u : 0u;
   buf_state.index_stride = program->wave_size == 64 ? 3u : 2u;
   buf_state.add_tid = true;
   buf_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;

   uint32_t desc_words[4];
   ac_build_buffer_descriptor(program->gfx_level, &buf_state, desc_words);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_base,
                     Operand::c32(desc_words[2]), Operand::c32(desc_words[3]));
}
|
|
|
|
|
|
2020-06-03 11:27:55 +01:00
|
|
|
} // namespace aco
|