2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2018 Valve Corporation
|
|
|
|
|
* Copyright © 2018 Google
|
|
|
|
|
*
|
2024-04-08 09:02:30 +02:00
|
|
|
* SPDX-License-Identifier: MIT
|
2019-09-17 13:22:17 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "aco_builder.h"
|
2025-05-14 15:13:06 +02:00
|
|
|
#include "aco_instruction_selection.h"
|
2021-06-09 15:40:03 +02:00
|
|
|
#include "aco_ir.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
#include "amdgfxregs.h"
|
2025-05-14 12:02:21 +02:00
|
|
|
#include <array>
|
|
|
|
|
#include <utility>
|
|
|
|
|
#include <vector>
|
2025-04-08 13:10:19 +02:00
|
|
|
|
2025-05-14 12:02:21 +02:00
|
|
|
namespace aco {
|
|
|
|
|
namespace {
|
2023-07-15 19:49:49 +02:00
|
|
|
|
2025-05-14 12:02:21 +02:00
|
|
|
/* Forward declaration only; the definition is not part of this section of the file. */
void visit_cf_list(struct isel_context* ctx, struct exec_list* list);
|
2023-07-15 19:49:49 +02:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
void
|
2025-05-14 12:02:21 +02:00
|
|
|
visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2025-05-14 12:02:21 +02:00
|
|
|
Temp dst = get_ssa_temp(ctx, &instr->def);
|
2024-01-17 11:52:10 +01:00
|
|
|
|
2025-05-14 12:02:21 +02:00
|
|
|
// TODO: we really want to have the resulting type as this would allow for 64bit literals
|
|
|
|
|
// which get truncated the lsb if double and msb if int
|
|
|
|
|
// for now, we only use s_mov_b64 with 64bit inline constants
|
|
|
|
|
assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
|
|
|
|
|
assert(dst.type() == RegType::sgpr);
|
2024-01-17 11:52:10 +01:00
|
|
|
|
2025-05-14 12:02:21 +02:00
|
|
|
Builder bld(ctx->program, ctx->block);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2025-05-14 12:02:21 +02:00
|
|
|
if (instr->def.bit_size == 1) {
|
2019-11-27 11:04:47 +01:00
|
|
|
assert(dst.regClass() == bld.lm);
|
2025-05-14 12:02:21 +02:00
|
|
|
int val = instr->value[0].b ? -1 : 0;
|
|
|
|
|
Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
|
|
|
|
|
bld.copy(Definition(dst), op);
|
|
|
|
|
} else if (instr->def.bit_size == 8) {
|
|
|
|
|
bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
|
|
|
|
|
} else if (instr->def.bit_size == 16) {
|
|
|
|
|
/* sign-extend to use s_movk_i32 instead of a literal */
|
|
|
|
|
bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
|
|
|
|
|
} else if (dst.size() == 1) {
|
|
|
|
|
bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
|
|
|
|
|
} else {
|
|
|
|
|
assert(dst.size() != 1);
|
2024-03-25 15:55:27 +01:00
|
|
|
aco_ptr<Instruction> vec{
|
2025-05-14 12:02:21 +02:00
|
|
|
create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
|
|
|
|
|
if (instr->def.bit_size == 64)
|
|
|
|
|
for (unsigned i = 0; i < dst.size(); i++)
|
|
|
|
|
vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
|
|
|
|
|
else {
|
|
|
|
|
for (unsigned i = 0; i < dst.size(); i++)
|
|
|
|
|
vec->operands[i] = Operand::c32(instr->value[i].u32);
|
2023-04-14 15:44:43 +01:00
|
|
|
}
|
2024-02-19 17:00:19 +00:00
|
|
|
vec->definitions[0] = Definition(dst);
|
2023-04-14 15:44:43 +01:00
|
|
|
ctx->block->instructions.emplace_back(std::move(vec));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-14 12:02:21 +02:00
|
|
|
/* Forward declaration only; the definition is not part of this section of the file. */
Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
void
|
2023-08-12 16:17:15 -04:00
|
|
|
get_const_vec(nir_def* vec, nir_const_value* cv[4])
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (vec->parent_instr->type != nir_instr_type_alu)
|
|
|
|
|
return;
|
2025-07-31 09:49:36 -04:00
|
|
|
nir_alu_instr* vec_instr = nir_def_as_alu(vec);
|
2019-09-17 13:22:17 +02:00
|
|
|
if (vec_instr->op != nir_op_vec(vec->num_components))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < vec->num_components; i++) {
|
|
|
|
|
cv[i] =
|
|
|
|
|
vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
visit_tex(isel_context* ctx, nir_tex_instr* instr)
|
|
|
|
|
{
|
2022-11-09 10:46:30 +01:00
|
|
|
assert(instr->op != nir_texop_samples_identical);
|
radv,aco: use lower_to_fragment_fetch
This simplifies ACO and will let us optimize the FMASK fetch (for example,
move it out of loops).
fossil-db (Sienna Cichlid):
Totals from 955 (0.64% of 150170) affected shaders:
CodeSize: 4722016 -> 4722952 (+0.02%); split: -0.02%, +0.04%
Instrs: 875619 -> 875760 (+0.02%); split: -0.02%, +0.04%
Latency: 14069089 -> 14071699 (+0.02%); split: -0.02%, +0.04%
InvThroughput: 2321419 -> 2321218 (-0.01%); split: -0.02%, +0.01%
VClause: 23080 -> 23081 (+0.00%)
SClause: 32426 -> 32019 (-1.26%); split: -1.88%, +0.62%
Copies: 42787 -> 42777 (-0.02%); split: -0.19%, +0.16%
Branches: 17900 -> 17902 (+0.01%); split: -0.04%, +0.06%
PreSGPRs: 43229 -> 41002 (-5.15%); split: -5.16%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12214>
2021-08-04 16:17:56 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
Builder bld(ctx->program, ctx->block);
|
|
|
|
|
bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
|
2020-05-11 16:33:14 +02:00
|
|
|
has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
|
2023-04-14 17:49:46 +01:00
|
|
|
has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
|
radv,aco: use lower_to_fragment_fetch
This simplifies ACO and will let us optimize the FMASK fetch (for example,
move it out of loops).
fossil-db (Sienna Cichlid):
Totals from 955 (0.64% of 150170) affected shaders:
CodeSize: 4722016 -> 4722952 (+0.02%); split: -0.02%, +0.04%
Instrs: 875619 -> 875760 (+0.02%); split: -0.02%, +0.04%
Latency: 14069089 -> 14071699 (+0.02%); split: -0.02%, +0.04%
InvThroughput: 2321419 -> 2321218 (-0.01%); split: -0.02%, +0.01%
VClause: 23080 -> 23081 (+0.00%)
SClause: 32426 -> 32019 (-1.26%); split: -1.88%, +0.62%
Copies: 42787 -> 42777 (-0.02%); split: -0.19%, +0.16%
Branches: 17900 -> 17902 (+0.01%); split: -0.04%, +0.06%
PreSGPRs: 43229 -> 41002 (-5.15%); split: -5.16%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12214>
2021-08-04 16:17:56 +01:00
|
|
|
Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
|
2022-06-06 17:04:14 +02:00
|
|
|
offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
|
2023-04-14 17:49:46 +01:00
|
|
|
coord = Temp(), wqm_coord = Temp();
|
2020-01-23 19:12:55 +01:00
|
|
|
std::vector<Temp> coords;
|
|
|
|
|
std::vector<Temp> derivs;
|
2019-09-17 13:22:17 +02:00
|
|
|
nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
|
radv,aco: lower texture descriptor loads in NIR
fossil-db (Sienna Cichlid):
Totals from 39445 (24.30% of 162293) affected shaders:
MaxWaves: 875988 -> 875972 (-0.00%)
Instrs: 35372561 -> 35234909 (-0.39%); split: -0.41%, +0.03%
CodeSize: 190237480 -> 189379240 (-0.45%); split: -0.47%, +0.02%
VGPRs: 1889856 -> 1889928 (+0.00%); split: -0.00%, +0.01%
SpillSGPRs: 10764 -> 10857 (+0.86%); split: -2.04%, +2.91%
SpillVGPRs: 1891 -> 1907 (+0.85%); split: -0.32%, +1.16%
Scratch: 260096 -> 261120 (+0.39%)
Latency: 477701150 -> 477578466 (-0.03%); split: -0.06%, +0.03%
InvThroughput: 87819847 -> 87830346 (+0.01%); split: -0.03%, +0.04%
VClause: 673353 -> 673829 (+0.07%); split: -0.04%, +0.11%
SClause: 1385396 -> 1366478 (-1.37%); split: -1.65%, +0.29%
Copies: 2327965 -> 2229134 (-4.25%); split: -4.58%, +0.34%
Branches: 906707 -> 906434 (-0.03%); split: -0.13%, +0.10%
PreSGPRs: 1874153 -> 1862698 (-0.61%); split: -1.34%, +0.73%
PreVGPRs: 1691382 -> 1691383 (+0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12773>
2021-08-12 15:36:56 +01:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < instr->num_srcs; i++) {
|
|
|
|
|
switch (instr->src[i].src_type) {
|
|
|
|
|
case nir_tex_src_texture_handle:
|
|
|
|
|
resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
|
|
|
|
|
break;
|
|
|
|
|
case nir_tex_src_sampler_handle:
|
|
|
|
|
sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
|
|
|
|
|
break;
|
|
|
|
|
default: break;
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2022-05-12 02:50:17 -04:00
|
|
|
bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
|
radv,aco: lower texture descriptor loads in NIR
fossil-db (Sienna Cichlid):
Totals from 39445 (24.30% of 162293) affected shaders:
MaxWaves: 875988 -> 875972 (-0.00%)
Instrs: 35372561 -> 35234909 (-0.39%); split: -0.41%, +0.03%
CodeSize: 190237480 -> 189379240 (-0.45%); split: -0.47%, +0.02%
VGPRs: 1889856 -> 1889928 (+0.00%); split: -0.00%, +0.01%
SpillSGPRs: 10764 -> 10857 (+0.86%); split: -2.04%, +2.91%
SpillVGPRs: 1891 -> 1907 (+0.85%); split: -0.32%, +1.16%
Scratch: 260096 -> 261120 (+0.39%)
Latency: 477701150 -> 477578466 (-0.03%); split: -0.06%, +0.03%
InvThroughput: 87819847 -> 87830346 (+0.01%); split: -0.03%, +0.04%
VClause: 673353 -> 673829 (+0.07%); split: -0.04%, +0.11%
SClause: 1385396 -> 1366478 (-1.37%); split: -1.65%, +0.29%
Copies: 2327965 -> 2229134 (-4.25%); split: -4.58%, +0.34%
Branches: 906707 -> 906434 (-0.03%); split: -0.13%, +0.10%
PreSGPRs: 1874153 -> 1862698 (-0.61%); split: -1.34%, +0.73%
PreVGPRs: 1691382 -> 1691383 (+0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12773>
2021-08-12 15:36:56 +01:00
|
|
|
(instr->dest_type & (nir_type_int | nir_type_uint));
|
2019-09-17 13:22:17 +02:00
|
|
|
bool tg4_integer_cube_workaround =
|
|
|
|
|
tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
|
|
|
|
|
|
2022-06-06 17:04:14 +02:00
|
|
|
bool a16 = false, g16 = false;
|
|
|
|
|
|
|
|
|
|
int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
|
2025-06-25 13:09:20 +02:00
|
|
|
if (coord_idx >= 0)
|
2022-06-06 17:04:14 +02:00
|
|
|
a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
|
|
|
|
|
|
|
|
|
|
int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
|
2025-06-25 13:09:20 +02:00
|
|
|
if (ddx_idx >= 0)
|
2022-06-06 17:04:14 +02:00
|
|
|
g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < instr->num_srcs; i++) {
|
|
|
|
|
switch (instr->src[i].src_type) {
|
2020-01-23 19:12:55 +01:00
|
|
|
case nir_tex_src_coord: {
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
|
2022-08-11 14:43:23 +02:00
|
|
|
coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
2020-01-23 19:12:55 +01:00
|
|
|
}
|
2023-04-14 17:49:46 +01:00
|
|
|
case nir_tex_src_backend1: {
|
|
|
|
|
assert(instr->src[i].src.ssa->bit_size == 32);
|
|
|
|
|
wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
|
|
|
|
|
has_wqm_coord = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
case nir_tex_src_bias:
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
|
2022-08-11 14:43:23 +02:00
|
|
|
/* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
|
2020-05-11 15:23:52 +02:00
|
|
|
bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
|
|
|
|
|
has_bias = true;
|
2019-09-17 13:22:17 +02:00
|
|
|
break;
|
|
|
|
|
case nir_tex_src_lod: {
|
2020-10-21 18:25:56 +02:00
|
|
|
if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
|
2019-09-17 13:22:17 +02:00
|
|
|
level_zero = true;
|
|
|
|
|
} else {
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
|
2022-08-11 14:43:23 +02:00
|
|
|
lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
|
2019-09-17 13:22:17 +02:00
|
|
|
has_lod = true;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-05-11 16:33:14 +02:00
|
|
|
case nir_tex_src_min_lod:
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
|
2022-08-11 14:43:23 +02:00
|
|
|
clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
|
2020-05-11 16:33:14 +02:00
|
|
|
has_clamped_lod = true;
|
|
|
|
|
break;
|
2019-09-17 13:22:17 +02:00
|
|
|
case nir_tex_src_comparator:
|
|
|
|
|
if (instr->is_shadow) {
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == 32);
|
2019-09-17 13:22:17 +02:00
|
|
|
compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
|
|
|
|
|
has_compare = true;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case nir_tex_src_offset:
|
2023-04-14 17:49:46 +01:00
|
|
|
case nir_tex_src_backend2:
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == 32);
|
2019-09-17 13:22:17 +02:00
|
|
|
offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
|
|
|
|
|
get_const_vec(instr->src[i].src.ssa, const_offset);
|
|
|
|
|
has_offset = true;
|
|
|
|
|
break;
|
|
|
|
|
case nir_tex_src_ddx:
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
|
2022-08-11 14:43:23 +02:00
|
|
|
ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
|
2019-09-17 13:22:17 +02:00
|
|
|
has_ddx = true;
|
|
|
|
|
break;
|
|
|
|
|
case nir_tex_src_ddy:
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
|
2022-08-11 14:43:23 +02:00
|
|
|
ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
|
2019-09-17 13:22:17 +02:00
|
|
|
has_ddy = true;
|
|
|
|
|
break;
|
|
|
|
|
case nir_tex_src_ms_index:
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
|
2022-08-11 14:43:23 +02:00
|
|
|
sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
|
2019-09-17 13:22:17 +02:00
|
|
|
has_sample_index = true;
|
|
|
|
|
break;
|
|
|
|
|
case nir_tex_src_texture_offset:
|
|
|
|
|
case nir_tex_src_sampler_offset:
|
|
|
|
|
default: break;
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-23 19:12:55 +01:00
|
|
|
|
2023-04-14 17:49:46 +01:00
|
|
|
if (has_wqm_coord) {
|
|
|
|
|
assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
|
|
|
|
|
instr->op == nir_texop_lod);
|
|
|
|
|
assert(wqm_coord.regClass().is_linear_vgpr());
|
|
|
|
|
assert(!a16 && !g16);
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-31 19:43:03 +01:00
|
|
|
if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
|
|
|
|
|
level_zero = true;
|
|
|
|
|
|
2022-06-04 20:04:13 +02:00
|
|
|
if (has_offset) {
|
|
|
|
|
assert(instr->op != nir_texop_txf);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
aco_ptr<Instruction> tmp_instr;
|
|
|
|
|
Temp acc, pack = Temp();
|
|
|
|
|
|
|
|
|
|
uint32_t pack_const = 0;
|
|
|
|
|
for (unsigned i = 0; i < offset.size(); i++) {
|
|
|
|
|
if (!const_offset[i])
|
|
|
|
|
continue;
|
|
|
|
|
pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (offset.type() == RegType::sgpr) {
|
|
|
|
|
for (unsigned i = 0; i < offset.size(); i++) {
|
|
|
|
|
if (const_offset[i])
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
acc = emit_extract_vector(ctx, offset, i, s1);
|
2021-07-13 11:22:46 +02:00
|
|
|
acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
|
|
|
|
|
Operand::c32(0x3Fu));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (i) {
|
|
|
|
|
acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
|
2021-07-13 11:22:46 +02:00
|
|
|
Operand::c32(8u * i));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (pack == Temp()) {
|
|
|
|
|
pack = acc;
|
|
|
|
|
} else {
|
|
|
|
|
pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (pack_const && pack != Temp())
|
|
|
|
|
pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
|
2021-07-13 11:22:46 +02:00
|
|
|
Operand::c32(pack_const), pack);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
|
|
|
|
for (unsigned i = 0; i < offset.size(); i++) {
|
|
|
|
|
if (const_offset[i])
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
acc = emit_extract_vector(ctx, offset, i, v1);
|
2021-07-13 11:22:46 +02:00
|
|
|
acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (i) {
|
2021-07-13 11:22:46 +02:00
|
|
|
acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (pack == Temp()) {
|
|
|
|
|
pack = acc;
|
|
|
|
|
} else {
|
|
|
|
|
pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (pack_const && pack != Temp())
|
2022-04-13 14:11:18 +02:00
|
|
|
pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2023-09-29 16:35:20 +01:00
|
|
|
if (pack == Temp())
|
2021-07-13 11:22:46 +02:00
|
|
|
offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
|
2019-09-17 13:22:17 +02:00
|
|
|
else
|
|
|
|
|
offset = pack;
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-06 17:04:14 +02:00
|
|
|
std::vector<Temp> unpacked_coord;
|
radv,radeonsi: use ac_nir_lower_tex
fossil-db (navi21):
Totals from 17279 (12.74% of 135636) affected shaders:
MaxWaves: 270015 -> 269991 (-0.01%)
Instrs: 24847385 -> 24843807 (-0.01%); split: -0.02%, +0.00%
CodeSize: 133215364 -> 133198744 (-0.01%); split: -0.02%, +0.01%
VGPRs: 1217632 -> 1217872 (+0.02%); split: -0.00%, +0.02%
Latency: 405347021 -> 404971784 (-0.09%); split: -0.09%, +0.00%
InvThroughput: 75386590 -> 75350344 (-0.05%); split: -0.07%, +0.03%
VClause: 426986 -> 426821 (-0.04%); split: -0.04%, +0.01%
SClause: 966751 -> 966971 (+0.02%); split: -0.01%, +0.03%
Copies: 1738510 -> 1737970 (-0.03%); split: -0.08%, +0.05%
PreSGPRs: 1169070 -> 1169120 (+0.00%); split: -0.00%, +0.00%
PreVGPRs: 1136102 -> 1136183 (+0.01%); split: -0.00%, +0.01%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22636>
2023-04-06 11:43:29 +01:00
|
|
|
if (coord != Temp())
|
2022-06-06 17:04:14 +02:00
|
|
|
unpacked_coord.push_back(coord);
|
|
|
|
|
if (has_sample_index)
|
|
|
|
|
unpacked_coord.push_back(sample_index);
|
|
|
|
|
if (has_lod)
|
|
|
|
|
unpacked_coord.push_back(lod);
|
|
|
|
|
if (has_clamped_lod)
|
|
|
|
|
unpacked_coord.push_back(clamped_lod);
|
|
|
|
|
|
|
|
|
|
coords = emit_pack_v1(ctx, unpacked_coord);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
/* pack derivatives */
|
|
|
|
|
if (has_ddx || has_ddy) {
|
2022-06-06 17:04:14 +02:00
|
|
|
assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
|
|
|
|
|
std::array<Temp, 2> ddxddy = {ddx, ddy};
|
|
|
|
|
for (Temp tmp : ddxddy) {
|
|
|
|
|
if (tmp == Temp())
|
|
|
|
|
continue;
|
|
|
|
|
std::vector<Temp> unpacked = {tmp};
|
|
|
|
|
for (Temp derv : emit_pack_v1(ctx, unpacked))
|
|
|
|
|
derivs.push_back(derv);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
has_derivs = true;
|
|
|
|
|
}
|
|
|
|
|
|
2023-05-30 19:22:48 +08:00
|
|
|
unsigned dim = 0;
|
|
|
|
|
bool da = false;
|
|
|
|
|
if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
|
|
|
|
|
dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
|
|
|
|
|
da = should_declare_array((ac_image_dim)dim);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* Build tex instruction */
|
2025-05-26 17:20:10 +02:00
|
|
|
unsigned dmask = nir_def_components_read(&instr->def);
|
|
|
|
|
/* Mask out the bit set for the sparse info. */
|
|
|
|
|
if (instr->is_sparse)
|
|
|
|
|
dmask &= ~(1u << (instr->def.num_components - 1));
|
2020-11-20 15:11:16 +00:00
|
|
|
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
|
|
|
|
|
dmask = u_bit_consecutive(0, util_last_bit(dmask));
|
2025-05-26 17:20:10 +02:00
|
|
|
/* Set the 5th bit for the sparse code. */
|
2020-11-20 15:11:16 +00:00
|
|
|
if (instr->is_sparse)
|
|
|
|
|
dmask = MAX2(dmask, 1) | 0x10;
|
2025-05-26 17:20:10 +02:00
|
|
|
|
2023-08-14 11:56:00 -05:00
|
|
|
bool d16 = instr->def.bit_size == 16;
|
|
|
|
|
Temp dst = get_ssa_temp(ctx, &instr->def);
|
2019-09-17 13:22:17 +02:00
|
|
|
Temp tmp_dst = dst;
|
|
|
|
|
|
2020-11-20 15:11:16 +00:00
|
|
|
/* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
|
2019-09-17 13:22:17 +02:00
|
|
|
if (instr->op == nir_texop_tg4) {
|
2023-08-14 11:56:00 -05:00
|
|
|
assert(instr->def.num_components == (4 + instr->is_sparse));
|
2019-09-17 13:22:17 +02:00
|
|
|
if (instr->is_shadow)
|
|
|
|
|
dmask = 1;
|
|
|
|
|
else
|
|
|
|
|
dmask = 1 << instr->component;
|
|
|
|
|
if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
|
2021-10-08 13:51:58 +02:00
|
|
|
tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
|
radv,aco: use lower_to_fragment_fetch
This simplifies ACO and will let us optimize the FMASK fetch (for example,
move it out of loops).
fossil-db (Sienna Cichlid):
Totals from 955 (0.64% of 150170) affected shaders:
CodeSize: 4722016 -> 4722952 (+0.02%); split: -0.02%, +0.04%
Instrs: 875619 -> 875760 (+0.02%); split: -0.02%, +0.04%
Latency: 14069089 -> 14071699 (+0.02%); split: -0.02%, +0.04%
InvThroughput: 2321419 -> 2321218 (-0.01%); split: -0.02%, +0.01%
VClause: 23080 -> 23081 (+0.00%)
SClause: 32426 -> 32019 (-1.26%); split: -1.88%, +0.62%
Copies: 42787 -> 42777 (-0.02%); split: -0.19%, +0.16%
Branches: 17900 -> 17902 (+0.01%); split: -0.04%, +0.06%
PreSGPRs: 43229 -> 41002 (-5.15%); split: -5.16%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12214>
2021-08-04 16:17:56 +01:00
|
|
|
} else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
|
2019-09-17 13:22:17 +02:00
|
|
|
tmp_dst = bld.tmp(v1);
|
2023-08-14 11:56:00 -05:00
|
|
|
} else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
|
|
|
|
|
unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
|
2021-10-08 13:51:58 +02:00
|
|
|
tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Temp tg4_compare_cube_wa64 = Temp();
|
|
|
|
|
|
|
|
|
|
if (tg4_integer_workarounds) {
|
|
|
|
|
Temp half_texel[2];
|
2023-11-17 10:46:33 +08:00
|
|
|
if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
|
|
|
|
|
half_texel[0] = half_texel[1] = bld.copy(bld.def(v1), Operand::c32(0xbf000000 /*-0.5*/));
|
|
|
|
|
} else {
|
|
|
|
|
Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
|
|
|
|
|
Temp size = bld.tmp(v2);
|
2025-03-27 18:44:34 +01:00
|
|
|
MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, {size}, resource,
|
2023-11-17 10:46:33 +08:00
|
|
|
Operand(s4), std::vector<Temp>{tg4_lod});
|
|
|
|
|
tex->dim = dim;
|
|
|
|
|
tex->dmask = 0x3;
|
|
|
|
|
tex->da = da;
|
|
|
|
|
emit_split_vector(ctx, size, size.size());
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
half_texel[i] = emit_extract_vector(ctx, size, i, v1);
|
|
|
|
|
half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
|
|
|
|
|
half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
|
|
|
|
|
half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
|
|
|
|
|
Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
|
|
|
|
|
/* In vulkan, whether the sampler uses unnormalized
|
|
|
|
|
* coordinates or not is a dynamic property of the
|
|
|
|
|
* sampler. Hence, to figure out whether or not we
|
|
|
|
|
* need to divide by the texture size, we need to test
|
|
|
|
|
* the sampler at runtime. This tests the bit set by
|
|
|
|
|
* radv_init_sampler().
|
|
|
|
|
*/
|
|
|
|
|
unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
|
|
|
|
|
Temp dword0 = emit_extract_vector(ctx, sampler, 0, s1);
|
|
|
|
|
Temp not_needed =
|
|
|
|
|
bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), dword0, Operand::c32(bit_idx));
|
|
|
|
|
|
|
|
|
|
not_needed = bool_to_vector_condition(ctx, not_needed);
|
|
|
|
|
half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
|
|
|
|
|
Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
|
|
|
|
|
half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
|
|
|
|
|
Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
|
|
|
|
|
}
|
2021-03-31 17:46:52 +01:00
|
|
|
}
|
|
|
|
|
|
2020-01-23 19:12:55 +01:00
|
|
|
Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
|
|
|
|
|
bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (tg4_integer_cube_workaround) {
|
|
|
|
|
/* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
|
2020-08-10 21:00:51 -07:00
|
|
|
Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
|
2024-03-25 15:55:27 +01:00
|
|
|
aco_ptr<Instruction> split{
|
|
|
|
|
create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
|
2019-09-17 13:22:17 +02:00
|
|
|
split->operands[0] = Operand(resource);
|
|
|
|
|
for (unsigned i = 0; i < resource.size(); i++) {
|
|
|
|
|
desc[i] = bld.tmp(s1);
|
|
|
|
|
split->definitions[i] = Definition(desc[i]);
|
|
|
|
|
}
|
|
|
|
|
ctx->block->instructions.emplace_back(std::move(split));
|
|
|
|
|
|
|
|
|
|
Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
|
2021-07-13 11:22:46 +02:00
|
|
|
Operand::c32(20u | (6u << 16)));
|
2019-09-17 13:22:17 +02:00
|
|
|
Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
|
2021-07-13 11:22:46 +02:00
|
|
|
Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
Temp nfmt;
|
radv,aco: lower texture descriptor loads in NIR
fossil-db (Sienna Cichlid):
Totals from 39445 (24.30% of 162293) affected shaders:
MaxWaves: 875988 -> 875972 (-0.00%)
Instrs: 35372561 -> 35234909 (-0.39%); split: -0.41%, +0.03%
CodeSize: 190237480 -> 189379240 (-0.45%); split: -0.47%, +0.02%
VGPRs: 1889856 -> 1889928 (+0.00%); split: -0.00%, +0.01%
SpillSGPRs: 10764 -> 10857 (+0.86%); split: -2.04%, +2.91%
SpillVGPRs: 1891 -> 1907 (+0.85%); split: -0.32%, +1.16%
Scratch: 260096 -> 261120 (+0.39%)
Latency: 477701150 -> 477578466 (-0.03%); split: -0.06%, +0.03%
InvThroughput: 87819847 -> 87830346 (+0.01%); split: -0.03%, +0.04%
VClause: 673353 -> 673829 (+0.07%); split: -0.04%, +0.11%
SClause: 1385396 -> 1366478 (-1.37%); split: -1.65%, +0.29%
Copies: 2327965 -> 2229134 (-4.25%); split: -4.58%, +0.34%
Branches: 906707 -> 906434 (-0.03%); split: -0.13%, +0.10%
PreSGPRs: 1874153 -> 1862698 (-0.61%); split: -1.34%, +0.73%
PreVGPRs: 1691382 -> 1691383 (+0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12773>
2021-08-12 15:36:56 +01:00
|
|
|
if (instr->dest_type & nir_type_uint) {
|
2021-07-13 11:22:46 +02:00
|
|
|
nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
|
|
|
|
|
Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
|
|
|
|
|
Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
2021-07-13 11:22:46 +02:00
|
|
|
nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
|
|
|
|
|
Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
|
|
|
|
|
Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-11-27 11:04:47 +01:00
|
|
|
tg4_compare_cube_wa64 = bld.tmp(bld.lm);
|
2019-11-04 19:28:08 +01:00
|
|
|
bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
|
|
|
|
|
|
2021-07-13 11:22:46 +02:00
|
|
|
nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
|
|
|
|
|
Operand::c32(26u));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
|
2021-07-13 11:22:46 +02:00
|
|
|
Operand::c32(C_008F14_NUM_FORMAT));
|
2019-09-17 13:22:17 +02:00
|
|
|
desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
|
|
|
|
|
|
2024-03-25 15:55:27 +01:00
|
|
|
aco_ptr<Instruction> vec{
|
|
|
|
|
create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < resource.size(); i++)
|
|
|
|
|
vec->operands[i] = Operand(desc[i]);
|
|
|
|
|
resource = bld.tmp(resource.regClass());
|
|
|
|
|
vec->definitions[0] = Definition(resource);
|
|
|
|
|
ctx->block->instructions.emplace_back(std::move(vec));
|
|
|
|
|
|
|
|
|
|
new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
|
2020-01-23 19:12:55 +01:00
|
|
|
tg4_compare_cube_wa64);
|
2019-09-17 13:22:17 +02:00
|
|
|
new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
|
2020-01-23 19:12:55 +01:00
|
|
|
tg4_compare_cube_wa64);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-01-23 19:12:55 +01:00
|
|
|
coords[0] = new_coords[0];
|
|
|
|
|
coords[1] = new_coords[1];
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
|
|
|
|
|
// FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
|
|
|
|
|
// ac_build_buffer_load_format_gfx9_safe()
|
|
|
|
|
|
|
|
|
|
assert(coords.size() == 1);
|
|
|
|
|
aco_opcode op;
|
2021-10-08 13:51:58 +02:00
|
|
|
if (d16) {
|
|
|
|
|
switch (util_last_bit(dmask & 0xf)) {
|
|
|
|
|
case 1: op = aco_opcode::buffer_load_format_d16_x; break;
|
|
|
|
|
case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
|
|
|
|
|
case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
|
|
|
|
|
case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
|
2025-07-23 09:17:35 +02:00
|
|
|
default: UNREACHABLE("Tex instruction loads more than 4 components.");
|
2021-10-08 13:51:58 +02:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
switch (util_last_bit(dmask & 0xf)) {
|
|
|
|
|
case 1: op = aco_opcode::buffer_load_format_x; break;
|
|
|
|
|
case 2: op = aco_opcode::buffer_load_format_xy; break;
|
|
|
|
|
case 3: op = aco_opcode::buffer_load_format_xyz; break;
|
|
|
|
|
case 4: op = aco_opcode::buffer_load_format_xyzw; break;
|
2025-07-23 09:17:35 +02:00
|
|
|
default: UNREACHABLE("Tex instruction loads more than 4 components.");
|
2021-10-08 13:51:58 +02:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2024-03-25 15:55:27 +01:00
|
|
|
aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
|
2020-01-16 16:54:35 +01:00
|
|
|
mubuf->operands[0] = Operand(resource);
|
2020-01-23 19:12:55 +01:00
|
|
|
mubuf->operands[1] = Operand(coords[0]);
|
2021-07-13 11:22:46 +02:00
|
|
|
mubuf->operands[2] = Operand::c32(0);
|
2019-09-17 13:22:17 +02:00
|
|
|
mubuf->definitions[0] = Definition(tmp_dst);
|
2024-03-25 12:05:50 +01:00
|
|
|
mubuf->mubuf().idxen = true;
|
|
|
|
|
mubuf->mubuf().tfe = instr->is_sparse;
|
|
|
|
|
if (mubuf->mubuf().tfe)
|
2020-11-20 15:11:16 +00:00
|
|
|
mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx->block->instructions.emplace_back(std::move(mubuf));
|
|
|
|
|
|
2023-08-14 11:56:00 -05:00
|
|
|
expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2020-01-23 19:12:55 +01:00
|
|
|
/* gather MIMG address components */
|
|
|
|
|
std::vector<Temp> args;
|
2023-04-14 17:49:46 +01:00
|
|
|
if (has_wqm_coord) {
|
|
|
|
|
args.emplace_back(wqm_coord);
|
|
|
|
|
if (!(ctx->block->kind & block_kind_top_level))
|
|
|
|
|
ctx->unended_linear_vgprs.push_back(wqm_coord);
|
|
|
|
|
}
|
aco: emit_wqm on MIMG dst, not operands
Now p_wqm always kills its operand, so no movs will be created for it.
Long term we want to remove p_wqm in favor of a Definition flag,
so this is also a step in that direction.
Foz-DB Navi21:
Totals from 45351 (33.63% of 134864) affected shaders:
VGPRs: 2099552 -> 2116192 (+0.79%); split: -0.14%, +0.93%
CodeSize: 179530772 -> 179072104 (-0.26%); split: -0.29%, +0.03%
MaxWaves: 1054740 -> 1052262 (-0.23%); split: +0.10%, -0.33%
Instrs: 33238535 -> 33188347 (-0.15%); split: -0.17%, +0.02%
Latency: 451000471 -> 450869384 (-0.03%); split: -0.11%, +0.08%
InvThroughput: 86026785 -> 86286288 (+0.30%); split: -0.11%, +0.41%
VClause: 633291 -> 623920 (-1.48%); split: -1.91%, +0.43%
SClause: 1436708 -> 1431395 (-0.37%); split: -0.60%, +0.23%
Copies: 2166563 -> 2122592 (-2.03%); split: -2.29%, +0.26%
Branches: 706846 -> 706838 (-0.00%); split: -0.00%, +0.00%
PreSGPRs: 1976162 -> 1976592 (+0.02%)
PreVGPRs: 1797409 -> 1794704 (-0.15%)
MaxWaves regressions in Detroit: Become Human MaxWaves seem to be due
to the scheduler choosing to schedule more aggressively.
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22956>
2023-05-11 11:46:42 +02:00
|
|
|
if (has_offset)
|
2020-01-23 19:12:55 +01:00
|
|
|
args.emplace_back(offset);
|
|
|
|
|
if (has_bias)
|
2022-06-06 17:04:14 +02:00
|
|
|
args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
|
2020-01-23 19:12:55 +01:00
|
|
|
if (has_compare)
|
|
|
|
|
args.emplace_back(compare);
|
|
|
|
|
if (has_derivs)
|
|
|
|
|
args.insert(args.end(), derivs.begin(), derivs.end());
|
|
|
|
|
|
|
|
|
|
args.insert(args.end(), coords.begin(), coords.end());
|
2021-01-25 12:51:54 +00:00
|
|
|
|
radv,aco: use lower_to_fragment_fetch
This simplifies ACO and will let us optimize the FMASK fetch (for example,
move it out of loops).
fossil-db (Sienna Cichlid):
Totals from 955 (0.64% of 150170) affected shaders:
CodeSize: 4722016 -> 4722952 (+0.02%); split: -0.02%, +0.04%
Instrs: 875619 -> 875760 (+0.02%); split: -0.02%, +0.04%
Latency: 14069089 -> 14071699 (+0.02%); split: -0.02%, +0.04%
InvThroughput: 2321419 -> 2321218 (-0.01%); split: -0.02%, +0.01%
VClause: 23080 -> 23081 (+0.00%)
SClause: 32426 -> 32019 (-1.26%); split: -1.88%, +0.62%
Copies: 42787 -> 42777 (-0.02%); split: -0.19%, +0.16%
Branches: 17900 -> 17902 (+0.01%); split: -0.04%, +0.06%
PreSGPRs: 43229 -> 41002 (-5.15%); split: -5.16%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12214>
2021-08-04 16:17:56 +01:00
|
|
|
if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
|
2022-10-25 15:45:10 +01:00
|
|
|
instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
|
2020-01-07 15:18:58 +01:00
|
|
|
aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
|
|
|
|
|
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
|
|
|
|
|
? aco_opcode::image_load
|
|
|
|
|
: aco_opcode::image_load_mip;
|
2021-01-14 17:33:43 +00:00
|
|
|
Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
|
2025-03-27 18:44:34 +01:00
|
|
|
MIMG_instruction* tex = emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, vdata);
|
2021-10-06 18:10:25 +01:00
|
|
|
if (instr->op == nir_texop_fragment_mask_fetch_amd)
|
|
|
|
|
tex->dim = da ? ac_image_2darray : ac_image_2d;
|
|
|
|
|
else
|
|
|
|
|
tex->dim = dim;
|
2020-11-20 15:11:16 +00:00
|
|
|
tex->dmask = dmask & 0xf;
|
2019-09-17 13:22:17 +02:00
|
|
|
tex->unrm = true;
|
|
|
|
|
tex->da = da;
|
2020-11-20 15:11:16 +00:00
|
|
|
tex->tfe = instr->is_sparse;
|
2021-10-08 13:51:58 +02:00
|
|
|
tex->d16 = d16;
|
2022-06-06 17:04:14 +02:00
|
|
|
tex->a16 = a16;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
radv,aco: use lower_to_fragment_fetch
This simplifies ACO and will let us optimize the FMASK fetch (for example,
move it out of loops).
fossil-db (Sienna Cichlid):
Totals from 955 (0.64% of 150170) affected shaders:
CodeSize: 4722016 -> 4722952 (+0.02%); split: -0.02%, +0.04%
Instrs: 875619 -> 875760 (+0.02%); split: -0.02%, +0.04%
Latency: 14069089 -> 14071699 (+0.02%); split: -0.02%, +0.04%
InvThroughput: 2321419 -> 2321218 (-0.01%); split: -0.02%, +0.01%
VClause: 23080 -> 23081 (+0.00%)
SClause: 32426 -> 32019 (-1.26%); split: -1.88%, +0.62%
Copies: 42787 -> 42777 (-0.02%); split: -0.19%, +0.16%
Branches: 17900 -> 17902 (+0.01%); split: -0.04%, +0.06%
PreSGPRs: 43229 -> 41002 (-5.15%); split: -5.16%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12214>
2021-08-04 16:17:56 +01:00
|
|
|
if (instr->op == nir_texop_fragment_mask_fetch_amd) {
|
2021-08-04 16:13:47 +01:00
|
|
|
/* Use 0x76543210 if the image doesn't have FMASK. */
|
|
|
|
|
assert(dmask == 1 && dst.bytes() == 4);
|
|
|
|
|
assert(dst.id() != tmp_dst.id());
|
|
|
|
|
|
|
|
|
|
if (dst.regClass() == s1) {
|
|
|
|
|
Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
|
|
|
|
|
emit_extract_vector(ctx, resource, 1, s1));
|
|
|
|
|
bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
|
|
|
|
|
Operand::c32(0x76543210), bld.scc(is_not_null));
|
|
|
|
|
} else {
|
|
|
|
|
Temp is_not_null = bld.tmp(bld.lm);
|
|
|
|
|
bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
|
2022-03-15 14:49:32 +01:00
|
|
|
emit_extract_vector(ctx, resource, 1, s1));
|
2021-08-04 16:13:47 +01:00
|
|
|
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
|
|
|
|
|
bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
2023-08-14 11:56:00 -05:00
|
|
|
expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-06 17:04:14 +02:00
|
|
|
bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
// TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
|
|
|
|
|
aco_opcode opcode = aco_opcode::image_sample;
|
|
|
|
|
if (has_offset) { /* image_sample_*_o */
|
2020-05-11 16:33:14 +02:00
|
|
|
if (has_clamped_lod) {
|
|
|
|
|
if (has_compare) {
|
|
|
|
|
opcode = aco_opcode::image_sample_c_cl_o;
|
2022-06-06 17:04:14 +02:00
|
|
|
if (separate_g16)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_d_cl_o_g16;
|
|
|
|
|
else if (has_derivs)
|
2020-05-11 16:33:14 +02:00
|
|
|
opcode = aco_opcode::image_sample_c_d_cl_o;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_b_cl_o;
|
|
|
|
|
} else {
|
|
|
|
|
opcode = aco_opcode::image_sample_cl_o;
|
2022-06-06 17:04:14 +02:00
|
|
|
if (separate_g16)
|
|
|
|
|
opcode = aco_opcode::image_sample_d_cl_o_g16;
|
|
|
|
|
else if (has_derivs)
|
2020-05-11 16:33:14 +02:00
|
|
|
opcode = aco_opcode::image_sample_d_cl_o;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_sample_b_cl_o;
|
|
|
|
|
}
|
|
|
|
|
} else if (has_compare) {
|
2019-09-17 13:22:17 +02:00
|
|
|
opcode = aco_opcode::image_sample_c_o;
|
2022-06-06 17:04:14 +02:00
|
|
|
if (separate_g16)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_d_o_g16;
|
|
|
|
|
else if (has_derivs)
|
2019-09-17 13:22:17 +02:00
|
|
|
opcode = aco_opcode::image_sample_c_d_o;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_b_o;
|
|
|
|
|
if (level_zero)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_lz_o;
|
|
|
|
|
if (has_lod)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_l_o;
|
|
|
|
|
} else {
|
|
|
|
|
opcode = aco_opcode::image_sample_o;
|
2022-06-06 17:04:14 +02:00
|
|
|
if (separate_g16)
|
|
|
|
|
opcode = aco_opcode::image_sample_d_o_g16;
|
|
|
|
|
else if (has_derivs)
|
2019-09-17 13:22:17 +02:00
|
|
|
opcode = aco_opcode::image_sample_d_o;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_sample_b_o;
|
|
|
|
|
if (level_zero)
|
|
|
|
|
opcode = aco_opcode::image_sample_lz_o;
|
|
|
|
|
if (has_lod)
|
|
|
|
|
opcode = aco_opcode::image_sample_l_o;
|
|
|
|
|
}
|
2020-05-11 16:33:14 +02:00
|
|
|
} else if (has_clamped_lod) { /* image_sample_*_cl */
|
|
|
|
|
if (has_compare) {
|
|
|
|
|
opcode = aco_opcode::image_sample_c_cl;
|
2022-06-06 17:04:14 +02:00
|
|
|
if (separate_g16)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_d_cl_g16;
|
|
|
|
|
else if (has_derivs)
|
2020-05-11 16:33:14 +02:00
|
|
|
opcode = aco_opcode::image_sample_c_d_cl;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_b_cl;
|
|
|
|
|
} else {
|
|
|
|
|
opcode = aco_opcode::image_sample_cl;
|
2022-06-06 17:04:14 +02:00
|
|
|
if (separate_g16)
|
|
|
|
|
opcode = aco_opcode::image_sample_d_cl_g16;
|
|
|
|
|
else if (has_derivs)
|
2020-05-11 16:33:14 +02:00
|
|
|
opcode = aco_opcode::image_sample_d_cl;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_sample_b_cl;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
} else { /* no offset */
|
|
|
|
|
if (has_compare) {
|
|
|
|
|
opcode = aco_opcode::image_sample_c;
|
2022-06-06 17:04:14 +02:00
|
|
|
if (separate_g16)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_d_g16;
|
|
|
|
|
else if (has_derivs)
|
2019-09-17 13:22:17 +02:00
|
|
|
opcode = aco_opcode::image_sample_c_d;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_b;
|
|
|
|
|
if (level_zero)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_lz;
|
|
|
|
|
if (has_lod)
|
|
|
|
|
opcode = aco_opcode::image_sample_c_l;
|
|
|
|
|
} else {
|
|
|
|
|
opcode = aco_opcode::image_sample;
|
2022-06-06 17:04:14 +02:00
|
|
|
if (separate_g16)
|
|
|
|
|
opcode = aco_opcode::image_sample_d_g16;
|
|
|
|
|
else if (has_derivs)
|
2019-09-17 13:22:17 +02:00
|
|
|
opcode = aco_opcode::image_sample_d;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_sample_b;
|
|
|
|
|
if (level_zero)
|
|
|
|
|
opcode = aco_opcode::image_sample_lz;
|
|
|
|
|
if (has_lod)
|
|
|
|
|
opcode = aco_opcode::image_sample_l;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (instr->op == nir_texop_tg4) {
|
2023-03-31 19:43:03 +01:00
|
|
|
/* GFX11 supports implicit LOD, but the extension is unsupported. */
|
|
|
|
|
assert(level_zero || ctx->options->gfx_level < GFX11);
|
|
|
|
|
|
2020-05-20 16:25:28 +02:00
|
|
|
if (has_offset) { /* image_gather4_*_o */
|
|
|
|
|
if (has_compare) {
|
2023-03-31 19:43:03 +01:00
|
|
|
opcode = aco_opcode::image_gather4_c_o;
|
|
|
|
|
if (level_zero)
|
|
|
|
|
opcode = aco_opcode::image_gather4_c_lz_o;
|
2020-05-20 16:25:28 +02:00
|
|
|
if (has_lod)
|
|
|
|
|
opcode = aco_opcode::image_gather4_c_l_o;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_gather4_c_b_o;
|
|
|
|
|
} else {
|
2023-03-31 19:43:03 +01:00
|
|
|
opcode = aco_opcode::image_gather4_o;
|
|
|
|
|
if (level_zero)
|
|
|
|
|
opcode = aco_opcode::image_gather4_lz_o;
|
2020-05-20 16:25:28 +02:00
|
|
|
if (has_lod)
|
|
|
|
|
opcode = aco_opcode::image_gather4_l_o;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_gather4_b_o;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
2020-05-20 16:25:28 +02:00
|
|
|
if (has_compare) {
|
2023-03-31 19:43:03 +01:00
|
|
|
opcode = aco_opcode::image_gather4_c;
|
|
|
|
|
if (level_zero)
|
|
|
|
|
opcode = aco_opcode::image_gather4_c_lz;
|
2020-05-20 16:25:28 +02:00
|
|
|
if (has_lod)
|
|
|
|
|
opcode = aco_opcode::image_gather4_c_l;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_gather4_c_b;
|
|
|
|
|
} else {
|
2023-03-31 19:43:03 +01:00
|
|
|
opcode = aco_opcode::image_gather4;
|
|
|
|
|
if (level_zero)
|
|
|
|
|
opcode = aco_opcode::image_gather4_lz;
|
2020-05-20 16:25:28 +02:00
|
|
|
if (has_lod)
|
|
|
|
|
opcode = aco_opcode::image_gather4_l;
|
|
|
|
|
if (has_bias)
|
|
|
|
|
opcode = aco_opcode::image_gather4_b;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
} else if (instr->op == nir_texop_lod) {
|
|
|
|
|
opcode = aco_opcode::image_get_lod;
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-14 17:33:43 +00:00
|
|
|
bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
|
|
|
|
|
!level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
|
|
|
|
|
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
|
2020-01-23 19:12:55 +01:00
|
|
|
|
2021-01-14 17:33:43 +00:00
|
|
|
Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
|
2025-03-27 18:44:34 +01:00
|
|
|
MIMG_instruction* tex =
|
|
|
|
|
emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, vdata);
|
2019-10-08 14:42:52 +02:00
|
|
|
tex->dim = dim;
|
2020-11-20 15:11:16 +00:00
|
|
|
tex->dmask = dmask & 0xf;
|
2019-09-17 13:22:17 +02:00
|
|
|
tex->da = da;
|
2023-11-17 10:36:42 +08:00
|
|
|
tex->unrm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
|
2020-11-20 15:11:16 +00:00
|
|
|
tex->tfe = instr->is_sparse;
|
2021-10-08 13:51:58 +02:00
|
|
|
tex->d16 = d16;
|
2022-06-06 17:04:14 +02:00
|
|
|
tex->a16 = a16;
|
aco: insert a single p_end_wqm after the last derivative calculation
This new instruction replaces p_wqm.
Totals from 28065 (36.65% of 76572) affected shaders: (GFX11)
MaxWaves: 823922 -> 823952 (+0.00%); split: +0.01%, -0.01%
Instrs: 22221375 -> 22180465 (-0.18%); split: -0.26%, +0.08%
CodeSize: 117310676 -> 117040684 (-0.23%); split: -0.30%, +0.07%
VGPRs: 1183476 -> 1186656 (+0.27%); split: -0.19%, +0.46%
SpillSGPRs: 2305 -> 2302 (-0.13%)
Latency: 176559310 -> 176427793 (-0.07%); split: -0.21%, +0.14%
InvThroughput: 26245204 -> 26195550 (-0.19%); split: -0.26%, +0.07%
VClause: 368058 -> 369460 (+0.38%); split: -0.21%, +0.59%
SClause: 857077 -> 842588 (-1.69%); split: -2.06%, +0.37%
Copies: 1245650 -> 1249434 (+0.30%); split: -0.33%, +0.63%
Branches: 394837 -> 396070 (+0.31%); split: -0.01%, +0.32%
PreSGPRs: 1019139 -> 1019567 (+0.04%); split: -0.02%, +0.06%
PreVGPRs: 925739 -> 931860 (+0.66%); split: -0.00%, +0.66%
Changes are due to scheduling and re-enabling cross-lane optimizations.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25038>
2023-09-02 11:14:33 +02:00
|
|
|
if (implicit_derivs)
|
|
|
|
|
set_wqm(ctx, true);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (tg4_integer_cube_workaround) {
|
|
|
|
|
assert(tmp_dst.id() != dst.id());
|
2020-11-20 15:11:16 +00:00
|
|
|
assert(tmp_dst.size() == dst.size());
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
emit_split_vector(ctx, tmp_dst, tmp_dst.size());
|
|
|
|
|
Temp val[4];
|
2020-11-20 15:11:16 +00:00
|
|
|
for (unsigned i = 0; i < 4; i++) {
|
2019-09-17 13:22:17 +02:00
|
|
|
val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
|
|
|
|
|
Temp cvt_val;
|
radv,aco: lower texture descriptor loads in NIR
fossil-db (Sienna Cichlid):
Totals from 39445 (24.30% of 162293) affected shaders:
MaxWaves: 875988 -> 875972 (-0.00%)
Instrs: 35372561 -> 35234909 (-0.39%); split: -0.41%, +0.03%
CodeSize: 190237480 -> 189379240 (-0.45%); split: -0.47%, +0.02%
VGPRs: 1889856 -> 1889928 (+0.00%); split: -0.00%, +0.01%
SpillSGPRs: 10764 -> 10857 (+0.86%); split: -2.04%, +2.91%
SpillVGPRs: 1891 -> 1907 (+0.85%); split: -0.32%, +1.16%
Scratch: 260096 -> 261120 (+0.39%)
Latency: 477701150 -> 477578466 (-0.03%); split: -0.06%, +0.03%
InvThroughput: 87819847 -> 87830346 (+0.01%); split: -0.03%, +0.04%
VClause: 673353 -> 673829 (+0.07%); split: -0.04%, +0.11%
SClause: 1385396 -> 1366478 (-1.37%); split: -1.65%, +0.29%
Copies: 2327965 -> 2229134 (-4.25%); split: -4.58%, +0.34%
Branches: 906707 -> 906434 (-0.03%); split: -0.13%, +0.10%
PreSGPRs: 1874153 -> 1862698 (-0.61%); split: -1.34%, +0.73%
PreVGPRs: 1691382 -> 1691383 (+0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12773>
2021-08-12 15:36:56 +01:00
|
|
|
if (instr->dest_type & nir_type_uint)
|
2019-09-17 13:22:17 +02:00
|
|
|
cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
|
|
|
|
|
else
|
|
|
|
|
cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
|
|
|
|
|
val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
|
|
|
|
|
tg4_compare_cube_wa64);
|
|
|
|
|
}
|
2020-11-20 15:11:16 +00:00
|
|
|
|
|
|
|
|
Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
|
|
|
|
|
if (instr->is_sparse)
|
|
|
|
|
tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
|
|
|
|
|
val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
|
|
|
|
|
else
|
|
|
|
|
tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
|
|
|
|
|
val[3]);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2020-11-20 15:11:16 +00:00
|
|
|
unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
|
2025-05-26 17:20:10 +02:00
|
|
|
|
|
|
|
|
/* Move the bit for the sparse residency code from the 5th bit to the last component. */
|
|
|
|
|
if (mask & 0x10) {
|
|
|
|
|
mask &= ~0x10;
|
|
|
|
|
mask |= 1u << (instr->def.num_components - 1);
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-14 11:56:00 -05:00
|
|
|
expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-01-06 16:50:41 +00:00
|
|
|
/**
 * Returns the operand that a phi with register class `rc` should use for the
 * incoming NIR value `ssa`.
 *
 * - A source produced by an undef instruction yields an operand built from
 *   `rc` alone.
 * - A 1-bit source produced by a load_const yields an inline constant: -1 for
 *   true, 0 for false, 64-bit wide when the program's lane mask class is s2
 *   and 32-bit wide otherwise.
 * - Anything else yields the temporary assigned to `ssa`.
 */
Operand
get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc)
{
   const Temp value = get_ssa_temp(ctx, ssa);

   if (ssa->parent_instr->type == nir_instr_type_undef)
      return Operand(rc);

   const bool is_bool_const =
      ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const;
   if (is_bool_const) {
      const bool is_true = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
      const bool is_64bit_mask = ctx->program->lane_mask == s2;
      return Operand::c32_or_c64(is_true ? -1 : 0, is_64bit_mask);
   }

   return Operand(value);
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
visit_phi(isel_context* ctx, nir_phi_instr* instr)
|
|
|
|
|
{
|
2023-08-14 11:56:00 -05:00
|
|
|
Temp dst = get_ssa_temp(ctx, &instr->def);
|
|
|
|
|
assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
|
2024-04-09 18:13:53 +02:00
|
|
|
aco_opcode opcode = instr->def.bit_size == 1 ? aco_opcode::p_boolean_phi : aco_opcode::p_phi;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-11-18 17:26:38 +00:00
|
|
|
/* we want a sorted list of sources, since the predecessor list is also sorted */
|
2023-08-12 16:17:15 -04:00
|
|
|
std::map<unsigned, nir_def*> phi_src;
|
2019-11-18 17:26:38 +00:00
|
|
|
nir_foreach_phi_src (src, instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
phi_src[src->pred->index] = src->src.ssa;
|
2019-11-18 17:26:38 +00:00
|
|
|
|
2024-04-09 18:13:53 +02:00
|
|
|
Instruction* phi = create_instruction(opcode, Format::PSEUDO, phi_src.size(), 1);
|
|
|
|
|
unsigned i = 0;
|
|
|
|
|
for (std::pair<unsigned, nir_def*> src : phi_src)
|
|
|
|
|
phi->operands[i++] = get_phi_operand(ctx, src.second, dst.regClass());
|
2019-09-17 13:22:17 +02:00
|
|
|
phi->definitions[0] = Definition(dst);
|
|
|
|
|
ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2023-08-12 16:17:15 -04:00
|
|
|
visit_undef(isel_context* ctx, nir_undef_instr* instr)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
Temp dst = get_ssa_temp(ctx, &instr->def);
|
|
|
|
|
|
|
|
|
|
assert(dst.type() == RegType::sgpr);
|
|
|
|
|
|
|
|
|
|
if (dst.size() == 1) {
|
2021-07-13 11:22:46 +02:00
|
|
|
Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
2024-03-25 15:55:27 +01:00
|
|
|
aco_ptr<Instruction> vec{
|
|
|
|
|
create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < dst.size(); i++)
|
2021-07-13 11:22:46 +02:00
|
|
|
vec->operands[i] = Operand::zero();
|
2019-09-17 13:22:17 +02:00
|
|
|
vec->definitions[0] = Definition(dst);
|
|
|
|
|
ctx->block->instructions.emplace_back(std::move(vec));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-07-13 11:52:14 +01:00
|
|
|
void
|
|
|
|
|
visit_jump(isel_context* ctx, nir_jump_instr* instr)
|
|
|
|
|
{
|
2024-07-19 20:03:43 +01:00
|
|
|
end_empty_exec_skip(ctx);
|
|
|
|
|
|
2020-07-13 11:52:14 +01:00
|
|
|
switch (instr->type) {
|
|
|
|
|
case nir_jump_break: emit_loop_break(ctx); break;
|
|
|
|
|
case nir_jump_continue: emit_loop_continue(ctx); break;
|
|
|
|
|
default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2024-10-06 09:38:35 +02:00
|
|
|
void
|
2024-11-04 21:16:16 +01:00
|
|
|
visit_debug_info(isel_context* ctx, nir_instr_debug_info* instr_info)
|
2024-10-06 09:38:35 +02:00
|
|
|
{
|
|
|
|
|
ac_shader_debug_info info;
|
|
|
|
|
memset(&info, 0, sizeof(info));
|
|
|
|
|
|
2024-11-04 21:16:16 +01:00
|
|
|
info.type = ac_shader_debug_info_src_loc;
|
|
|
|
|
if (instr_info->filename)
|
|
|
|
|
info.src_loc.file = strdup(instr_info->filename);
|
|
|
|
|
info.src_loc.line = instr_info->line;
|
|
|
|
|
info.src_loc.column = instr_info->column;
|
|
|
|
|
info.src_loc.spirv_offset = instr_info->spirv_offset;
|
2024-10-06 09:38:35 +02:00
|
|
|
|
|
|
|
|
Builder bld(ctx->program, ctx->block);
|
|
|
|
|
bld.pseudo(aco_opcode::p_debug_info, Operand::c32(ctx->program->debug_info.size()));
|
|
|
|
|
|
|
|
|
|
ctx->program->debug_info.push_back(info);
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
void
|
|
|
|
|
visit_block(isel_context* ctx, nir_block* block)
|
|
|
|
|
{
|
2023-04-14 17:49:46 +01:00
|
|
|
if (ctx->block->kind & block_kind_top_level) {
|
|
|
|
|
Builder bld(ctx->program, ctx->block);
|
2024-01-31 18:44:21 +00:00
|
|
|
for (Temp tmp : ctx->unended_linear_vgprs) {
|
2024-07-25 17:15:15 +02:00
|
|
|
bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
|
2024-01-31 18:44:21 +00:00
|
|
|
}
|
2023-04-14 17:49:46 +01:00
|
|
|
ctx->unended_linear_vgprs.clear();
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-19 20:03:43 +01:00
|
|
|
nir_foreach_phi (instr, block)
|
|
|
|
|
visit_phi(ctx, instr);
|
|
|
|
|
|
|
|
|
|
nir_phi_instr* last_phi = nir_block_last_phi_instr(block);
|
|
|
|
|
begin_empty_exec_skip(ctx, last_phi ? &last_phi->instr : NULL, block);
|
|
|
|
|
|
2022-08-17 00:18:54 +02:00
|
|
|
ctx->block->instructions.reserve(ctx->block->instructions.size() +
|
|
|
|
|
exec_list_length(&block->instr_list) * 2);
|
2019-09-17 13:22:17 +02:00
|
|
|
nir_foreach_instr (instr, block) {
|
2024-11-04 21:16:16 +01:00
|
|
|
if (ctx->shader->has_debug_info)
|
|
|
|
|
visit_debug_info(ctx, nir_instr_get_debug_info(instr));
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
switch (instr->type) {
|
|
|
|
|
case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
|
|
|
|
|
case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
|
|
|
|
|
case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
|
|
|
|
|
case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
|
2024-07-19 20:03:43 +01:00
|
|
|
case nir_instr_type_phi: break;
|
2023-08-15 09:59:06 -05:00
|
|
|
case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
|
2019-09-17 13:22:17 +02:00
|
|
|
case nir_instr_type_deref: break;
|
|
|
|
|
case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
|
2020-08-14 10:42:27 +02:00
|
|
|
default: isel_err(instr, "Unknown NIR instr type");
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
void
|
2019-09-17 13:22:17 +02:00
|
|
|
visit_loop(isel_context* ctx, nir_loop* loop)
|
|
|
|
|
{
|
2021-12-02 10:31:56 +01:00
|
|
|
assert(!nir_loop_has_continue_construct(loop));
|
2020-07-13 11:52:14 +01:00
|
|
|
loop_context lc;
|
|
|
|
|
begin_loop(ctx, &lc);
|
aco/isel: track control flow divergence in loops more accurately
We introduce two new variables, cf_context::in_divergent_cf and
cf_context::parent_loop.has_divergent_break, in order to determine
whether there is any other invocations on a different CF path.
Totals from 1305 (1.64% of 79395) affected shaders: (Navi31)
Instrs: 659211 -> 657815 (-0.21%); split: -0.22%, +0.01%
CodeSize: 3483228 -> 3477960 (-0.15%); split: -0.16%, +0.01%
VGPRs: 68820 -> 48048 (-30.18%)
Latency: 14197750 -> 14170767 (-0.19%); split: -0.26%, +0.07%
InvThroughput: 1619103 -> 1619826 (+0.04%); split: -0.02%, +0.07%
VClause: 12384 -> 12350 (-0.27%)
SClause: 26693 -> 26844 (+0.57%); split: -0.01%, +0.57%
Copies: 44994 -> 43535 (-3.24%); split: -3.26%, +0.02%
PreSGPRs: 49007 -> 48907 (-0.20%)
PreVGPRs: 32171 -> 32121 (-0.16%)
VALU: 349984 -> 349857 (-0.04%); split: -0.04%, +0.00%
SALU: 84252 -> 83988 (-0.31%); split: -0.32%, +0.00%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33206>
2025-01-24 12:32:47 +01:00
|
|
|
ctx->cf_info.parent_loop.has_divergent_break =
|
|
|
|
|
loop->divergent_break && nir_loop_first_block(loop)->predecessors->entries > 1;
|
|
|
|
|
ctx->cf_info.in_divergent_cf |= ctx->cf_info.parent_loop.has_divergent_break;
|
2021-06-22 15:23:37 +01:00
|
|
|
|
2024-03-22 18:40:51 +00:00
|
|
|
visit_cf_list(ctx, &loop->body);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-07-13 11:52:14 +01:00
|
|
|
end_loop(ctx, &lc);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
void
|
2020-01-31 16:47:10 +00:00
|
|
|
visit_if(isel_context* ctx, nir_if* if_stmt)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
|
|
|
|
|
Builder bld(ctx->program, ctx->block);
|
2024-03-25 12:05:50 +01:00
|
|
|
aco_ptr<Instruction> branch;
|
2020-01-22 18:21:43 +01:00
|
|
|
if_context ic;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2024-09-10 12:31:27 +02:00
|
|
|
if (!nir_src_is_divergent(&if_stmt->condition)) { /* uniform condition */
|
2019-09-17 13:22:17 +02:00
|
|
|
/**
|
|
|
|
|
* Uniform conditionals are represented in the following way*) :
|
|
|
|
|
*
|
|
|
|
|
* The linear and logical CFG:
|
|
|
|
|
* BB_IF
|
|
|
|
|
* / \
|
|
|
|
|
* BB_THEN (logical) BB_ELSE (logical)
|
|
|
|
|
* \ /
|
|
|
|
|
* BB_ENDIF
|
|
|
|
|
*
|
|
|
|
|
* *) Exceptions may be due to break and continue statements within loops
|
|
|
|
|
* If a break/continue happens within uniform control flow, it branches
|
|
|
|
|
* to the loop exit/entry block. Otherwise, it branches to the next
|
|
|
|
|
* merge block.
|
|
|
|
|
**/
|
|
|
|
|
|
2020-01-22 18:21:43 +01:00
|
|
|
assert(cond.regClass() == ctx->program->lane_mask);
|
2019-11-04 19:28:08 +01:00
|
|
|
cond = bool_to_scalar_condition(ctx, cond);
|
|
|
|
|
|
2020-01-22 18:21:43 +01:00
|
|
|
begin_uniform_if_then(ctx, &ic, cond);
|
2019-09-17 13:22:17 +02:00
|
|
|
visit_cf_list(ctx, &if_stmt->then_list);
|
|
|
|
|
|
2020-01-22 18:21:43 +01:00
|
|
|
begin_uniform_if_else(ctx, &ic);
|
2019-09-17 13:22:17 +02:00
|
|
|
visit_cf_list(ctx, &if_stmt->else_list);
|
|
|
|
|
|
2020-01-22 18:21:43 +01:00
|
|
|
end_uniform_if(ctx, &ic);
|
2019-09-17 13:22:17 +02:00
|
|
|
} else { /* non-uniform condition */
|
|
|
|
|
/**
|
|
|
|
|
* To maintain a logical and linear CFG without critical edges,
|
|
|
|
|
* non-uniform conditionals are represented in the following way*) :
|
|
|
|
|
*
|
|
|
|
|
* The linear CFG:
|
|
|
|
|
* BB_IF
|
|
|
|
|
* / \
|
|
|
|
|
* BB_THEN (logical) BB_THEN (linear)
|
|
|
|
|
* \ /
|
|
|
|
|
* BB_INVERT (linear)
|
|
|
|
|
* / \
|
|
|
|
|
* BB_ELSE (logical) BB_ELSE (linear)
|
|
|
|
|
* \ /
|
|
|
|
|
* BB_ENDIF
|
|
|
|
|
*
|
|
|
|
|
* The logical CFG:
|
|
|
|
|
* BB_IF
|
|
|
|
|
* / \
|
|
|
|
|
* BB_THEN (logical) BB_ELSE (logical)
|
|
|
|
|
* \ /
|
|
|
|
|
* BB_ENDIF
|
|
|
|
|
*
|
|
|
|
|
* *) Exceptions may be due to break and continue statements within loops
|
|
|
|
|
**/
|
|
|
|
|
|
2022-02-23 10:13:54 +01:00
|
|
|
begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
|
2019-09-17 13:22:17 +02:00
|
|
|
visit_cf_list(ctx, &if_stmt->then_list);
|
|
|
|
|
|
2022-02-23 10:13:54 +01:00
|
|
|
begin_divergent_if_else(ctx, &ic, if_stmt->control);
|
2019-09-17 13:22:17 +02:00
|
|
|
visit_cf_list(ctx, &if_stmt->else_list);
|
|
|
|
|
|
|
|
|
|
end_divergent_if(ctx, &ic);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
void
|
2020-01-31 16:39:20 +00:00
|
|
|
visit_cf_list(isel_context* ctx, struct exec_list* list)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2024-07-19 20:03:43 +01:00
|
|
|
if (nir_cf_list_is_empty_block(list))
|
|
|
|
|
return;
|
|
|
|
|
|
2025-01-23 10:07:01 +01:00
|
|
|
bool skipping_empty_exec_old = ctx->skipping_empty_exec;
|
|
|
|
|
if_context empty_exec_skip_old = std::move(ctx->empty_exec_skip);
|
|
|
|
|
ctx->skipping_empty_exec = false;
|
2024-07-19 20:03:43 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
foreach_list_typed (nir_cf_node, node, node, list) {
|
|
|
|
|
switch (node->type) {
|
|
|
|
|
case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
|
2024-03-22 18:40:51 +00:00
|
|
|
case nir_cf_node_if: visit_if(ctx, nir_cf_node_as_if(node)); break;
|
2019-09-17 13:22:17 +02:00
|
|
|
case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
|
2025-07-23 09:17:35 +02:00
|
|
|
default: UNREACHABLE("unimplemented cf list type");
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2024-07-19 20:03:43 +01:00
|
|
|
|
|
|
|
|
end_empty_exec_skip(ctx);
|
2025-01-23 10:07:01 +01:00
|
|
|
ctx->skipping_empty_exec = skipping_empty_exec_old;
|
|
|
|
|
ctx->empty_exec_skip = std::move(empty_exec_skip_old);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
void
|
2022-06-24 16:03:52 +02:00
|
|
|
create_fs_jump_to_epilog(isel_context* ctx)
|
|
|
|
|
{
|
|
|
|
|
Builder bld(ctx->program, ctx->block);
|
2023-11-16 16:53:41 +01:00
|
|
|
std::vector<Operand> exports;
|
2023-11-16 16:54:30 +01:00
|
|
|
unsigned vgpr = 256; /* VGPR 0 */
|
|
|
|
|
|
|
|
|
|
if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
|
|
|
|
|
exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u], PhysReg{vgpr++}));
|
|
|
|
|
|
|
|
|
|
if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
|
|
|
|
|
exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u], PhysReg{vgpr++}));
|
|
|
|
|
|
|
|
|
|
if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
|
|
|
|
|
exports.emplace_back(
|
|
|
|
|
Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u], PhysReg{vgpr++}));
|
|
|
|
|
|
|
|
|
|
PhysReg exports_start(vgpr);
|
2022-06-24 16:03:52 +02:00
|
|
|
|
|
|
|
|
for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
|
|
|
|
|
unsigned color_index = slot - FRAG_RESULT_DATA0;
|
|
|
|
|
unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
|
|
|
|
|
unsigned write_mask = ctx->outputs.mask[slot];
|
|
|
|
|
|
|
|
|
|
if (!write_mask)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
PhysReg color_start(exports_start.reg() + color_index * 4);
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 4; i++) {
|
|
|
|
|
if (!(write_mask & BITFIELD_BIT(i))) {
|
2023-11-16 16:53:41 +01:00
|
|
|
exports.emplace_back(Operand(v1));
|
2022-06-24 16:03:52 +02:00
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
PhysReg chan_reg = color_start.advance(i * 4u);
|
|
|
|
|
Operand chan(ctx->outputs.temps[slot * 4u + i]);
|
|
|
|
|
|
|
|
|
|
if (color_type == ACO_TYPE_FLOAT16) {
|
|
|
|
|
chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
|
|
|
|
|
} else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
|
|
|
|
|
bool sign_ext = color_type == ACO_TYPE_INT16;
|
|
|
|
|
Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
|
|
|
|
|
chan = Operand(tmp);
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-11 11:11:42 +02:00
|
|
|
chan.setPrecolored(chan_reg);
|
2023-11-16 16:53:41 +01:00
|
|
|
exports.emplace_back(chan);
|
2022-06-24 16:03:52 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-07 13:20:27 +01:00
|
|
|
Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.epilog_pc));
|
2022-06-24 16:03:52 +02:00
|
|
|
|
2024-03-25 15:55:27 +01:00
|
|
|
aco_ptr<Instruction> jump{
|
|
|
|
|
create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + exports.size(), 0)};
|
2022-06-24 16:03:52 +02:00
|
|
|
jump->operands[0] = Operand(continue_pc);
|
2023-11-16 16:53:41 +01:00
|
|
|
for (unsigned i = 0; i < exports.size(); i++) {
|
|
|
|
|
jump->operands[i + 1] = exports[i];
|
2022-06-24 16:03:52 +02:00
|
|
|
}
|
|
|
|
|
ctx->block->instructions.emplace_back(std::move(jump));
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
/* Build an operand for shader argument `arg`: the value returned by get_arg(),
 * fixed to the register returned by get_arg_reg() for the same argument.
 */
Operand
get_arg_for_end(isel_context* ctx, struct ac_arg arg)
{
   const Temp value = get_arg(ctx, arg);
   const PhysReg reg = get_arg_reg(ctx->args, arg);
   return Operand(value, reg);
}
|
|
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
void
|
2023-08-11 16:58:36 +08:00
|
|
|
create_fs_end_for_epilog(isel_context* ctx)
|
|
|
|
|
{
|
|
|
|
|
Builder bld(ctx->program, ctx->block);
|
|
|
|
|
|
|
|
|
|
std::vector<Operand> regs;
|
|
|
|
|
|
|
|
|
|
regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.ps.alpha_reference));
|
|
|
|
|
|
|
|
|
|
unsigned vgpr = 256;
|
|
|
|
|
|
|
|
|
|
for (unsigned slot = FRAG_RESULT_DATA0; slot <= FRAG_RESULT_DATA7; slot++) {
|
|
|
|
|
unsigned index = slot - FRAG_RESULT_DATA0;
|
|
|
|
|
unsigned type = (ctx->output_color_types >> (index * 2)) & 0x3;
|
|
|
|
|
unsigned write_mask = ctx->outputs.mask[slot];
|
|
|
|
|
|
|
|
|
|
if (!write_mask)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (type == ACO_TYPE_ANY32) {
|
|
|
|
|
u_foreach_bit (i, write_mask) {
|
|
|
|
|
regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
unsigned mask = (write_mask >> (i * 2)) & 0x3;
|
|
|
|
|
if (!mask)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
unsigned chan = slot * 4 + i * 2;
|
|
|
|
|
Operand lo = mask & 0x1 ? Operand(ctx->outputs.temps[chan]) : Operand(v2b);
|
|
|
|
|
Operand hi = mask & 0x2 ? Operand(ctx->outputs.temps[chan + 1]) : Operand(v2b);
|
|
|
|
|
|
|
|
|
|
Temp dst = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi);
|
|
|
|
|
regs.emplace_back(Operand(dst, PhysReg{vgpr + i}));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
vgpr += 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
|
|
|
|
|
regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4], PhysReg{vgpr++}));
|
|
|
|
|
|
|
|
|
|
if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
|
|
|
|
|
regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4], PhysReg{vgpr++}));
|
|
|
|
|
|
|
|
|
|
if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
|
|
|
|
|
regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4], PhysReg{vgpr++}));
|
|
|
|
|
|
|
|
|
|
build_end_with_regs(ctx, regs);
|
|
|
|
|
|
|
|
|
|
/* Exit WQM mode finally. */
|
|
|
|
|
ctx->program->needs_exact = true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-15 13:51:27 +01:00
|
|
|
void
|
2024-03-25 12:05:50 +01:00
|
|
|
split_arguments(isel_context* ctx, Instruction* startpgm)
|
2019-11-15 13:51:27 +01:00
|
|
|
{
|
|
|
|
|
/* Split all arguments except for the first (ring_offsets) and the last
|
|
|
|
|
* (exec) so that the dead channels don't stay live throughout the program.
|
|
|
|
|
*/
|
2021-02-04 16:01:44 +01:00
|
|
|
for (int i = 1; i < startpgm->definitions.size(); i++) {
|
2019-11-15 13:51:27 +01:00
|
|
|
if (startpgm->definitions[i].regClass().size() > 1) {
|
|
|
|
|
emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
|
|
|
|
|
startpgm->definitions[i].regClass().size());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-09 20:51:45 +00:00
|
|
|
void
|
|
|
|
|
setup_fp_mode(isel_context* ctx, nir_shader* shader)
|
|
|
|
|
{
|
|
|
|
|
Program* program = ctx->program;
|
|
|
|
|
|
|
|
|
|
unsigned float_controls = shader->info.float_controls_execution_mode;
|
|
|
|
|
|
|
|
|
|
program->next_fp_mode.must_flush_denorms32 =
|
|
|
|
|
float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
|
|
|
|
|
program->next_fp_mode.must_flush_denorms16_64 =
|
|
|
|
|
float_controls &
|
|
|
|
|
(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
|
|
|
|
|
|
|
|
|
|
program->next_fp_mode.care_about_round32 =
|
|
|
|
|
float_controls &
|
|
|
|
|
(FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
|
|
|
|
|
|
|
|
|
|
program->next_fp_mode.care_about_round16_64 =
|
|
|
|
|
float_controls &
|
|
|
|
|
(FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
|
|
|
|
|
|
2020-05-19 13:24:45 +01:00
|
|
|
/* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
|
|
|
|
|
* the precision seems needed for Wolfenstein: Youngblood to render correctly */
|
2019-11-09 20:51:45 +00:00
|
|
|
if (program->next_fp_mode.must_flush_denorms16_64)
|
|
|
|
|
program->next_fp_mode.denorm16_64 = 0;
|
|
|
|
|
else
|
|
|
|
|
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
|
|
|
|
|
|
|
|
|
|
/* preserving fp32 denorms is expensive, so only do it if asked */
|
|
|
|
|
if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
|
|
|
|
|
program->next_fp_mode.denorm32 = fp_denorm_keep;
|
|
|
|
|
else
|
|
|
|
|
program->next_fp_mode.denorm32 = 0;
|
|
|
|
|
|
|
|
|
|
if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
|
|
|
|
|
program->next_fp_mode.round32 = fp_round_tz;
|
|
|
|
|
else
|
|
|
|
|
program->next_fp_mode.round32 = fp_round_ne;
|
|
|
|
|
|
|
|
|
|
if (float_controls &
|
|
|
|
|
(FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
|
|
|
|
|
program->next_fp_mode.round16_64 = fp_round_tz;
|
|
|
|
|
else
|
|
|
|
|
program->next_fp_mode.round16_64 = fp_round_ne;
|
|
|
|
|
|
|
|
|
|
ctx->block->fp_mode = program->next_fp_mode;
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-14 10:04:40 +02:00
|
|
|
/* Turn entry `i` of the merged_wave_info argument into a lane mask.
 *
 * lanecount_to_mask() only looks at s0.byte[i].[6:0], so the argument is passed
 * on unmodified together with the bit offset; neither s_bfe nor s_and is
 * needed here.
 */
Temp
merged_wave_info_to_mask(isel_context* ctx, unsigned i)
{
   const unsigned bit_offset = i * 8u;
   Temp wave_info = get_arg(ctx, ctx->args->merged_wave_info);

   return lanecount_to_mask(ctx, wave_info, bit_offset);
}
|
|
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
void
|
2023-07-16 12:16:38 +02:00
|
|
|
insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
|
|
|
|
|
{
|
2024-01-17 15:58:13 +01:00
|
|
|
unsigned src_count = 0;
|
|
|
|
|
for (unsigned i = 0; i < ctx.args->arg_count; i++)
|
|
|
|
|
src_count += !!BITSET_TEST(ctx.output_args, i);
|
|
|
|
|
|
2024-03-25 15:55:27 +01:00
|
|
|
Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
|
2023-07-16 12:16:38 +02:00
|
|
|
ctx.block->instructions.emplace_back(ret);
|
|
|
|
|
|
2024-01-17 15:58:13 +01:00
|
|
|
src_count = 0;
|
|
|
|
|
for (unsigned i = 0; i < ctx.args->arg_count; i++) {
|
|
|
|
|
if (!BITSET_TEST(ctx.output_args, i))
|
|
|
|
|
continue;
|
|
|
|
|
|
2023-07-16 12:16:38 +02:00
|
|
|
enum ac_arg_regfile file = ctx.args->args[i].file;
|
|
|
|
|
unsigned size = ctx.args->args[i].size;
|
|
|
|
|
unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
|
|
|
|
|
RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
|
|
|
|
|
Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
|
|
|
|
|
: Operand(PhysReg{reg}, type);
|
2024-01-17 15:58:13 +01:00
|
|
|
ret->operands[src_count] = op;
|
|
|
|
|
src_count++;
|
2023-07-16 12:16:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Builder bld(ctx.program, ctx.block);
|
|
|
|
|
bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-10 19:56:43 +01:00
|
|
|
void
|
|
|
|
|
select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
|
|
|
|
|
const struct ac_shader_args* args)
|
|
|
|
|
{
|
|
|
|
|
for (unsigned i = 0; i < shader_count; i++) {
|
|
|
|
|
if (i) {
|
|
|
|
|
ctx.block = ctx.program->create_and_insert_block();
|
|
|
|
|
ctx.block->kind = block_kind_top_level | block_kind_resume;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
nir_shader* nir = shaders[i];
|
|
|
|
|
init_context(&ctx, nir);
|
|
|
|
|
setup_fp_mode(&ctx, nir);
|
|
|
|
|
|
2024-03-25 12:05:50 +01:00
|
|
|
Instruction* startpgm = add_startpgm(&ctx);
|
2023-03-10 19:56:43 +01:00
|
|
|
append_logical_start(ctx.block);
|
|
|
|
|
split_arguments(&ctx, startpgm);
|
|
|
|
|
visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
|
2023-09-13 16:40:55 +02:00
|
|
|
append_logical_end(ctx.block);
|
|
|
|
|
ctx.block->kind |= block_kind_uniform;
|
2023-03-10 19:56:43 +01:00
|
|
|
|
2023-07-16 12:16:38 +02:00
|
|
|
/* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
|
|
|
|
|
* shader without shader calls.
|
|
|
|
|
*/
|
|
|
|
|
if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
|
|
|
|
|
insert_rt_jump_next(ctx, args);
|
2023-03-10 19:56:43 +01:00
|
|
|
|
|
|
|
|
cleanup_context(&ctx);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
|
aco: insert a single p_end_wqm after the last derivative calculation
This new instruction replaces p_wqm.
Totals from 28065 (36.65% of 76572) affected shaders: (GFX11)
MaxWaves: 823922 -> 823952 (+0.00%); split: +0.01%, -0.01%
Instrs: 22221375 -> 22180465 (-0.18%); split: -0.26%, +0.08%
CodeSize: 117310676 -> 117040684 (-0.23%); split: -0.30%, +0.07%
VGPRs: 1183476 -> 1186656 (+0.27%); split: -0.19%, +0.46%
SpillSGPRs: 2305 -> 2302 (-0.13%)
Latency: 176559310 -> 176427793 (-0.07%); split: -0.21%, +0.14%
InvThroughput: 26245204 -> 26195550 (-0.19%); split: -0.26%, +0.07%
VClause: 368058 -> 369460 (+0.38%); split: -0.21%, +0.59%
SClause: 857077 -> 842588 (-1.69%); split: -2.06%, +0.37%
Copies: 1245650 -> 1249434 (+0.30%); split: -0.33%, +0.63%
Branches: 394837 -> 396070 (+0.31%); split: -0.01%, +0.32%
PreSGPRs: 1019139 -> 1019567 (+0.04%); split: -0.02%, +0.06%
PreVGPRs: 925739 -> 931860 (+0.66%); split: -0.00%, +0.66%
Changes are due to scheduling and re-enabling cross-lane optimizations.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25038>
2023-09-02 11:14:33 +02:00
|
|
|
finish_program(&ctx);
|
2023-03-10 19:56:43 +01:00
|
|
|
}
|
|
|
|
|
|
2025-05-14 12:02:21 +02:00
|
|
|
static void
|
2023-08-24 09:41:46 +02:00
|
|
|
create_merged_jump_to_epilog(isel_context* ctx)
|
2023-08-15 15:20:16 +02:00
|
|
|
{
|
|
|
|
|
Builder bld(ctx->program, ctx->block);
|
|
|
|
|
std::vector<Operand> regs;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < ctx->args->arg_count; i++) {
|
|
|
|
|
if (!ctx->args->args[i].preserved)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
const enum ac_arg_regfile file = ctx->args->args[i].file;
|
|
|
|
|
const unsigned reg = ctx->args->args[i].offset;
|
|
|
|
|
|
|
|
|
|
Operand op(ctx->arg_temps[i]);
|
2024-09-11 11:11:42 +02:00
|
|
|
op.setPrecolored(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
|
2023-08-15 15:20:16 +02:00
|
|
|
regs.emplace_back(op);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Temp continue_pc =
|
|
|
|
|
convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
|
|
|
|
|
|
2024-03-25 15:55:27 +01:00
|
|
|
aco_ptr<Instruction> jump{
|
|
|
|
|
create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
|
2023-08-15 15:20:16 +02:00
|
|
|
jump->operands[0] = Operand(continue_pc);
|
|
|
|
|
for (unsigned i = 0; i < regs.size(); i++) {
|
|
|
|
|
jump->operands[i + 1] = regs[i];
|
|
|
|
|
}
|
|
|
|
|
ctx->block->instructions.emplace_back(std::move(jump));
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-14 15:13:06 +02:00
|
|
|
void
|
2023-08-29 14:21:08 +08:00
|
|
|
create_end_for_merged_shader(isel_context* ctx)
|
|
|
|
|
{
|
|
|
|
|
std::vector<Operand> regs;
|
|
|
|
|
|
|
|
|
|
unsigned max_args;
|
|
|
|
|
if (ctx->stage.sw == SWStage::VS) {
|
|
|
|
|
assert(ctx->args->vertex_id.used);
|
|
|
|
|
max_args = ctx->args->vertex_id.arg_index;
|
|
|
|
|
} else {
|
|
|
|
|
assert(ctx->stage.sw == SWStage::TES);
|
|
|
|
|
assert(ctx->args->tes_u.used);
|
|
|
|
|
max_args = ctx->args->tes_u.arg_index;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct ac_arg arg;
|
|
|
|
|
arg.used = true;
|
|
|
|
|
|
|
|
|
|
for (arg.arg_index = 0; arg.arg_index < max_args; arg.arg_index++)
|
|
|
|
|
regs.emplace_back(get_arg_for_end(ctx, arg));
|
|
|
|
|
|
|
|
|
|
build_end_with_regs(ctx, regs);
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
void
|
2023-10-11 14:35:42 +08:00
|
|
|
select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_endpgm,
|
|
|
|
|
const bool need_barrier, if_context* ic_merged_wave_info,
|
|
|
|
|
const bool check_merged_wave_info, const bool endif_merged_wave_info)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
2023-06-20 14:03:34 +02:00
|
|
|
init_context(&ctx, nir);
|
|
|
|
|
setup_fp_mode(&ctx, nir);
|
2023-03-10 19:56:43 +01:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
Program* program = ctx.program;
|
2023-03-10 19:56:43 +01:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
if (need_startpgm) {
|
|
|
|
|
/* Needs to be after init_context() for FS. */
|
2024-03-25 12:05:50 +01:00
|
|
|
Instruction* startpgm = add_startpgm(&ctx);
|
2024-07-19 13:23:29 +01:00
|
|
|
|
|
|
|
|
if (!program->info.vs.has_prolog &&
|
|
|
|
|
(program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
|
|
|
|
|
Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, 0x3u);
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
append_logical_start(ctx.block);
|
|
|
|
|
split_arguments(&ctx, startpgm);
|
|
|
|
|
}
|
2019-11-09 20:51:45 +00:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
|
|
|
|
|
!program->stage.has(SWStage::GS)) {
|
|
|
|
|
/* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
|
|
|
|
|
* s_sendmsg(GS_ALLOC_REQ).
|
|
|
|
|
*/
|
2024-03-19 15:21:00 +01:00
|
|
|
Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, 0u);
|
2023-06-20 14:03:34 +02:00
|
|
|
}
|
2020-03-09 12:44:03 +01:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
if (check_merged_wave_info) {
|
|
|
|
|
const unsigned i =
|
|
|
|
|
nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
|
|
|
|
|
const Temp cond = merged_wave_info_to_mask(&ctx, i);
|
|
|
|
|
begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (need_barrier) {
|
|
|
|
|
const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
|
|
|
|
|
program->wave_size % nir->info.tess.tcs_vertices_out == 0
|
|
|
|
|
? scope_subgroup
|
|
|
|
|
: scope_workgroup;
|
2020-03-09 12:44:03 +01:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
Builder(ctx.program, ctx.block)
|
|
|
|
|
.barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
|
|
|
|
|
scope);
|
|
|
|
|
}
|
2021-04-08 13:38:13 +02:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
nir_function_impl* func = nir_shader_get_entrypoint(nir);
|
|
|
|
|
visit_cf_list(&ctx, &func->body);
|
|
|
|
|
|
2024-09-12 15:46:29 +02:00
|
|
|
if (ctx.program->info.ps.has_epilog) {
|
2023-08-02 08:53:18 +02:00
|
|
|
if (ctx.stage == fragment_fs) {
|
2023-08-11 16:58:36 +08:00
|
|
|
if (ctx.options->is_opengl)
|
|
|
|
|
create_fs_end_for_epilog(&ctx);
|
|
|
|
|
else
|
|
|
|
|
create_fs_jump_to_epilog(&ctx);
|
2023-06-20 14:03:34 +02:00
|
|
|
|
2023-08-02 08:53:18 +02:00
|
|
|
/* FS epilogs always have at least one color/null export. */
|
|
|
|
|
ctx.program->has_color_exports = true;
|
|
|
|
|
}
|
2023-06-20 14:03:34 +02:00
|
|
|
}
|
|
|
|
|
|
2023-08-22 20:36:57 +02:00
|
|
|
if (endif_merged_wave_info) {
|
|
|
|
|
begin_divergent_if_else(&ctx, ic_merged_wave_info);
|
|
|
|
|
end_divergent_if(&ctx, ic_merged_wave_info);
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-29 19:29:02 +08:00
|
|
|
bool is_first_stage_of_merged_shader = false;
|
|
|
|
|
|
2023-09-01 15:21:11 +08:00
|
|
|
if (ctx.program->info.merged_shader_compiled_separately &&
|
2023-08-28 15:53:40 +02:00
|
|
|
(ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
|
2023-08-15 15:20:16 +02:00
|
|
|
assert(program->gfx_level >= GFX9);
|
2023-08-29 14:21:08 +08:00
|
|
|
if (ctx.options->is_opengl)
|
|
|
|
|
create_end_for_merged_shader(&ctx);
|
|
|
|
|
else
|
|
|
|
|
create_merged_jump_to_epilog(&ctx);
|
2023-08-29 19:29:02 +08:00
|
|
|
|
|
|
|
|
is_first_stage_of_merged_shader = true;
|
2023-08-15 15:20:16 +02:00
|
|
|
}
|
|
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
cleanup_context(&ctx);
|
2023-10-11 14:35:42 +08:00
|
|
|
|
|
|
|
|
if (need_endpgm) {
|
|
|
|
|
program->config->float_mode = program->blocks[0].fp_mode.val;
|
|
|
|
|
|
|
|
|
|
append_logical_end(ctx.block);
|
|
|
|
|
ctx.block->kind |= block_kind_uniform;
|
|
|
|
|
|
2024-09-12 15:46:29 +02:00
|
|
|
if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) ||
|
2023-10-11 14:35:42 +08:00
|
|
|
(nir->info.stage == MESA_SHADER_TESS_CTRL && program->gfx_level >= GFX9)) {
|
|
|
|
|
Builder(program, ctx.block).sopp(aco_opcode::s_endpgm);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
finish_program(&ctx);
|
|
|
|
|
}
|
2023-06-20 14:03:34 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
|
|
|
|
|
{
|
|
|
|
|
if_context ic_merged_wave_info;
|
|
|
|
|
const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
|
2025-04-19 08:07:12 -04:00
|
|
|
const bool hs = ctx.stage.hw == AC_HW_HULL_SHADER;
|
2023-06-20 14:03:34 +02:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < shader_count; i++) {
|
|
|
|
|
nir_shader* nir = shaders[i];
|
|
|
|
|
|
|
|
|
|
/* We always need to insert p_startpgm at the beginning of the first shader. */
|
|
|
|
|
const bool need_startpgm = i == 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2023-10-11 14:35:42 +08:00
|
|
|
/* Need to handle program end for last shader stage. */
|
|
|
|
|
const bool need_endpgm = i == shader_count - 1;
|
|
|
|
|
|
2020-02-24 15:27:43 +01:00
|
|
|
/* In a merged VS+TCS HS, the VS implementation can be completely empty. */
|
|
|
|
|
nir_function_impl* func = nir_shader_get_entrypoint(nir);
|
2023-06-20 14:03:34 +02:00
|
|
|
const bool empty_shader =
|
2020-02-24 15:27:43 +01:00
|
|
|
nir_cf_list_is_empty_block(&func->body) &&
|
|
|
|
|
((nir->info.stage == MESA_SHADER_VERTEX &&
|
|
|
|
|
(ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
|
|
|
|
|
(nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
/* See if we need to emit a check of the merged wave info SGPR. */
|
|
|
|
|
const bool check_merged_wave_info =
|
2025-04-19 08:07:12 -04:00
|
|
|
ctx.tcs_in_out_eq ? i == 0
|
|
|
|
|
: (shader_count >= 2 && !empty_shader && ((!ngg_gs && !hs) || i != 1));
|
|
|
|
|
const bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
/* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
|
|
|
|
|
const bool tcs_skip_barrier =
|
2024-10-02 16:48:39 -04:00
|
|
|
ctx.stage == vertex_tess_control_hs && !ctx.any_tcs_inputs_via_lds;
|
2023-03-30 09:32:58 +08:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
/* A barrier is usually needed at the beginning of the second shader, with exceptions. */
|
|
|
|
|
const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
|
2020-01-23 17:50:25 +01:00
|
|
|
|
2023-10-11 14:35:42 +08:00
|
|
|
select_shader(ctx, nir, need_startpgm, need_endpgm, need_barrier, &ic_merged_wave_info,
|
2023-06-20 14:03:34 +02:00
|
|
|
check_merged_wave_info, endif_merged_wave_info);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-03-17 13:43:08 +01:00
|
|
|
if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
|
2023-06-20 14:03:34 +02:00
|
|
|
/* Special handling when TCS input and output patch size is the same.
|
|
|
|
|
* Outputs of the previous stage are inputs to the next stage.
|
|
|
|
|
*/
|
2020-03-17 13:43:08 +01:00
|
|
|
ctx.inputs = ctx.outputs;
|
2020-03-24 15:46:55 +01:00
|
|
|
ctx.outputs = shader_io_state();
|
2020-03-17 13:43:08 +01:00
|
|
|
}
|
2023-06-20 14:03:34 +02:00
|
|
|
}
|
|
|
|
|
}
|
2020-11-02 14:01:38 +01:00
|
|
|
|
2023-06-20 14:03:34 +02:00
|
|
|
} /* end namespace */
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
|
|
|
|
|
ac_shader_config* config, const struct aco_compiler_options* options,
|
|
|
|
|
const struct aco_shader_info* info, const struct ac_shader_args* args)
|
|
|
|
|
{
|
|
|
|
|
isel_context ctx =
|
2023-08-03 17:08:17 +08:00
|
|
|
setup_isel_context(program, shader_count, shaders, config, options, info, args);
|
2023-06-20 14:03:34 +02:00
|
|
|
|
|
|
|
|
if (ctx.stage == raytracing_cs)
|
|
|
|
|
return select_program_rt(ctx, shader_count, shaders, args);
|
|
|
|
|
|
|
|
|
|
if (shader_count >= 2) {
|
2025-06-30 16:11:42 +02:00
|
|
|
program->needs_fp_mode_insertion = true;
|
2023-06-20 14:03:34 +02:00
|
|
|
select_program_merged(ctx, shader_count, shaders);
|
|
|
|
|
} else {
|
2023-08-15 15:20:16 +02:00
|
|
|
bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
|
|
|
|
|
if_context ic_merged_wave_info;
|
|
|
|
|
|
2023-08-24 09:41:46 +02:00
|
|
|
/* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
|
2023-09-01 15:21:11 +08:00
|
|
|
if (ctx.program->info.merged_shader_compiled_separately) {
|
2023-08-15 15:20:16 +02:00
|
|
|
assert(ctx.program->gfx_level >= GFX9);
|
2025-06-30 16:11:42 +02:00
|
|
|
program->needs_fp_mode_insertion = true;
|
2023-08-28 15:53:40 +02:00
|
|
|
if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
|
2023-08-15 15:20:16 +02:00
|
|
|
check_merged_wave_info = endif_merged_wave_info = true;
|
|
|
|
|
} else {
|
2023-08-28 15:53:40 +02:00
|
|
|
const bool ngg_gs =
|
|
|
|
|
ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
|
|
|
|
|
assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
|
|
|
|
|
check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
|
|
|
|
|
need_barrier = !ngg_gs;
|
2023-08-15 15:20:16 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-11 14:35:42 +08:00
|
|
|
select_shader(ctx, shaders[0], true, true, need_barrier, &ic_merged_wave_info,
|
2023-08-15 15:20:16 +02:00
|
|
|
check_merged_wave_info, endif_merged_wave_info);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
2019-11-15 11:31:03 +00:00
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
} // namespace aco
|