mesa/src/intel/compiler/brw/brw_nir.c
Arkady Shlykov 7f7ba20cca brw: Implement divergent atomics fusion optimization (single message approach)
For an atomic with a divergent addr generates a CFG grouping the same addrs
values together and emits a single atomic with fused data covering
the subgroup. Lanes with other addr values perform a default atomic.

Co-authored-by: Jhanani Thiagarajan <jhanani.thiagarajan@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40631>
2026-04-03 12:17:01 +00:00

3822 lines
128 KiB
C

/*
* Copyright © 2014 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "intel_nir.h"
#include "brw_nir.h"
#include "brw_private.h"
#include "brw_sampler.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "dev/intel_debug.h"
#include "util/sparse_bitset.h"
/**
* Returns the minimum number of vec4 elements needed to pack a type.
*
* For simple types, it will return 1 (a single vec4); for matrices, the
* number of columns; for array and struct, the sum of the vec4_size of
* each of its elements; and for sampler and atomic, zero.
*
* This method is useful to calculate how much register space is needed to
* store a particular type.
*/
int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
return glsl_count_attribute_slots(type, false);
}
static bool
is_input(nir_intrinsic_instr *intrin)
{
return intrin->intrinsic == nir_intrinsic_load_input ||
intrin->intrinsic == nir_intrinsic_load_per_primitive_input ||
intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
intrin->intrinsic == nir_intrinsic_load_interpolated_input;
}
static bool
is_output(nir_intrinsic_instr *intrin)
{
return intrin->intrinsic == nir_intrinsic_load_output ||
intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
intrin->intrinsic == nir_intrinsic_load_per_view_output ||
intrin->intrinsic == nir_intrinsic_store_output ||
intrin->intrinsic == nir_intrinsic_store_per_vertex_output ||
intrin->intrinsic == nir_intrinsic_store_per_view_output;
}
static bool
is_per_primitive(nir_intrinsic_instr *intrin)
{
return intrin->intrinsic == nir_intrinsic_load_per_primitive_input ||
intrin->intrinsic == nir_intrinsic_load_per_primitive_output ||
intrin->intrinsic == nir_intrinsic_store_per_primitive_output;
}
static nir_variable_mode
io_mode(nir_intrinsic_instr *io)
{
return is_input(io) ? nir_var_shader_in : nir_var_shader_out;
}
/**
* Given an URB offset in 32-bit units, determine whether (offset % 4)
* is statically known. If so, add this to the value of first_component.
*/
static bool
io_vec4_static_mod(nir_def *offset_32b, unsigned *first_component)
{
unsigned mod;
const bool mod_known =
nir_mod_analysis(nir_get_scalar(offset_32b, 0), nir_type_uint, 4, &mod);
if (mod_known)
*first_component += mod;
return mod_known;
}
static unsigned
io_component(nir_intrinsic_instr *io,
const struct brw_lower_urb_cb_data *cb_data)
{
unsigned c = nir_intrinsic_has_component(io) ?
nir_intrinsic_component(io) : 0;
if (is_per_primitive(io)) {
/* Extract the 32-bit component index from the byte offset */
const nir_io_semantics sem = nir_intrinsic_io_semantics(io);
const int offset = cb_data->per_primitive_byte_offsets[sem.location];
assert(offset != -1);
c += (offset % 16) / 4;
} else if (nir_intrinsic_has_io_semantics(io)) {
const nir_io_semantics sem = nir_intrinsic_io_semantics(io);
/* The VUE header stores Primitive Shading Rate (.x), Layer (.y),
* Viewport (.z), and Point Size (.w) in a single vec4.
*/
if (sem.location == VARYING_SLOT_LAYER)
c += 1;
else if (sem.location == VARYING_SLOT_VIEWPORT)
c += 2;
else if (sem.location == VARYING_SLOT_PSIZ)
c += 3;
}
return c;
}
static unsigned
io_base_slot(nir_intrinsic_instr *io,
const struct brw_lower_urb_cb_data *cb_data)
{
if (io->intrinsic == nir_intrinsic_load_task_payload ||
io->intrinsic == nir_intrinsic_store_task_payload)
return nir_intrinsic_base(io) / 16; /* bytes to vec4 slots */
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(io);
if (is_per_primitive(io)) {
if (io_sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
return 0;
const int offset = cb_data->per_primitive_byte_offsets[io_sem.location];
assert(offset != -1);
return (cb_data->per_primitive_offset + offset) / 16;
} else if (cb_data->per_primitive_byte_offsets &&
io_sem.location == VARYING_SLOT_PRIMITIVE_COUNT) {
return 0;
} else {
const int slot = cb_data->varying_to_slot[io_sem.location];
assert(slot != -1);
return slot + cb_data->per_vertex_offset / 16;
}
}
static nir_def *
urb_offset(nir_builder *b,
const struct brw_lower_urb_cb_data *cb_data,
nir_intrinsic_instr *io)
{
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(io);
nir_def *offset = nir_get_io_offset_src(io)->ssa;
/* Convert vec4 slot offset to 32-bit dwords */
if (!cb_data->vec4_access)
offset = nir_ishl_imm(b, offset, 2);
nir_src *index = nir_get_io_arrayed_index_src(io);
if (is_per_primitive(io)) {
const unsigned stride =
io_sem.location == VARYING_SLOT_PRIMITIVE_INDICES
? cb_data->per_primitive_indices_stride / 4
: cb_data->per_primitive_stride / 4;
offset = nir_iadd(b, offset, nir_imul_imm(b, index->ssa, stride));
} else if (index) {
const unsigned unit = cb_data->vec4_access ? 16 : 4;
nir_def *index_offset = cb_data->dynamic_tes
? nir_imul(b, index->ssa, intel_nir_tess_field(b, PER_VERTEX_SLOTS))
: nir_imul_imm(b, index->ssa, cb_data->per_vertex_stride / unit);
offset = nir_iadd(b, offset, index_offset);
/* In the Tessellation evaluation shader, reposition the offset of
* builtins when using separate layout.
*/
if (cb_data->dynamic_tes) {
assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(io);
const bool builtin = io_sem.location < VARYING_SLOT_VAR0;
const int old_base = builtin ? cb_data->tes_builtins_slot_offset
: cb_data->tes_per_patch_slots;
nir_def *new_base =
builtin ? intel_nir_tess_field(b, BUILTINS)
: intel_nir_tess_field(b, PER_PATCH_SLOTS);
offset = nir_iadd(b, offset, nir_iadd_imm(b, new_base, -old_base));
}
}
return offset;
}
static nir_def *
input_handle(nir_builder *b, nir_intrinsic_instr *intrin)
{
const enum mesa_shader_stage stage = b->shader->info.stage;
nir_src *vertex = nir_get_io_arrayed_index_src(intrin);
return stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY ?
nir_load_urb_input_handle_indexed_intel(b, 1, 32, vertex->ssa) :
nir_load_urb_input_handle_intel(b);
}
static nir_def *
output_handle(nir_builder *b)
{
return nir_load_urb_output_handle_intel(b);
}
static nir_def *
load_urb(nir_builder *b,
const struct brw_lower_urb_cb_data *cb_data,
nir_intrinsic_instr *intrin,
nir_def *handle,
nir_def *offset,
enum gl_access_qualifier access)
{
const struct intel_device_info *devinfo = cb_data->devinfo;
const nir_variable_mode mode = io_mode(intrin);
const unsigned bits = intrin->def.bit_size;
unsigned base = io_base_slot(intrin, cb_data);
unsigned first_component = io_component(intrin, cb_data);
if (devinfo->ver >= 20) {
offset = nir_ishl_imm(b, offset, cb_data->vec4_access ? 4 : 2);
return nir_load_urb_lsc_intel(b, intrin->def.num_components, bits,
nir_iadd(b, handle, offset),
16 * base + 4 * first_component,
.access = access, .memory_modes = mode);
}
/* Load a whole vec4 or vec8 and return the desired portion */
nir_component_mask_t mask = nir_component_mask(intrin->def.num_components);
/* If the offset is in vec4 units, do a straightforward load */
if (cb_data->vec4_access) {
assert(intrin->def.num_components <= 4);
/* URB offsets are technically unsigned, and Icelake and earlier seem
* to perform the addition of global and per-slot offsets in a way that
* doesn't handle 32-bit overflow/unsigned wrapping correctly.
*
* Work around this by just including the base in the offset.
*/
if (devinfo->ver <= 11 && !nir_def_is_const(offset)) {
offset = nir_iadd_imm(b, offset, base);
base = 0;
}
nir_def *load =
nir_load_urb_vec4_intel(b, 4, bits, handle, offset,
.base = base, .access = access,
.memory_modes = mode);
return nir_channels(b, load, mask << first_component);
}
/* Otherwise, the offset is in 32-bit units. Split it into a vec4-aligned
* slot offset and a 32-bit component offset.
*/
nir_def *mod = nir_iand_imm(b, offset, 0x3);
nir_def *vec4_offset = nir_ishr_imm(b, offset, 2);
const bool static_mod = io_vec4_static_mod(offset, &first_component);
const bool single_vec4 = (static_mod || intrin->def.num_components == 1)
&& first_component + intrin->def.num_components <= 4;
nir_def *load =
nir_load_urb_vec4_intel(b, single_vec4 ? 4 : 8, bits, handle,
vec4_offset, .base = base, .access = access,
.memory_modes = mode);
if (static_mod) {
return nir_channels(b, load, mask << first_component);
} else {
nir_def *comps[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < intrin->def.num_components; i++) {
comps[i] =
nir_vector_extract(b, load,
nir_iadd_imm(b, mod, first_component + i));
}
return nir_vec(b, comps, intrin->def.num_components);
}
}
static void
store_urb(nir_builder *b,
const struct brw_lower_urb_cb_data *cb_data,
nir_intrinsic_instr *intrin,
nir_def *offset)
{
const struct intel_device_info *devinfo = cb_data->devinfo;
const unsigned base = io_base_slot(intrin, cb_data);
unsigned first_component = io_component(intrin, cb_data);
unsigned mask = nir_intrinsic_write_mask(intrin);
nir_def *src = intrin->src[0].ssa;
if (devinfo->ver >= 20) {
offset = nir_ishl_imm(b, offset, cb_data->vec4_access ? 4 : 2);
nir_def *addr = nir_iadd(b, output_handle(b), offset);
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
const unsigned cur_mask = BITFIELD_MASK(count) << start;
const unsigned cur_base = 16 * base + 4 * (start + first_component);
nir_store_urb_lsc_intel(b, nir_channels(b, src, cur_mask), addr,
.base = cur_base);
}
return;
}
nir_def *channel_mask = nir_imm_int(b, mask);
const bool static_mod = cb_data->vec4_access ||
io_vec4_static_mod(offset, &first_component);
if (static_mod) {
src = nir_shift_channels(b, src, first_component,
align(src->num_components + first_component, 4));
channel_mask = nir_ishl_imm(b, channel_mask, first_component);
} else {
offset = nir_iadd_imm(b, offset, first_component);
nir_def *undef = nir_undef(b, 1, src->bit_size);
nir_def *mod = nir_iand_imm(b, offset, 0x3);
channel_mask = nir_ishl(b, channel_mask, mod);
nir_def *comps[8];
for (unsigned i = 0; i < 8; i++) {
nir_def *cond = nir_i2b(b, nir_iand_imm(b, channel_mask, 1u << i));
nir_def *src_idx = nir_imax_imm(b, nir_isub_imm(b, i, mod), 0);
nir_def *src_comp = src->num_components == 1 ? src :
nir_vector_extract(b, src, src_idx);
comps[i] = nir_bcsel(b, cond, src_comp, undef);
}
src = nir_vec(b, comps, 8);
}
nir_def *vec4_offset =
cb_data->vec4_access ? offset : nir_ishr_imm(b, offset, 2);
nir_store_urb_vec4_intel(b, src, output_handle(b), vec4_offset,
channel_mask, .base = base);
}
static nir_def *
load_push_input(nir_builder *b, nir_intrinsic_instr *io, unsigned byte_offset)
{
return nir_load_attribute_payload_intel(b, io->def.num_components,
io->def.bit_size,
nir_imm_int(b, byte_offset));
}
static nir_def *
try_load_push_input(nir_builder *b,
const struct brw_lower_urb_cb_data *cb_data,
nir_intrinsic_instr *io,
nir_def *offset)
{
const enum mesa_shader_stage stage = b->shader->info.stage;
if (!nir_def_is_const(offset))
return NULL;
const unsigned offset_unit = cb_data->vec4_access ? 16 : 4;
uint32_t byte_offset =
16 * io_base_slot(io, cb_data) + 4 * io_component(io, cb_data) +
offset_unit * nir_src_as_uint(nir_src_for_ssa(offset));
assert((byte_offset % 4) == 0);
if (byte_offset >= cb_data->max_push_bytes)
return NULL;
if (stage == MESA_SHADER_GEOMETRY) {
/* GS vertex index isn't part of the offset */
nir_src *index = nir_get_io_arrayed_index_src(io);
if (!nir_src_is_const(*index))
return NULL;
/* GS push inputs still use load_per_vertex_input */
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(io);
const int slot = cb_data->varying_to_slot[io_sem.location];
assert(slot != -1);
nir_intrinsic_set_base(io, slot);
nir_intrinsic_set_component(io, io_component(io, cb_data));
return &io->def;
}
return load_push_input(b, io, byte_offset);
}
static bool
lower_urb_inputs(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
const struct brw_lower_urb_cb_data *cb_data = data;
if (intrin->intrinsic == nir_intrinsic_load_input ||
intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
b->cursor = nir_before_instr(&intrin->instr);
b->constant_fold_alu = true;
nir_def *offset = urb_offset(b, cb_data, intrin);
nir_def *load = try_load_push_input(b, cb_data, intrin, offset);
if (!load) {
load = load_urb(b, cb_data, intrin, input_handle(b, intrin), offset,
ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE);
}
if (load != &intrin->def)
nir_def_replace(&intrin->def, load);
return true;
}
return false;
}
static bool
lower_urb_outputs(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
const struct brw_lower_urb_cb_data *cb_data = data;
b->cursor = nir_before_instr(&intrin->instr);
b->constant_fold_alu = true;
nir_def *load = NULL;
switch (intrin->intrinsic) {
case nir_intrinsic_load_output:
case nir_intrinsic_load_per_vertex_output:
case nir_intrinsic_load_per_primitive_output:
load = load_urb(b, cb_data, intrin, output_handle(b),
urb_offset(b, cb_data, intrin), 0);
break;
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_vertex_output:
case nir_intrinsic_store_per_primitive_output:
store_urb(b, cb_data, intrin, urb_offset(b, cb_data, intrin));
break;
case nir_intrinsic_load_per_view_output:
case nir_intrinsic_store_per_view_output:
UNREACHABLE("should have been lowered");
default:
return false;
}
if (load)
nir_def_replace(&intrin->def, load);
else
nir_instr_remove(&intrin->instr);
return true;
}
bool
brw_nir_lower_inputs_to_urb_intrinsics(nir_shader *nir,
const struct brw_lower_urb_cb_data *cd)
{
return nir_shader_intrinsics_pass(nir, lower_urb_inputs,
nir_metadata_control_flow, (void *) cd);
}
bool
brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *nir,
const struct brw_lower_urb_cb_data *cd)
{
return nir_shader_intrinsics_pass(nir, lower_urb_outputs,
nir_metadata_control_flow, (void *) cd);
}
/* See if comps[0..3] has any non-undef values. */
static bool
slot_defined(nir_scalar *comps)
{
for (unsigned i = 0; i < 4; i++) {
if (comps[i].def && !nir_def_is_undef(comps[i].def))
return true;
}
return false;
}
/* Replace any NULL defs in comps[0..(n-1)] with undef */
static void
fill_undefs(nir_scalar *comps, nir_def *undef, unsigned n)
{
for (unsigned i = 0; i < n; i++) {
if (!comps[i].def)
comps[i] = nir_get_scalar(undef, 0);
}
}
static void
emit_urb_writes(nir_builder *b,
const struct intel_device_info *devinfo,
nir_scalar *outputs,
unsigned num_slots,
nir_def *offset)
{
nir_def *undef = nir_undef(b, 1, 32);
/* Primitive Shading Rate defaults to (1, 1) in half-float */
if (devinfo->has_coarse_pixel_primitive_and_cb && !outputs[0].def)
outputs[0] = nir_get_scalar(nir_imm_int(b, 0x3C003C00), 0);
/* Viewport, Layer, and Point Size default to 0 */
for (unsigned i = 0; i < 4; i++) {
if (!outputs[i].def)
outputs[i] = nir_get_scalar(nir_imm_int(b, 0), 0);
}
/* Emit URB writes */
for (unsigned slot = 0; slot < num_slots; slot++) {
if (!slot_defined(&outputs[4 * slot]))
continue;
const bool vec8 = slot + 1 < num_slots &&
slot_defined(&outputs[4 * (slot + 1)]);
fill_undefs(&outputs[4 * slot], undef, vec8 ? 8 : 4);
nir_def *val = nir_vec_scalars(b, &outputs[4 * slot], vec8 ? 8 : 4);
if (devinfo->ver >= 20) {
nir_def *addr = nir_iadd(b, output_handle(b),
nir_imul_imm(b, offset, 16));
nir_store_urb_lsc_intel(b, val, addr, .base = 16 * slot);
} else {
nir_store_urb_vec4_intel(b, val, output_handle(b), offset,
nir_imm_int(b, vec8 ? 0xff : 0xf),
.base = slot);
}
if (vec8)
slot++;
}
}
bool
brw_nir_lower_deferred_urb_writes(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct intel_vue_map *vue_map,
unsigned extra_urb_slot_offset,
unsigned gs_vertex_stride)
{
nir_scalar *outputs = calloc(vue_map->num_slots, 4 * sizeof(nir_scalar));
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_view_output: {
nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
nir_src *offset = nir_get_io_offset_src(intrin);
const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
const unsigned slot =
vue_map->varying_to_slot[sem.location] +
(view_index ? nir_src_as_uint(*view_index) : 0);
nir_src_as_uint(*offset);
const unsigned c = io_component(intrin, NULL);
const unsigned mask = nir_intrinsic_write_mask(intrin);
assert(slot != -1);
assert(c < 4);
u_foreach_bit(i, mask) {
outputs[4 * slot + c + i] =
nir_scalar_resolved(intrin->src[0].ssa, i);
}
nir_instr_remove(instr);
break;
}
case nir_intrinsic_emit_vertex_with_counter: {
/* The only purpose of primitives sent to non-zero streams
* is to be recorded by transform feedback, so if it is disabled,
* we can discard all geometry bound for those streams.
*/
if (nir_intrinsic_stream_id(intrin) > 0 &&
!nir->info.has_transform_feedback_varyings) {
nir_instr_remove(instr);
break;
}
nir_builder b = nir_builder_at(nir_before_instr(instr));
b.constant_fold_alu = true;
nir_def *offset =
nir_iadd_imm(&b, nir_imul_imm(&b, intrin->src[0].ssa,
gs_vertex_stride),
extra_urb_slot_offset);
emit_urb_writes(&b, devinfo, outputs, vue_map->num_slots, offset);
/* After EmitVertex() all outputs are undefined */
memset(outputs, 0, 4 * vue_map->num_slots * sizeof(nir_scalar));
/* Leave emit_vertex_with_counter for control data writes */
break;
}
default:
break;
}
}
}
if (nir->info.stage != MESA_SHADER_GEOMETRY) {
nir_builder b = nir_builder_at(nir_after_impl(impl));
emit_urb_writes(&b, devinfo, outputs, vue_map->num_slots,
nir_imm_int(&b, 0));
}
free(outputs);
return nir_progress(true, impl, nir_metadata_control_flow);
}
static bool
lower_task_payload_to_urb(nir_builder *b, nir_intrinsic_instr *io, void *data)
{
const struct brw_lower_urb_cb_data *cb_data = data;
const enum mesa_shader_stage stage = b->shader->info.stage;
if (io->intrinsic != nir_intrinsic_load_task_payload &&
io->intrinsic != nir_intrinsic_store_task_payload)
return false;
b->cursor = nir_before_instr(&io->instr);
b->constant_fold_alu = true;
/* Convert byte offset to dword offset */
nir_def *offset = nir_ishr_imm(b, nir_get_io_offset_src(io)->ssa, 2);
if (io->intrinsic == nir_intrinsic_store_task_payload) {
store_urb(b, cb_data, io, offset);
nir_instr_remove(&io->instr);
} else {
const bool input = stage == MESA_SHADER_MESH;
nir_def *handle = input ? input_handle(b, io) : output_handle(b);
nir_def *load = load_urb(b, cb_data, io, handle, offset,
ACCESS_CAN_REORDER |
(input ? ACCESS_NON_WRITEABLE : 0));
nir_def_replace(&io->def, load);
}
return true;
}
static bool
lower_task_payload_to_urb_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo)
{
struct brw_lower_urb_cb_data cb_data = { .devinfo = devinfo };
return nir_shader_intrinsics_pass(nir, lower_task_payload_to_urb,
nir_metadata_control_flow, &cb_data);
}
static bool
remap_tess_levels_legacy(nir_builder *b,
nir_intrinsic_instr *intrin,
void *data)
{
/* Note that this pass does not work with Xe2 LSC URB messages, but
* we never use legacy layouts there anyway.
*/
enum tess_primitive_mode prim = (uintptr_t) data;
if (!(b->shader->info.stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) &&
!(b->shader->info.stage == MESA_SHADER_TESS_EVAL && is_input(intrin)))
return false;
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
if (io_sem.location != VARYING_SLOT_TESS_LEVEL_INNER &&
io_sem.location != VARYING_SLOT_TESS_LEVEL_OUTER)
return false;
b->cursor = nir_before_instr(&intrin->instr);
const nir_variable_mode mode = io_mode(intrin);
const bool inner = io_sem.location == VARYING_SLOT_TESS_LEVEL_INNER;
nir_def *tess_config = nir_load_tess_config_intel(b);
nir_def *is_tri =
prim == TESS_PRIMITIVE_UNSPECIFIED ?
nir_test_mask(b, tess_config, INTEL_TESS_CONFIG_TRIANGLES) :
nir_imm_bool(b, prim == TESS_PRIMITIVE_TRIANGLES);
nir_def *is_isoline =
prim == TESS_PRIMITIVE_UNSPECIFIED ?
nir_test_mask(b, tess_config, INTEL_TESS_CONFIG_ISOLINES) :
nir_imm_bool(b, prim == TESS_PRIMITIVE_ISOLINES);
/* The patch layout is described in the SKL PRMs, Volume 7: 3D-Media-GPGPU,
* Patch URB Entry (Patch Record) Output, Patch Header DW0-7. In the chart
* below TessLevelInner = <ix, iy> and TessLevelOuter = <x, y, z, w>:
*
* [ 7 6 5 4 | 3 2 1 0]
*
* [ x y z w | ix iy __ __] quad legacy
* [ x y z ix | __ __ __ __] tri legacy
* [ y x __ __ | __ __ __ __] isoline legacy
*
* From this, we can see:
* - Outer lives at slot 1
* - Inner lives at slot 0 for quads but slot 1 for triangles
* - Inner does not exist for isolines
* - Isolines need the original value but mask << 2
* - Triangles+Inner need the original value and mask
* - Quads or Triangles+Outer need the value and mask flipped (WYZX)
*/
if (intrin->intrinsic == nir_intrinsic_load_input) {
/* The TES is guaranteed to know the primitive mode and we always
* push the first two input slots.
*/
assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);
assert(prim != TESS_PRIMITIVE_UNSPECIFIED);
nir_def *result;
if (inner && prim == TESS_PRIMITIVE_TRIANGLES) {
result = load_push_input(b, intrin, 4 * sizeof(uint32_t));
} else if (prim == TESS_PRIMITIVE_ISOLINES) {
result = load_push_input(b, intrin, 6 * sizeof(uint32_t));
} else {
const unsigned start =
(inner ? 4 : 8) - nir_intrinsic_component(intrin)
- intrin->def.num_components;
nir_def *tmp = load_push_input(b, intrin, start * sizeof(uint32_t));
unsigned reverse[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < tmp->num_components; ++i)
reverse[i] = tmp->num_components - 1 - i;
result = nir_swizzle(b, tmp, reverse, tmp->num_components);
}
nir_def_replace(&intrin->def, result);
} else {
assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL);
const unsigned wzyx[4] = { 3, 2, 1, 0 };
const unsigned xxxy[4] = { 0, 0, 0, 1 };
const unsigned zwww[4] = { 2, 3, 3, 3 };
nir_def *slot = inner ? nir_b2i32(b, is_tri) : nir_imm_int(b, 1);
if (intrin->intrinsic == nir_intrinsic_store_output) {
const unsigned mask = nir_intrinsic_write_mask(intrin);
const unsigned revmask =
!!(mask & WRITEMASK_X) << 3 | !!(mask & WRITEMASK_Y) << 2 |
!!(mask & WRITEMASK_Z) << 1 | !!(mask & WRITEMASK_W) << 0;
nir_def *padded = nir_pad_vector_imm_int(b, intrin->src[0].ssa, 0, 4);
nir_def *new_val =
inner ? nir_bcsel(b, is_tri, nir_channel(b, padded, 0),
nir_swizzle(b, padded, wzyx, 4))
: nir_bcsel(b, is_isoline, nir_swizzle(b, padded, xxxy, 4),
nir_swizzle(b, padded, wzyx, 4));
nir_def *new_mask =
inner ? nir_bcsel(b, is_tri, nir_imm_int(b, mask & WRITEMASK_X),
nir_imm_int(b, revmask))
: nir_bcsel(b, is_isoline, nir_imm_int(b, mask << 2),
nir_bcsel(b, is_tri, nir_imm_int(b, revmask & WRITEMASK_YZW),
nir_imm_int(b, revmask)));
nir_store_urb_vec4_intel(b, new_val, output_handle(b),
slot, new_mask);
nir_instr_remove(&intrin->instr);
} else {
assert(intrin->intrinsic == nir_intrinsic_load_output);
nir_def *vec =
nir_load_urb_vec4_intel(b, 4, 32, output_handle(b), slot,
.memory_modes = mode);
const unsigned nc = intrin->def.num_components;
nir_def *result =
inner ? nir_bcsel(b, is_tri, nir_trim_vector(b, vec, nc),
nir_swizzle(b, vec, wzyx, nc))
: nir_bcsel(b, is_isoline, nir_swizzle(b, vec, zwww, nc),
nir_swizzle(b, vec, wzyx, nc));
nir_def_replace(&intrin->def, result);
}
}
return true;
}
struct remap_tesslevel_cb_data {
const struct intel_device_info *devinfo;
enum tess_primitive_mode prim_mode;
};
static bool
remap_tess_levels_reversed(nir_builder *b,
nir_intrinsic_instr *io,
void *data)
{
const struct remap_tesslevel_cb_data *cb_data = data;
const struct intel_device_info *devinfo = cb_data->devinfo;
/* The Gfx12+ reversed patch header layouts are:
*
* [ 7 6 5 4 | 3 2 1 0]
* [__ __ iy ix | w z y x] quad reversed
* [__ __ __ __ | ix z y x] tri reversed
* [__ __ __ ix | __ z y x] tri reversed inside separate
* [__ __ __ __ | __ __ x y] isoline reversed
*
* By using the separate layout for triangles, no remapping is required
* except that isolines is backwards for some reason. We flip it here.
*/
if (!nir_intrinsic_has_io_semantics(io) ||
nir_intrinsic_io_semantics(io).location !=
VARYING_SLOT_TESS_LEVEL_OUTER)
return false;
b->cursor = nir_after_instr(&io->instr);
nir_def *is_isoline;
if (cb_data->prim_mode == TESS_PRIMITIVE_UNSPECIFIED) {
nir_def *tess_config = nir_load_tess_config_intel(b);
is_isoline = nir_test_mask(b, tess_config, INTEL_TESS_CONFIG_ISOLINES);
} else {
is_isoline = nir_imm_true(b);
}
const unsigned yx[2] = { 1, 0 };
if (io->intrinsic == nir_intrinsic_store_output) {
/* Flip isolines source: xy__ -> yx__ */
const unsigned mask = nir_intrinsic_write_mask(io);
const unsigned revmask = (mask & ~WRITEMASK_XY) |
(mask & WRITEMASK_X) << 1 | (mask & WRITEMASK_Y) >> 1;
nir_def *new_val =
nir_bcsel(b, is_isoline,
nir_pad_vector(b, nir_swizzle(b, io->src[0].ssa, yx, 2),
nir_src_num_components(io->src[0])),
io->src[0].ssa);
if (devinfo->ver >= 20) {
nir_store_urb_lsc_intel(b, new_val, output_handle(b),
.base = mask == WRITEMASK_X ? 4 : 0);
} else {
nir_store_urb_vec4_intel(b, new_val, output_handle(b),
nir_imm_int(b, 0),
nir_bcsel(b, is_isoline,
nir_imm_int(b, revmask),
nir_imm_int(b, mask)));
}
nir_instr_remove(&io->instr);
} else {
/* Just leave these as load intrinsics and let the generic remapper
* take care of that part.
*/
nir_def *new_val =
nir_bcsel(b, is_isoline, nir_swizzle(b, &io->def, yx, 2), &io->def);
nir_def_rewrite_uses_after(&io->def, new_val);
}
return true;
}
static bool
remap_tess_levels(nir_shader *nir,
const struct intel_device_info *devinfo,
enum tess_primitive_mode prim)
{
/* Pre-Gfx12 use legacy patch header layouts */
if (devinfo->ver < 12) {
return nir_shader_intrinsics_pass(nir, remap_tess_levels_legacy,
nir_metadata_control_flow,
(void *)(uintptr_t) prim);
}
/* With the reversed layouts, remapping is only required for
* isolines (or unspecified, which might be isolines).
*/
if (prim != TESS_PRIMITIVE_ISOLINES && prim != TESS_PRIMITIVE_UNSPECIFIED)
return false;
struct remap_tesslevel_cb_data cb = {
.devinfo = devinfo, .prim_mode = prim
};
return nir_shader_intrinsics_pass(nir, remap_tess_levels_reversed,
nir_metadata_control_flow, &cb);
}
static bool
brw_nir_should_vectorize_urb(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data)
{
if (bit_size != 32 || num_components > 8)
return false;
if (num_components > 4 && num_components < 8 &&
low->intrinsic == nir_intrinsic_store_urb_lsc_intel)
return false;
if (low->intrinsic == nir_intrinsic_store_urb_vec4_intel)
return nir_src_is_const(low->src[3]) && nir_src_is_const(high->src[3]);
return low->intrinsic == nir_intrinsic_load_urb_lsc_intel ||
low->intrinsic == nir_intrinsic_store_urb_lsc_intel ||
low->intrinsic == nir_intrinsic_load_urb_vec4_intel;
}
static unsigned
vec4_urb_round_up_components(unsigned n)
{
return n < 4 ? 4 : util_next_power_of_two(n);
}
static unsigned
lsc_urb_round_up_components(unsigned n)
{
return n < 4 ? n : util_next_power_of_two(n);
}
void
brw_nir_opt_vectorize_urb(brw_pass_tracker *pt)
{
const struct intel_device_info *devinfo = pt->compiler->devinfo;
BRW_NIR_PASS(nir_opt_cse);
nir_load_store_vectorize_options options = {
.modes = nir_var_shader_in | nir_var_shader_out,
.callback = brw_nir_should_vectorize_urb,
.round_up_store_components = true,
.round_up_components =
devinfo->ver >= 20 ? lsc_urb_round_up_components :
vec4_urb_round_up_components,
};
BRW_NIR_PASS(nir_opt_load_store_vectorize, &options);
}
void
brw_nir_lower_vs_inputs(nir_shader *nir)
{
/* Start with the location of the variable's base. */
nir_foreach_shader_in_variable(var, nir)
var->data.driver_location = var->data.location;
/* Now use nir_lower_io to walk dereference chains. Attribute arrays are
* loaded as one vec4 or dvec4 per element (or matrix column), depending on
* whether it is a double-precision type or not.
*/
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4,
nir_lower_io_lower_64bit_to_32_new);
/* Fold constant offset srcs for IO. */
NIR_PASS(_, nir, nir_opt_constant_folding);
/* Update shader_info::dual_slot_inputs */
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
/* The last step is to remap VERT_ATTRIB_* to actual registers */
/* Whether or not we have any system generated values. gl_DrawID is not
* included here as it lives in its own vec4.
*/
const bool has_sgvs =
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
const unsigned num_inputs = util_bitcount64(nir->info.inputs_read) +
util_bitcount64(nir->info.inputs_read & nir->info.dual_slot_inputs);
/* In the following loop, the intrinsic base value is the offset in
* register slots (2 slots can make up in single input for double/64bit
* values). The io_semantics location field is the offset in terms of
* attributes.
*/
nir_foreach_function_impl(impl, nir) {
nir_builder b = nir_builder_create(impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_first_vertex:
case nir_intrinsic_load_base_instance:
case nir_intrinsic_load_vertex_id_zero_base:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_is_indexed_draw:
case nir_intrinsic_load_draw_id: {
b.cursor = nir_after_instr(&intrin->instr);
/* gl_VertexID and friends are stored by the VF as the last
* vertex element. We convert them to load_input intrinsics at
* the right location.
*/
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(nir, nir_intrinsic_load_input);
load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
unsigned input_offset = 0;
unsigned location = BRW_SVGS_VE_INDEX;
switch (intrin->intrinsic) {
case nir_intrinsic_load_first_vertex:
nir_intrinsic_set_component(load, 0);
break;
case nir_intrinsic_load_base_instance:
nir_intrinsic_set_component(load, 1);
break;
case nir_intrinsic_load_vertex_id_zero_base:
nir_intrinsic_set_component(load, 2);
break;
case nir_intrinsic_load_instance_id:
nir_intrinsic_set_component(load, 3);
break;
case nir_intrinsic_load_draw_id:
case nir_intrinsic_load_is_indexed_draw:
/* gl_DrawID and IsIndexedDraw are stored right after
* gl_VertexID and friends if any of them exist.
*/
input_offset += has_sgvs ? 1 : 0;
location = BRW_DRAWID_VE_INDEX;
if (intrin->intrinsic == nir_intrinsic_load_draw_id)
nir_intrinsic_set_component(load, 0);
else
nir_intrinsic_set_component(load, 1);
break;
default:
UNREACHABLE("Invalid system value intrinsic");
}
/* Position the value behind the app's inputs, for base we
* account for the double inputs, for the io_semantics
* location, it's just the input count.
*/
nir_intrinsic_set_base(load, num_inputs + input_offset);
struct nir_io_semantics io = {
.location = VERT_ATTRIB_GENERIC0 + location,
.num_slots = 1,
};
nir_intrinsic_set_io_semantics(load, io);
load->num_components = 1;
nir_def_init(&load->instr, &load->def, 1, 32);
nir_builder_instr_insert(&b, &load->instr);
nir_def_replace(&intrin->def, &load->def);
break;
}
case nir_intrinsic_load_input: {
/* Attributes come in a contiguous block, ordered by their
* gl_vert_attrib value. That means we can compute the slot
* number for an attribute by masking out the enabled attributes
* before it and counting the bits.
*/
const struct nir_io_semantics io =
nir_intrinsic_io_semantics(intrin);
const int attr = nir_intrinsic_base(intrin);
const int slot = util_bitcount64(nir->info.inputs_read &
BITFIELD64_MASK(attr)) +
util_bitcount64(nir->info.dual_slot_inputs &
BITFIELD64_MASK(attr)) +
io.high_dvec2;
nir_intrinsic_set_base(intrin, slot);
break;
}
default:
break; /* Nothing to do */
}
}
}
}
}
void
brw_nir_lower_gs_inputs(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct intel_vue_map *vue_map,
unsigned *out_urb_read_length)
{
/* Inputs are stored in vec4 slots, so use type_size_vec4(). */
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
/* Fold constant offset srcs for IO. */
NIR_PASS(_, nir, nir_opt_constant_folding);
unsigned urb_read_length = 0;
if (nir->info.gs.invocations == 1) {
/* URB read length is in 256-bit units, which is two vec4s. */
urb_read_length = DIV_ROUND_UP(vue_map->num_slots, 2);
/* Because we're operating in scalar mode, the two vec4s take
* up 8 registers. Additionally, the GS reads URB Read Length
* for each vertex being processed, each unit of read length
* takes up 8 * VerticesIn registers.
*/
const unsigned regs_per_read = 8 * nir->info.gs.vertices_in;
/* Limit to 24 registers worth of pushed inputs */
const unsigned max_push_regs = 24;
if (urb_read_length * regs_per_read > max_push_regs)
urb_read_length = max_push_regs / regs_per_read;
}
*out_urb_read_length = urb_read_length;
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.vec4_access = true,
/* pushed bytes per vertex */
.max_push_bytes = urb_read_length * 8 * sizeof(uint32_t),
.varying_to_slot = vue_map->varying_to_slot,
};
NIR_PASS(_, nir, brw_nir_lower_inputs_to_urb_intrinsics, &cb_data);
}
void
brw_nir_lower_tes_inputs(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct intel_vue_map *vue_map)
{
NIR_PASS(_, nir, nir_lower_tess_level_array_vars_to_vec);
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
/* Run nir_opt_constant_folding to allow update base/io_semantic::location
* for the remapping pass to look into the VUE mapping.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, remap_tess_levels, devinfo,
nir->info.tess._primitive_mode);
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.vec4_access = true,
.max_push_bytes = 32 * 16, /* 32 vec4s */
.varying_to_slot = vue_map->varying_to_slot,
.per_vertex_stride = vue_map->num_per_vertex_slots * 16,
.dynamic_tes = vue_map->layout == INTEL_VUE_LAYOUT_SEPARATE,
.tes_builtins_slot_offset = vue_map->builtins_slot_offset,
.tes_per_patch_slots = vue_map->num_per_patch_slots,
};
NIR_PASS(_, nir, brw_nir_lower_inputs_to_urb_intrinsics, &cb_data);
}
static bool
lower_barycentric_per_sample(nir_builder *b,
nir_intrinsic_instr *intrin,
UNUSED void *cb_data)
{
if (intrin->intrinsic != nir_intrinsic_load_barycentric_pixel &&
intrin->intrinsic != nir_intrinsic_load_barycentric_centroid)
return false;
b->cursor = nir_before_instr(&intrin->instr);
nir_def *centroid =
nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample,
nir_intrinsic_interp_mode(intrin));
nir_def_replace(&intrin->def, centroid);
return true;
}
/**
* Convert interpolateAtOffset() offsets from [-0.5, +0.5] floating point
* offsets to integer [-8, +7] offsets (in units of 1/16th of a pixel).
*
* We clamp to +7/16 on the upper end of the range, since +0.5 isn't
* representable in a S0.4 value; a naive conversion would give us -8/16,
* which is the opposite of what was intended.
*
* This is allowed by GL_ARB_gpu_shader5's quantization rules:
*
* "Not all values of <offset> may be supported; x and y offsets may
* be rounded to fixed-point values with the number of fraction bits
* given by the implementation-dependent constant
* FRAGMENT_INTERPOLATION_OFFSET_BITS."
*/
static bool
lower_barycentric_at_offset(nir_builder *b, nir_intrinsic_instr *intrin,
void *data)
{
if (intrin->intrinsic != nir_intrinsic_load_barycentric_at_offset)
return false;
b->cursor = nir_before_instr(&intrin->instr);
assert(intrin->src[0].ssa);
nir_def *offset =
nir_imin(b, nir_imm_int(b, 7),
nir_f2i32(b, nir_fmul_imm(b, intrin->src[0].ssa, 16)));
nir_src_rewrite(&intrin->src[0], offset);
return true;
}
static bool
lower_indirect_primitive_id(nir_builder *b,
nir_intrinsic_instr *intrin,
void *data)
{
if (intrin->intrinsic != nir_intrinsic_load_per_primitive_input)
return false;
if (nir_intrinsic_io_semantics(intrin).location != VARYING_SLOT_PRIMITIVE_ID)
return false;
nir_def *indirect_primitive_id = data;
nir_def_replace(&intrin->def, indirect_primitive_id);
return true;
}
bool
brw_needs_vertex_attributes_bypass(const nir_shader *shader)
{
/* Even if there are no actual per-vertex inputs, if the fragment
* shader uses BaryCoord*, we need to set everything accordingly
* so the barycentrics don't get reordered.
*/
if (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_LINEAR_COORD) ||
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_PERSP_COORD))
return true;
nir_foreach_shader_in_variable(var, shader) {
if (var->data.per_vertex)
return true;
}
return false;
}
/* Build the per-vertex offset into the attribute section of the per-vertex
* thread payload. There is always one GRF of padding in front.
*
* The computation is fairly complicated due to the layout of the payload. You
* can find a description of the layout in brw_compile_fs.cpp
* brw_assign_urb_setup().
*
* Gfx < 20 packs 2 slots per GRF (hence the %/ 2 in the formula)
* Gfx >= 20 pack 5 slots per GRF (hence the %/ 5 in the formula)
*
* Then an additional offset needs to added to handle how multiple polygon
* data is interleaved.
*/
nir_def *
brw_nir_vertex_attribute_offset(nir_builder *b,
nir_def *attr_idx,
const struct intel_device_info *devinfo)
{
nir_def *max_poly = nir_load_max_polygon_intel(b);
return devinfo->ver >= 20 ?
nir_iadd(b,
nir_imul(b, nir_udiv_imm(b, attr_idx, 5), nir_imul_imm(b, max_poly, 64)),
nir_imul_imm(b, nir_umod_imm(b, attr_idx, 5), 12)) :
nir_iadd_imm(
b,
nir_iadd(
b,
nir_imul(b, nir_udiv_imm(b, attr_idx, 2), nir_imul_imm(b, max_poly, 32)),
nir_imul_imm(b, nir_umod_imm(b, attr_idx, 2), 16)),
12);
}
static nir_block *
fragment_top_block_or_after_wa_18019110168(nir_function_impl *impl)
{
nir_if *first_if =
nir_block_get_following_if(nir_start_block(impl));
nir_block *post_wa_18019110168_block = NULL;
if (first_if) {
nir_block *last_if_block = nir_if_last_then_block(first_if);
nir_foreach_block_in_cf_node(block, &first_if->cf_node) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic == nir_intrinsic_store_per_primitive_payload_intel) {
post_wa_18019110168_block = last_if_block->successors[0];
break;
}
}
if (post_wa_18019110168_block)
break;
}
}
return post_wa_18019110168_block ?
post_wa_18019110168_block : nir_start_block(impl);
}
void
brw_nir_lower_fs_inputs(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct brw_fs_prog_key *key)
{
/* Always pull the PrimitiveID from the per-primitive block if mesh can be
* involved.
*/
if (key->mesh_input != INTEL_NEVER) {
nir_foreach_shader_in_variable(var, nir) {
if (var->data.location == VARYING_SLOT_PRIMITIVE_ID) {
var->data.per_primitive = true;
nir->info.per_primitive_inputs |= VARYING_BIT_PRIMITIVE_ID;
}
}
}
nir_def *indirect_primitive_id = NULL;
if (key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH &&
(nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID)) {
nir_builder _b = nir_builder_at(
nir_before_block(
fragment_top_block_or_after_wa_18019110168(
nir_shader_get_entrypoint(nir)))), *b = &_b;
nir_def *index = nir_ubitfield_extract_imm(
b,
nir_load_fs_config_intel(b),
INTEL_FS_CONFIG_PRIMITIVE_ID_INDEX_OFFSET,
INTEL_FS_CONFIG_PRIMITIVE_ID_INDEX_SIZE);
nir_def *per_vertex_offset =
nir_iadd_imm(
b,
brw_nir_vertex_attribute_offset(
b, nir_imul_imm(b, index, 4), devinfo),
devinfo->grf_size);
/* When the attribute index is INTEL_FS_CONFIG_PRIMITIVE_ID_MESH_INDEX,
* it means the value is coming from the per-primitive block. We always
* lay out PrimitiveID at offset 0 in the per-primitive block.
*/
nir_def *attribute_offset = nir_bcsel(
b,
nir_ieq_imm(b, index, INTEL_FS_CONFIG_PRIMITIVE_ID_INDEX_MESH),
nir_imm_int(b, 0), per_vertex_offset);
indirect_primitive_id =
nir_load_attribute_payload_intel(b, 1, 32, attribute_offset);
}
nir_foreach_shader_in_variable(var, nir) {
var->data.driver_location = var->data.location;
if (var->data.interpolation == INTERP_MODE_NONE)
var->data.interpolation = INTERP_MODE_SMOOTH;
}
NIR_PASS(_, nir, nir_lower_io,
nir_var_shader_in, type_size_vec4,
nir_lower_io_lower_64bit_to_32 |
nir_lower_io_use_interpolated_input_intrinsics);
if (devinfo->ver >= 11)
NIR_PASS(_, nir, nir_lower_interpolation, ~0);
if (brw_needs_vertex_attributes_bypass(nir))
brw_nir_lower_fs_barycentrics(nir);
if (key->multisample_fbo == INTEL_NEVER) {
nir_lower_single_sampled_options lss_opts = {
.lower_sample_mask_in = key->coarse_pixel == INTEL_NEVER,
};
NIR_PASS(_, nir, nir_lower_single_sampled, &lss_opts);
} else if (key->persample_interp == INTEL_ALWAYS) {
NIR_PASS(_, nir, nir_shader_intrinsics_pass,
lower_barycentric_per_sample,
nir_metadata_control_flow,
NULL);
}
if (devinfo->ver < 20) {
NIR_PASS(_, nir, nir_shader_intrinsics_pass,
lower_barycentric_at_offset,
nir_metadata_control_flow,
NULL);
}
if (indirect_primitive_id != NULL) {
NIR_PASS(_, nir, nir_shader_intrinsics_pass,
lower_indirect_primitive_id,
nir_metadata_control_flow,
indirect_primitive_id);
}
/* Fold constant offset srcs for IO. */
NIR_PASS(_, nir, nir_opt_constant_folding);
}
void
brw_nir_lower_vue_outputs(nir_shader *nir)
{
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
}
void
brw_nir_lower_tcs_inputs(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct intel_vue_map *input_vue_map)
{
/* Inputs are stored in vec4 slots, so use type_size_vec4(). */
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
/* Fold constant offset srcs for IO. */
NIR_PASS(_, nir, nir_opt_constant_folding);
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.vec4_access = true,
.varying_to_slot = input_vue_map->varying_to_slot,
};
NIR_PASS(_, nir, brw_nir_lower_inputs_to_urb_intrinsics, &cb_data);
}
void
brw_nir_lower_tcs_outputs(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct intel_vue_map *vue_map,
enum tess_primitive_mode tes_primitive_mode)
{
NIR_PASS(_, nir, nir_lower_tess_level_array_vars_to_vec);
NIR_PASS(_, nir, nir_opt_combine_stores, nir_var_shader_out);
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
/* Run nir_opt_constant_folding to allow update base/io_semantic::location
* for the remapping pass to look into the VUE mapping.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, remap_tess_levels, devinfo, tes_primitive_mode);
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.vec4_access = true,
.varying_to_slot = vue_map->varying_to_slot,
.per_vertex_stride = vue_map->num_per_vertex_slots * 16,
};
NIR_PASS(_, nir, brw_nir_lower_outputs_to_urb_intrinsics, &cb_data);
}
void
brw_nir_lower_fs_outputs(nir_shader *nir)
{
nir_foreach_shader_out_variable(var, nir) {
var->data.driver_location =
SET_FIELD(var->data.index, BRW_NIR_FRAG_OUTPUT_INDEX) |
SET_FIELD(var->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION);
}
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4, 0);
nir->info.disable_output_offset_src_constant_folding = true;
}
static bool
lower_frag_coord_z_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
if (intrin->intrinsic != nir_intrinsic_load_frag_coord_z)
return false;
b->cursor = nir_after_instr(&intrin->instr);
b->fp_math_ctrl = nir_fp_no_fast_math;
const struct intel_device_info *devinfo = (const struct intel_device_info *)data;
const unsigned precision = devinfo->has_64bit_float ? 64 : 32;
nir_def *start = nir_f2fN(b, nir_load_fs_start_intel(b), precision);
nir_def *z_c = nir_f2fN(b, nir_load_fs_z_c_intel(b), precision);
nir_def *z_c0 = nir_f2fN(b, nir_load_fs_z_c0_intel(b), precision);
nir_def *coord = nir_fadd_imm(
b, nir_i2fN(b, nir_load_pixel_coord(b), precision), 0.5f);
nir_def *offset = nir_fsub(b, coord, start);
nir_def *dot = nir_fdot(b, offset, z_c);
nir_def *coarse_z = nir_fadd(b, dot, z_c0);
nir_def_replace(&intrin->def, nir_f2f32(b, coarse_z));
return true;
}
bool
brw_nir_lower_frag_coord_z(nir_shader *nir,
const struct intel_device_info *devinfo)
{
return nir_shader_intrinsics_pass(nir, lower_frag_coord_z_instr,
nir_metadata_control_flow,
(void *)devinfo);
}
static bool
tag_speculative_access(nir_builder *b,
nir_intrinsic_instr *intrin,
void *unused)
{
if (intrin->intrinsic == nir_intrinsic_load_ubo &&
brw_nir_ubo_surface_index_is_pushable(intrin->src[0])) {
nir_intrinsic_set_access(intrin, ACCESS_CAN_SPECULATE |
nir_intrinsic_access(intrin));
return true;
}
return false;
}
static bool
brw_nir_tag_speculative_access(nir_shader *nir)
{
return nir_shader_intrinsics_pass(nir, tag_speculative_access,
nir_metadata_all, NULL);
}
#define OPT BRW_NIR_PASS
#define LOOP_OPT BRW_NIR_LOOP_PASS
#define LOOP_OPT_NOT_IDEMPOTENT BRW_NIR_LOOP_PASS_NOT_IDEMPOTENT
void
brw_nir_optimize(brw_pass_tracker *pt)
{
nir_shader *nir = pt->nir;
pass_tracker_new_loop(pt);
do {
pass_tracker_new_iteration(pt);
/* This pass is causing problems with types used by OpenCL :
* https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
*
* Running with it disabled made no difference in the resulting assembly
* code.
*/
if (nir->info.stage != MESA_SHADER_KERNEL)
LOOP_OPT(nir_split_array_vars, nir_var_function_temp);
LOOP_OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
LOOP_OPT(nir_lower_vars_to_ssa);
if (!nir->info.var_copies_lowered) {
/* Only run this pass if nir_lower_var_copies was not called
* yet. That would lower away any copy_deref instructions and we
* don't want to introduce any more.
*/
LOOP_OPT(nir_opt_find_array_copies);
}
LOOP_OPT(nir_opt_copy_prop_vars);
LOOP_OPT(nir_opt_dead_write_vars);
LOOP_OPT(nir_lower_alu_to_scalar, NULL, NULL);
LOOP_OPT(nir_opt_copy_prop);
LOOP_OPT(nir_lower_phis_to_scalar, NULL, NULL);
LOOP_OPT(nir_opt_copy_prop);
LOOP_OPT(nir_opt_dce);
LOOP_OPT(nir_opt_cse);
LOOP_OPT(nir_opt_combine_stores, nir_var_all);
/* For indirect loads of uniforms (push constants), we assume that array
* indices will nearly always be in bounds and the cost of the load is
* low. Therefore there shouldn't be a performance benefit to avoid it.
*/
nir_opt_peephole_select_options peephole_select_options = {
.limit = 8,
.indirect_load_ok = true,
.expensive_alu_ok = true,
.discard_ok = true,
};
LOOP_OPT(nir_opt_peephole_select, &peephole_select_options);
LOOP_OPT(nir_opt_intrinsics);
LOOP_OPT_NOT_IDEMPOTENT(nir_opt_algebraic);
LOOP_OPT(nir_lower_constant_convert_alu_types);
LOOP_OPT(nir_opt_constant_folding);
LOOP_OPT(nir_opt_dead_cf);
if (LOOP_OPT(nir_opt_loop)) {
/* If nir_opt_loop makes progress, then we need to clean
* things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
* to make progress.
*/
LOOP_OPT(nir_opt_copy_prop);
LOOP_OPT(nir_opt_dce);
}
LOOP_OPT_NOT_IDEMPOTENT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
if (nir->options->max_unroll_iterations != 0) {
LOOP_OPT_NOT_IDEMPOTENT(nir_opt_loop_unroll);
}
LOOP_OPT(nir_opt_remove_phis);
LOOP_OPT(nir_opt_gcm, false);
LOOP_OPT(nir_opt_undef);
LOOP_OPT(nir_lower_pack);
} while (pt->progress);
OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
}
static unsigned
lower_bit_size_callback(const nir_instr *instr, void *data)
{
const struct intel_device_info *devinfo = data;
switch (instr->type) {
case nir_instr_type_alu: {
nir_alu_instr *alu = nir_instr_as_alu(instr);
switch (alu->op) {
case nir_op_bit_count:
case nir_op_ufind_msb:
case nir_op_ifind_msb:
case nir_op_find_lsb:
/* These are handled specially because the destination is always
* 32-bit and so the bit size of the instruction is given by the
* source.
*/
return alu->src[0].src.ssa->bit_size >= 32 ? 0 : 32;
default:
break;
}
if (alu->def.bit_size >= 32)
return 0;
/* Note: nir_op_iabs and nir_op_ineg are not lowered here because the
* 8-bit ABS or NEG instruction should eventually get copy propagated
* into the MOV that does the type conversion. This results in far
* fewer MOV instructions.
*/
switch (alu->op) {
case nir_op_bitfield_reverse:
return alu->def.bit_size != 32 ? 32 : 0;
case nir_op_idiv:
case nir_op_imod:
case nir_op_irem:
case nir_op_udiv:
case nir_op_umod:
/* Gfx12.5+ lacks integer division instructions. As nir_lower_idiv is
* far more efficient for int8/int16 divisions, we do not lower here.
*
* Older platforms have idiv instructions only for int32, so lower.
*/
return devinfo->verx10 >= 125 ? 0 : 32;
case nir_op_fceil:
case nir_op_ffloor:
case nir_op_ffract:
case nir_op_fround_even:
case nir_op_ftrunc:
return 32;
case nir_op_frcp:
case nir_op_frsq:
case nir_op_fsqrt:
case nir_op_fpow:
case nir_op_fexp2:
case nir_op_flog2:
case nir_op_fsin:
case nir_op_fcos:
return 0;
case nir_op_isign:
UNREACHABLE("Should have been lowered by nir_opt_algebraic.");
default:
if (nir_op_infos[alu->op].num_inputs >= 2 &&
alu->def.bit_size == 8)
return 16;
if (nir_alu_instr_is_comparison(alu) &&
alu->src[0].src.ssa->bit_size == 8)
return 16;
return 0;
}
break;
}
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_read_invocation:
case nir_intrinsic_read_first_invocation:
case nir_intrinsic_vote_feq:
case nir_intrinsic_vote_ieq:
case nir_intrinsic_shuffle:
case nir_intrinsic_shuffle_xor:
case nir_intrinsic_shuffle_up:
case nir_intrinsic_shuffle_down:
case nir_intrinsic_shuffle_up_intel:
case nir_intrinsic_shuffle_down_intel:
case nir_intrinsic_quad_broadcast:
case nir_intrinsic_quad_swap_horizontal:
case nir_intrinsic_quad_swap_vertical:
case nir_intrinsic_quad_swap_diagonal:
if (intrin->src[0].ssa->bit_size == 8)
return 16;
return 0;
case nir_intrinsic_reduce:
case nir_intrinsic_inclusive_scan:
case nir_intrinsic_exclusive_scan:
/* There are a couple of register region issues that make things
* complicated for 8-bit types:
*
* 1. Only raw moves are allowed to write to a packed 8-bit
* destination.
* 2. If we use a strided destination, the efficient way to do
* scan operations ends up using strides that are too big to
* encode in an instruction.
*
* To get around these issues, we just do all 8-bit scan operations
* in 16 bits. It's actually fewer instructions than what we'd have
* to do if we were trying to do it in native 8-bit types and the
* results are the same once we truncate to 8 bits at the end.
*/
if (intrin->def.bit_size == 8)
return 16;
return 0;
default:
return 0;
}
break;
}
case nir_instr_type_phi: {
nir_phi_instr *phi = nir_instr_as_phi(instr);
if (phi->def.bit_size == 8)
return 16;
return 0;
}
default:
return 0;
}
}
/* On gfx12.5+, if the offsets are not both constant and in the {-8,7} range,
* we will have nir_lower_tex() lower the source offset by returning true from
* this filter function.
*/
static bool
lower_xehp_tg4_offset_filter(const nir_instr *instr, UNUSED const void *data)
{
if (instr->type != nir_instr_type_tex)
return false;
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (tex->op != nir_texop_tg4)
return false;
int offset_index = nir_tex_instr_src_index(tex, nir_tex_src_offset);
if (offset_index < 0)
return false;
/* When we have LOD & offset, we can pack both (see
* intel_nir_lower_texture.c pack_lod_or_bias_and_offset)
*/
bool has_lod =
nir_tex_instr_src_index(tex, nir_tex_src_lod) != -1 ||
nir_tex_instr_src_index(tex, nir_tex_src_bias) != -1;
if (has_lod)
return false;
if (!nir_src_is_const(tex->src[offset_index].src))
return true;
int64_t offset_x = nir_src_comp_as_int(tex->src[offset_index].src, 0);
int64_t offset_y = nir_src_comp_as_int(tex->src[offset_index].src, 1);
return offset_x < -8 || offset_x > 7 || offset_y < -8 || offset_y > 7;
}
/* Does some simple lowering and runs the standard suite of optimizations
*
* This is intended to be called more-or-less directly after you get the
* shader out of GLSL or some other source. While it is geared towards i965,
* it is not at all generator-specific.
*/
void
brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
const struct brw_nir_compiler_opts *opts)
{
const struct intel_device_info *devinfo = compiler->devinfo;
/* TODO: This is part of the "pre-processing" before the shader is fed to
* brw_compile_* functions, so there's no debug archiver available yet.
* In the future runtime/driver will create one for us to use here.
*/
brw_pass_tracker pt_ = {
.nir = nir,
.compiler = compiler,
}, *pt = &pt_;
nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");
OPT(nir_lower_frexp);
OPT(nir_lower_alu_to_scalar, NULL, NULL);
if (nir->info.stage == MESA_SHADER_GEOMETRY)
OPT(nir_lower_gs_intrinsics, 0);
/* See also brw_nir_workarounds.py */
if (compiler->precise_trig &&
!(devinfo->ver >= 10 || devinfo->platform == INTEL_PLATFORM_KBL))
OPT(brw_nir_apply_trig_workarounds);
/* This workaround existing for performance reasons. Since it requires not
* setting RENDER_SURFACE_STATE::SurfaceArray when the array length is 1,
* we're loosing the HW robustness feature in that case.
*
* So when robust image access is enabled, just avoid the workaround.
*/
if (intel_needs_workaround(devinfo, 1806565034) && !opts->robust_image_access)
OPT(intel_nir_clamp_image_1d_2d_array_sizes);
OPT(nir_normalize_cubemap_coords);
OPT(nir_lower_global_vars_to_local);
OPT(nir_split_var_copies);
OPT(nir_split_struct_vars, nir_var_function_temp);
if (OPT(nir_opt_memcpy))
OPT(nir_split_var_copies);
brw_nir_optimize(pt);
if (nir->info.ray_queries) {
OPT(nir_opt_ray_queries);
OPT(nir_opt_ray_query_ranges);
}
OPT(nir_opt_deref);
unsigned lower_flrp =
(nir->options->lower_flrp16 ? 16 : 0) |
(nir->options->lower_flrp32 ? 32 : 0) |
(nir->options->lower_flrp64 ? 64 : 0);
OPT(nir_lower_flrp, lower_flrp, false /* always_precise */);
struct nir_opt_16bit_tex_image_options options = {
.rounding_mode = nir_rounding_mode_undef,
.opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
};
OPT(nir_opt_16bit_tex_image, &options);
OPT(nir_lower_doubles, opts->softfp64, nir->options->lower_doubles_options);
if (OPT(nir_lower_int64_float_conversions)) {
OPT(nir_opt_algebraic);
OPT(nir_lower_doubles, opts->softfp64,
nir->options->lower_doubles_options);
}
OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)devinfo);
/* Lower a bunch of stuff */
OPT(nir_lower_var_copies);
/* This needs to be run after the first optimization pass but before we
* lower indirect derefs away
*/
OPT(nir_opt_large_constants, NULL, 32);
OPT(nir_lower_load_const_to_scalar);
OPT(nir_lower_system_values);
nir_lower_compute_system_values_options lower_csv_options = {
.has_base_workgroup_id = nir->info.stage == MESA_SHADER_COMPUTE,
};
OPT(nir_lower_compute_system_values, &lower_csv_options);
const nir_lower_subgroups_options subgroups_options = {
.subgroup_size = brw_nir_api_subgroup_size(nir, 0),
.ballot_bit_size = 32,
.ballot_components = 1,
.lower_to_scalar = true,
.lower_relative_shuffle = true,
.lower_quad_broadcast_dynamic = true,
.lower_elect = true,
.lower_inverse_ballot = true,
.lower_rotate_to_shuffle = true,
};
OPT(nir_lower_subgroups, &subgroups_options);
nir_variable_mode indirect_mask = brw_nir_no_indirect_mask(nir->info.stage);
OPT(nir_lower_indirect_derefs_to_if_else_trees, indirect_mask, UINT32_MAX);
/* Even in cases where we can handle indirect temporaries via scratch, we
* it can still be expensive. Lower indirects on small arrays to
* conditional load/stores.
*
* The threshold of 16 was chosen semi-arbitrarily. The idea is that an
* indirect on an array of 16 elements is about 30 instructions at which
* point, you may be better off doing a send. With a SIMD8 program, 16
* floats is 1/8 of the entire register file. Any array larger than that
* is likely to cause pressure issues. Also, this value is sufficiently
* high that the benchmarks known to suffer from large temporary array
* issues are helped but nothing else in shader-db is hurt except for maybe
* that one kerbal space program shader.
*/
if (!(indirect_mask & nir_var_function_temp))
OPT(nir_lower_indirect_derefs_to_if_else_trees, nir_var_function_temp, 16);
/* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
* SSBOs, our back-end is capable of loading an entire vec4 at a time and
* we would like to take advantage of that whenever possible regardless of
* whether or not the app gives us full loads. This should allow the
* optimizer to combine UBO and SSBO load operations and save us some send
* messages.
*/
OPT(nir_lower_array_deref_of_vec,
nir_var_mem_ubo | nir_var_mem_ssbo, NULL,
nir_lower_direct_array_deref_of_vec_load);
/* Clamp load_per_vertex_input of the TCS stage so that we do not generate
* loads reading out of bounds. We can do this here because we called
* nir_lower_system_values above.
*/
if (nir->info.stage == MESA_SHADER_TESS_CTRL &&
intel_use_tcs_multi_patch(devinfo))
OPT(intel_nir_clamp_per_vertex_loads);
/* Get rid of split copies */
brw_nir_optimize(pt);
}
static bool
brw_nir_zero_inputs_instr(struct nir_builder *b, nir_intrinsic_instr *intrin,
void *data)
{
if (intrin->intrinsic != nir_intrinsic_load_deref)
return false;
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
if (!nir_deref_mode_is(deref, nir_var_shader_in))
return false;
if (deref->deref_type != nir_deref_type_var)
return false;
nir_variable *var = deref->var;
uint64_t zero_inputs = *(uint64_t *)data;
if (!(BITFIELD64_BIT(var->data.location) & zero_inputs))
return false;
b->cursor = nir_before_instr(&intrin->instr);
nir_def *zero = nir_imm_zero(b, 1, 32);
nir_def_replace(&intrin->def, zero);
return true;
}
static bool
brw_nir_zero_inputs(nir_shader *shader, uint64_t *zero_inputs)
{
return nir_shader_intrinsics_pass(shader, brw_nir_zero_inputs_instr,
nir_metadata_control_flow,
zero_inputs);
}
/* Code for Wa_18019110168 may have created input/output variables beyond
* VARYING_SLOT_MAX and removed uses of variables below VARYING_SLOT_MAX.
* Clean it up, so they all stay below VARYING_SLOT_MAX.
*/
static void
brw_mesh_compact_io(nir_shader *mesh, nir_shader *frag)
{
gl_varying_slot mapping[VARYING_SLOT_MAX] = {0, };
gl_varying_slot cur = VARYING_SLOT_VAR0;
bool compact = false;
nir_foreach_shader_out_variable(var, mesh) {
gl_varying_slot location = var->data.location;
if (location < VARYING_SLOT_VAR0)
continue;
assert(location < ARRAY_SIZE(mapping));
const struct glsl_type *type = var->type;
if (nir_is_arrayed_io(var, MESA_SHADER_MESH)) {
assert(glsl_type_is_array(type));
type = glsl_get_array_element(type);
}
if (mapping[location])
continue;
unsigned num_slots = glsl_count_attribute_slots(type, false);
compact |= location + num_slots > VARYING_SLOT_MAX;
mapping[location] = cur;
cur += num_slots;
}
if (!compact)
return;
/* The rest of this function should be hit only for Wa_18019110168. */
nir_foreach_shader_out_variable(var, mesh) {
gl_varying_slot location = var->data.location;
if (location < VARYING_SLOT_VAR0)
continue;
location = mapping[location];
if (location == 0)
continue;
var->data.location = location;
}
nir_foreach_shader_in_variable(var, frag) {
gl_varying_slot location = var->data.location;
if (location < VARYING_SLOT_VAR0)
continue;
location = mapping[location];
if (location == 0)
continue;
var->data.location = location;
}
nir_shader_gather_info(mesh, nir_shader_get_entrypoint(mesh));
nir_shader_gather_info(frag, nir_shader_get_entrypoint(frag));
if (should_print_nir(mesh)) {
printf("%s\n", __func__);
nir_print_shader(mesh, stdout);
}
if (should_print_nir(frag)) {
printf("%s\n", __func__);
nir_print_shader(frag, stdout);
}
}
void
brw_nir_link_shaders(const struct brw_compiler *compiler,
nir_shader *producer, nir_shader *consumer)
{
if (producer->info.stage == MESA_SHADER_MESH &&
consumer->info.stage == MESA_SHADER_FRAGMENT) {
uint64_t fs_inputs = 0, ms_outputs = 0;
/* gl_MeshPerPrimitiveEXT[].gl_ViewportIndex, gl_PrimitiveID and gl_Layer
* are per primitive, but fragment shader does not have them marked as
* such. Add the annotation here.
*/
nir_foreach_shader_in_variable(var, consumer) {
fs_inputs |= BITFIELD64_BIT(var->data.location);
switch (var->data.location) {
case VARYING_SLOT_LAYER:
case VARYING_SLOT_PRIMITIVE_ID:
case VARYING_SLOT_VIEWPORT:
var->data.per_primitive = 1;
break;
default:
continue;
}
}
nir_foreach_shader_out_variable(var, producer)
ms_outputs |= BITFIELD64_BIT(var->data.location);
uint64_t zero_inputs = ~ms_outputs & fs_inputs;
zero_inputs &= VARYING_BIT_LAYER |
VARYING_BIT_VIEWPORT;
if (zero_inputs)
NIR_PASS(_, consumer, brw_nir_zero_inputs, &zero_inputs);
}
nir_lower_io_array_vars_to_elements(producer, consumer);
nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");
NIR_PASS(_, producer, nir_lower_io_vars_to_scalar, nir_var_shader_out);
NIR_PASS(_, consumer, nir_lower_io_vars_to_scalar, nir_var_shader_in);
/* TODO: This is part of the "pre-processing" before the shader is fed to
* brw_compile_* functions, so there's no debug archiver available yet.
* In the future runtime/driver will create one for us to use here.
*/
brw_pass_tracker pt_producer = { .nir = producer, .compiler = compiler };
brw_pass_tracker pt_consumer = { .nir = consumer, .compiler = compiler };
brw_nir_optimize(&pt_producer);
brw_nir_optimize(&pt_consumer);
if (nir_link_opt_varyings(producer, consumer))
brw_nir_optimize(&pt_consumer);
NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
if (nir_remove_unused_varyings(producer, consumer)) {
if (should_print_nir(producer)) {
printf("nir_remove_unused_varyings\n");
nir_print_shader(producer, stdout);
}
if (should_print_nir(consumer)) {
printf("nir_remove_unused_varyings\n");
nir_print_shader(consumer, stdout);
}
NIR_PASS(_, producer, nir_lower_global_vars_to_local);
NIR_PASS(_, consumer, nir_lower_global_vars_to_local);
brw_nir_optimize(&pt_producer);
brw_nir_optimize(&pt_consumer);
if (producer->info.stage == MESA_SHADER_MESH &&
consumer->info.stage == MESA_SHADER_FRAGMENT) {
brw_mesh_compact_io(producer, consumer);
}
}
NIR_PASS(_, producer, nir_opt_vectorize_io_vars, nir_var_shader_out);
if (producer->info.stage == MESA_SHADER_TESS_CTRL &&
producer->options->vectorize_tess_levels)
NIR_PASS(_, producer, nir_lower_tess_level_array_vars_to_vec);
NIR_PASS(_, producer, nir_opt_combine_stores, nir_var_shader_out);
NIR_PASS(_, consumer, nir_opt_vectorize_io_vars, nir_var_shader_in);
if (producer->info.stage != MESA_SHADER_TESS_CTRL &&
producer->info.stage != MESA_SHADER_MESH &&
producer->info.stage != MESA_SHADER_TASK) {
/* Calling lower_io_to_vector creates output variable writes with
* write-masks. On non-TCS outputs, the back-end can't handle it and we
* need to call nir_lower_io_vars_to_temporaries to get rid of them. This,
* in turn, creates temporary variables and extra copy_deref intrinsics
* that we need to clean up.
*
* Note Mesh/Task don't support I/O as temporaries (I/O is shared
* between whole workgroup, possibly using multiple HW threads). For
* those write-mask in output is handled by I/O lowering.
*/
NIR_PASS(_, producer, nir_lower_io_vars_to_temporaries,
nir_shader_get_entrypoint(producer), nir_var_shader_out);
NIR_PASS(_, producer, nir_lower_global_vars_to_local);
NIR_PASS(_, producer, nir_split_var_copies);
NIR_PASS(_, producer, nir_lower_var_copies);
}
if (producer->info.stage == MESA_SHADER_TASK &&
consumer->info.stage == MESA_SHADER_MESH) {
for (unsigned i = 0; i < 3; ++i)
assert(producer->info.mesh.ts_mesh_dispatch_dimensions[i] <= UINT16_MAX);
nir_lower_compute_system_values_options options = {
.lower_workgroup_id_to_index = true,
.num_workgroups[0] = producer->info.mesh.ts_mesh_dispatch_dimensions[0],
.num_workgroups[1] = producer->info.mesh.ts_mesh_dispatch_dimensions[1],
.num_workgroups[2] = producer->info.mesh.ts_mesh_dispatch_dimensions[2],
/* nir_lower_idiv generates expensive code */
.shortcut_1d_workgroup_id = compiler->devinfo->verx10 >= 125,
};
NIR_PASS(_, consumer, nir_lower_compute_system_values, &options);
}
}
bool
brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data)
{
/* Don't combine things to generate 64-bit loads/stores. We have to split
* those back into 32-bit ones anyway and UBO loads aren't split in NIR so
* we don't want to make a mess for the back-end.
*/
if (bit_size > 32)
return false;
if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel ||
(low->intrinsic == nir_intrinsic_load_shader_indirect_data_intel &&
low->src[0].ssa == high->src[0].ssa)) {
if (num_components > 4) {
if (bit_size != 32)
return false;
if (num_components > 32)
return false;
if (hole_size >= 8 * 4)
return false;
}
} else {
/* We can handle at most a vec4 right now. Anything bigger would get
* immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
*/
if (num_components > 4)
return false;
if (hole_size > 4)
return false;
}
const uint32_t align = nir_combined_align(align_mul, align_offset);
if (align < bit_size / 8)
return false;
return true;
}
static
bool combine_all_memory_barriers(nir_intrinsic_instr *a,
nir_intrinsic_instr *b,
void *data)
{
/* Combine control barriers with identical memory semantics. This prevents
* the second barrier generating a spurious, identical fence message as the
* first barrier.
*/
if (nir_intrinsic_memory_modes(a) == nir_intrinsic_memory_modes(b) &&
nir_intrinsic_memory_semantics(a) == nir_intrinsic_memory_semantics(b) &&
nir_intrinsic_memory_scope(a) == nir_intrinsic_memory_scope(b)) {
nir_intrinsic_set_execution_scope(a, MAX2(nir_intrinsic_execution_scope(a),
nir_intrinsic_execution_scope(b)));
return true;
}
/* Only combine pure memory barriers */
if ((nir_intrinsic_execution_scope(a) != SCOPE_NONE) ||
(nir_intrinsic_execution_scope(b) != SCOPE_NONE))
return false;
/* Translation to backend IR will get rid of modes we don't care about, so
* no harm in always combining them.
*
* TODO: While HW has only ACQUIRE|RELEASE fences, we could improve the
* scheduling so that it can take advantage of the different semantics.
*/
nir_intrinsic_set_memory_modes(a, nir_intrinsic_memory_modes(a) |
nir_intrinsic_memory_modes(b));
nir_intrinsic_set_memory_semantics(a, nir_intrinsic_memory_semantics(a) |
nir_intrinsic_memory_semantics(b));
nir_intrinsic_set_memory_scope(a, MAX2(nir_intrinsic_memory_scope(a),
nir_intrinsic_memory_scope(b)));
return true;
}
static nir_mem_access_size_align
get_mem_access_size_align(nir_intrinsic_op intrin, uint8_t bytes,
uint8_t bit_size, uint32_t align_mul, uint32_t align_offset,
bool offset_is_const, enum gl_access_qualifier access,
const void *cb_data)
{
const uint32_t align = nir_combined_align(align_mul, align_offset);
const struct brw_mem_access_cb_data *mem_cb_data =
(struct brw_mem_access_cb_data *)cb_data;
const struct intel_device_info *devinfo = mem_cb_data->devinfo;
switch (intrin) {
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_shared:
case nir_intrinsic_load_scratch:
/* The offset is constant so we can use a 32-bit load and just shift it
* around as needed.
*/
if (align < 4 && offset_is_const) {
assert(util_is_power_of_two_nonzero(align_mul) && align_mul >= 4);
const unsigned pad = align_offset % 4;
const unsigned comps32 = MIN2(DIV_ROUND_UP(bytes + pad, 4), 4);
return (nir_mem_access_size_align) {
.bit_size = 32,
.num_components = comps32,
.align = 4,
.shift = nir_mem_access_shift_method_scalar,
};
}
break;
case nir_intrinsic_load_task_payload:
if (bytes < 4 || align < 4) {
return (nir_mem_access_size_align) {
.bit_size = 32,
.num_components = 1,
.align = 4,
.shift = nir_mem_access_shift_method_scalar,
};
}
break;
default:
break;
}
const bool is_load = nir_intrinsic_infos[intrin].has_dest;
const bool is_scratch = intrin == nir_intrinsic_load_scratch ||
intrin == nir_intrinsic_store_scratch;
if (align < 4 || bytes < 4) {
/* Choose a byte, word, or dword */
bytes = MIN2(bytes, 4);
if (bytes == 3)
bytes = is_load ? 4 : 2;
if (is_scratch) {
/* The way scratch address swizzling works in the back-end, it
* happens at a DWORD granularity so we can't have a single load
* or store cross a DWORD boundary.
*/
if ((align_offset % 4) + bytes > MIN2(align_mul, 4))
bytes = MIN2(align_mul, 4) - (align_offset % 4);
/* Must be a power of two */
if (bytes == 3)
bytes = 2;
}
return (nir_mem_access_size_align) {
.bit_size = bytes * 8,
.num_components = 1,
.align = 1,
.shift = nir_mem_access_shift_method_scalar,
};
} else {
bytes = MIN2(bytes, 16);
/* With UGM LSC dataport, we don't need to lower 64bit data access into
* two 32bit single vector access since it supports direct 64bit data
* operation.
*/
if (devinfo->has_lsc && align == 8 && bit_size == 64) {
return (nir_mem_access_size_align) {
.bit_size = bit_size,
.num_components = bytes / 8,
.align = bit_size / 8,
.shift = nir_mem_access_shift_method_scalar,
};
} else {
return (nir_mem_access_size_align) {
.bit_size = 32,
.num_components = is_scratch ? 1 :
is_load ? DIV_ROUND_UP(bytes, 4) : bytes / 4,
.align = 4,
.shift = nir_mem_access_shift_method_scalar,
};
}
}
}
static bool
brw_nir_ssbo_intel_instr(nir_builder *b,
nir_intrinsic_instr *intrin,
void *cb_data)
{
switch (intrin->intrinsic) {
case nir_intrinsic_load_ssbo: {
b->cursor = nir_before_instr(&intrin->instr);
nir_def *value = nir_load_ssbo_intel(
b,
intrin->def.num_components,
intrin->def.bit_size,
intrin->src[0].ssa,
intrin->src[1].ssa,
.access = nir_intrinsic_access(intrin),
.align_mul = nir_intrinsic_align_mul(intrin),
.align_offset = nir_intrinsic_align_offset(intrin),
.base = 0);
value->loop_invariant = intrin->def.loop_invariant;
value->divergent = intrin->def.divergent;
nir_def_replace(&intrin->def, value);
return true;
}
case nir_intrinsic_store_ssbo: {
b->cursor = nir_instr_remove(&intrin->instr);
nir_store_ssbo_intel(
b,
intrin->src[0].ssa,
intrin->src[1].ssa,
intrin->src[2].ssa,
.access = nir_intrinsic_access(intrin),
.align_mul = nir_intrinsic_align_mul(intrin),
.align_offset = nir_intrinsic_align_offset(intrin),
.base = 0);
return true;
}
default:
return false;
}
}
static bool
brw_nir_ssbo_intel(nir_shader *shader)
{
return nir_shader_intrinsics_pass(shader,
brw_nir_ssbo_intel_instr,
nir_metadata_control_flow,
NULL);
}
static void
brw_vectorize_lower_mem_access(brw_pass_tracker *pt)
{
const struct intel_device_info *devinfo = pt->compiler->devinfo;
nir_load_store_vectorize_options options = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo |
nir_var_mem_global | nir_var_mem_shared |
nir_var_mem_task_payload,
.callback = brw_nir_should_vectorize_mem,
.robust_modes = (nir_variable_mode)0,
};
if (pt->key->robust_flags & BRW_ROBUSTNESS_UBO)
options.robust_modes |= nir_var_mem_ubo;
if (pt->key->robust_flags & BRW_ROBUSTNESS_SSBO)
options.robust_modes |= nir_var_mem_ssbo;
OPT(nir_opt_load_store_vectorize, &options);
/* When HW supports block loads, using the divergence analysis, try
* to find uniform SSBO loads and turn them into block loads.
*
* Rerun the vectorizer after that to make the largest possible block
* loads.
*
* This is a win on 2 fronts :
* - fewer send messages
* - reduced register pressure
*/
if (OPT(intel_nir_blockify_uniform_loads, devinfo)) {
OPT(nir_opt_load_store_vectorize, &options);
OPT(nir_opt_constant_folding);
OPT(nir_opt_copy_prop);
if (OPT(brw_nir_rebase_const_offset_ubo_loads)) {
OPT(nir_opt_cse);
OPT(nir_opt_copy_prop);
nir_load_store_vectorize_options ubo_options = {
.modes = nir_var_mem_ubo,
.callback = brw_nir_should_vectorize_mem,
.robust_modes = options.robust_modes & nir_var_mem_ubo,
};
OPT(nir_opt_load_store_vectorize, &ubo_options);
}
}
struct brw_mem_access_cb_data cb_data = {
.devinfo = devinfo,
};
nir_lower_mem_access_bit_sizes_options mem_access_options = {
.modes = nir_var_mem_ssbo |
nir_var_mem_constant |
nir_var_mem_task_payload |
nir_var_shader_temp |
nir_var_function_temp |
nir_var_mem_global |
nir_var_mem_shared,
.callback = get_mem_access_size_align,
.cb_data = &cb_data,
};
OPT(nir_lower_mem_access_bit_sizes, &mem_access_options);
OPT(nir_lower_pack);
OPT(nir_opt_copy_prop);
OPT(nir_opt_dce);
OPT(nir_opt_algebraic);
OPT(nir_opt_cse);
/* Do this after the vectorization & brw_nir_rebase_const_offset_ubo_loads
* so that we maximize the offset put into the messages.
*/
if (devinfo->ver >= 20) {
OPT(brw_nir_ssbo_intel);
const nir_opt_offsets_options offset_options = {
.buffer_max = UINT32_MAX,
.shared_max = UINT32_MAX,
.shared_atomic_max = UINT32_MAX,
};
OPT(nir_opt_offsets, &offset_options);
OPT(brw_nir_lower_immediate_offsets);
}
}
static bool
nir_shader_has_local_variables(const nir_shader *nir)
{
nir_foreach_function_impl(impl, nir) {
if (!exec_list_is_empty(&impl->locals))
return true;
}
return false;
}
static bool
lower_txd_cb(const nir_tex_instr *tex, const void *data)
{
const struct intel_device_info *devinfo = data;
int min_lod_index = nir_tex_instr_src_index(tex, nir_tex_src_min_lod);
if (tex->is_shadow && min_lod_index >= 0)
return true;
int offset_index = nir_tex_instr_src_index(tex, nir_tex_src_offset);
if (tex->is_shadow && offset_index >= 0 && min_lod_index >= 0)
return true;
/* Cases that require a sampler header and the payload is already too large
* for the HW to handle.
*/
const int sampler_offset_idx =
nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
if (min_lod_index >= 0 && sampler_offset_idx >= 0) {
if (!nir_src_is_const(tex->src[sampler_offset_idx].src) ||
(nir_src_is_const(tex->src[sampler_offset_idx].src) &&
(tex->sampler_index +
nir_src_as_uint(tex->src[sampler_offset_idx].src)) >= 16))
return true;
}
const int sampler_handle_idx =
nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
if (sampler_handle_idx >= 0 && min_lod_index >= 0)
return true;
if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
return true;
if (devinfo->verx10 >= 125) {
/* For below, See bspec 45942, "Enable new message layout for cube
* array"
*/
if (tex->sampler_dim == GLSL_SAMPLER_DIM_3D)
return true;
if (tex->is_array)
return true;
}
if (tex->is_shadow && offset_index >= 0 &&
!brw_nir_tex_offset_in_constant_range(tex, offset_index))
return true;
return false;
}
static bool
flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data)
{
switch (instr->type) {
case nir_instr_type_tex: {
nir_tex_instr *tex = nir_instr_as_tex(instr);
for (unsigned i = 0; i < tex->num_srcs; ++i) {
nir_tex_src_type src_type = tex->src[i].src_type;
/* backend2 is the packed dynamically programmable offset, goes into
* the sampler message header, so it needs to be considered for EU
* fusion.
*/
if (src_type != nir_tex_src_texture_handle &&
src_type != nir_tex_src_sampler_handle &&
src_type != nir_tex_src_texture_offset &&
src_type != nir_tex_src_sampler_offset &&
src_type != nir_tex_src_backend2)
continue;
if (nir_src_is_divergent(&tex->src[i].src)) {
tex->backend_flags |= BRW_TEX_INSTR_FUSED_EU_DISABLE;
return true;
}
}
return false;
}
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
/* We only need to care of intrinsics that refers to a structure/descriptor
* outside of the EU's registers like RENDER_SURFACE_STATE/SAMPLER_STATE,
* because the fusing will pick one thread's descriptor handle and use that
* for the 2 fused threads.
*
* Global pointers don't have that problem since all the access' data is
* per lane in the payload of the SEND message (the 64bit pointer).
*
* URB/shared-memory don't have that problem either because there is no
* descriptor information outside the EU, it's just a per lane
* handle/offset.
*/
switch (intrin->intrinsic) {
case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_load_ubo_uniform_block_intel:
case nir_intrinsic_load_ssbo_block_intel:
case nir_intrinsic_load_ssbo_intel:
case nir_intrinsic_store_ssbo_intel:
case nir_intrinsic_load_ssbo:
case nir_intrinsic_store_ssbo:
case nir_intrinsic_get_ssbo_size:
case nir_intrinsic_load_ubo:
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
case nir_intrinsic_image_size:
case nir_intrinsic_image_levels:
case nir_intrinsic_image_atomic:
case nir_intrinsic_image_atomic_swap:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_store:
case nir_intrinsic_bindless_image_size:
case nir_intrinsic_bindless_image_levels:
case nir_intrinsic_bindless_image_atomic:
case nir_intrinsic_bindless_image_atomic_swap: {
int src_idx = nir_get_io_index_src_number(intrin);
assert(src_idx >= 0);
if (nir_src_is_divergent(&intrin->src[src_idx])) {
nir_intrinsic_set_access(intrin,
nir_intrinsic_access(intrin) |
ACCESS_FUSED_EU_DISABLE_INTEL);
return true;
}
return false;
}
default:
return false;
}
}
default:
return false;
}
}
static void
brw_nir_lower_int64(brw_pass_tracker *pt)
{
/* Potentially perform this optimization pass twice because it can create
* additional opportunities for itself.
*/
if (OPT(nir_opt_algebraic_before_lower_int64))
OPT(nir_opt_algebraic_before_lower_int64);
if (OPT(nir_lower_int64))
brw_nir_optimize(pt);
}
/* Prepare the given shader for codegen
*
* This function is intended to be called right before going into the actual
* backend and is highly backend-specific.
*/
void
brw_postprocess_nir_opts(brw_pass_tracker *pt)
{
const struct brw_compiler *compiler = pt->compiler;
const struct intel_device_info *devinfo = compiler->devinfo;
nir_shader *nir = pt->nir;
const nir_lower_tex_options tex_options = {
.lower_txp = ~0,
.lower_txf_offset = true,
.lower_rect_offset = true,
.lower_txb_shadow_clamp = true,
.lower_tg4_offsets = true,
.lower_txs_lod = true, /* Wa_14012320009 */
.lower_offset_filter =
devinfo->verx10 >= 125 ? lower_xehp_tg4_offset_filter : NULL,
.lower_invalid_implicit_lod = true,
.lower_index_to_offset = true,
.lower_txd_cb = lower_txd_cb,
.lower_txd_data = devinfo,
};
/* In the case where TG4 coords are lowered to offsets and we have a
* lower_xehp_tg4_offset_filter lowering those offsets further, we need to
* rerun the pass because the instructions inserted by the first lowering
* are not visible during that first pass.
*/
if (OPT(nir_lower_tex, &tex_options))
OPT(nir_lower_tex, &tex_options);
OPT(brw_nir_lower_mcs_fetch, devinfo);
OPT(intel_nir_lower_sparse_intrinsics);
/* Any constants leftover should be folded so we have constant textures */
OPT(nir_opt_constant_folding);
/* Needs to happen before the backend opcode selection */
OPT(brw_nir_pre_lower_texture, devinfo);
/* Needs to happen before the texture lowering */
OPT(brw_nir_texture_backend_opcode, devinfo);
OPT(brw_nir_lower_texture);
OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)devinfo);
OPT(nir_opt_combine_barriers, combine_all_memory_barriers, NULL);
while (OPT(nir_opt_algebraic_before_ffma)) {}
OPT(nir_opt_idiv_const, 32);
if (devinfo->verx10 >= 125) {
/* Lower integer division by constants before nir_lower_idiv. */
const nir_lower_idiv_options options = {
.allow_fp16 = false
};
/* Given an 8-bit integer remainder, nir_lower_idiv will produce new
* 8-bit integer math which needs to be lowered.
*/
if (OPT(nir_lower_idiv, &options))
OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)devinfo);
}
if (devinfo->ver >= 30)
NIR_PASS(_, nir, brw_nir_lower_sample_index_in_coord);
if (mesa_shader_stage_can_set_fragment_shading_rate(nir->info.stage))
NIR_PASS(_, nir, intel_nir_lower_shading_rate_output);
OPT(brw_nir_tag_speculative_access);
brw_nir_optimize(pt);
if (nir_shader_has_local_variables(nir)) {
OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
glsl_get_natural_size_align_bytes);
OPT(nir_lower_explicit_io, nir_var_function_temp,
nir_address_format_32bit_offset);
brw_nir_optimize(pt);
}
brw_vectorize_lower_mem_access(pt);
/* Fence LSC SLM writes to avoid workgroups WaW hazards to the same SLM
* location.
*/
if (devinfo->has_lsc &&
mesa_shader_stage_uses_workgroup(nir->info.stage))
OPT(brw_nir_fence_shared_stores);
/* Do this after lowering memory access bit-sizes */
if (nir->info.stage == MESA_SHADER_MESH ||
nir->info.stage == MESA_SHADER_TASK) {
OPT(lower_task_payload_to_urb_intrinsics, devinfo);
brw_nir_opt_vectorize_urb(pt);
}
/* Needs to be prior int64 lower because it generates 64bit address
* manipulations
*/
OPT(intel_nir_lower_printf);
brw_nir_lower_int64(pt);
/* This pass specifically looks for sequences of fmul and fadd that
* intel_nir_opt_peephole_ffma will try to eliminate. Call this
* reassociation pass first.
*/
OPT(nir_opt_reassociate_matrix_mul);
/* Try and fuse multiply-adds, if successful, run shrink_vectors to
* avoid peephole_ffma to generate things like this :
* vec16 ssa_0 = ...
* vec16 ssa_1 = fneg ssa_0
* vec1 ssa_2 = ffma ssa_1, ...
*
* We want this instead :
* vec16 ssa_0 = ...
* vec1 ssa_1 = fneg ssa_0.x
* vec1 ssa_2 = ffma ssa_1, ...
*/
if (OPT(intel_nir_opt_peephole_ffma))
OPT(nir_opt_shrink_vectors, false);
OPT(intel_nir_opt_peephole_imul32x16);
OPT(nir_opt_generate_bfi);
OPT(nir_opt_reassociate_bfi);
if (OPT(nir_opt_comparison_pre)) {
OPT(nir_opt_copy_prop);
OPT(nir_opt_dce);
OPT(nir_opt_cse);
/* Do the select peepehole again. nir_opt_comparison_pre (combined with
* the other optimization passes) will have removed at least one
* instruction from one of the branches of the if-statement, so now it
* might be under the threshold of conversion to bcsel.
*/
nir_opt_peephole_select_options peephole_select_options = {
.limit = 1,
.expensive_alu_ok = true,
};
OPT(nir_opt_peephole_select, &peephole_select_options);
}
OPT(brw_nir_lower_fsign);
OPT(brw_nir_opt_fsat);
OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
if (OPT(nir_lower_alu_to_scalar, NULL, NULL))
OPT(nir_opt_constant_folding);
OPT(nir_opt_copy_prop);
OPT(nir_opt_dce);
OPT(nir_opt_cse);
nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
nir_move_load_input | nir_move_comparisons |
nir_move_copies | nir_move_load_ssbo |
nir_move_alu;
OPT(nir_opt_sink, move_all);
OPT(nir_opt_move, move_all);
OPT(nir_opt_dead_cf);
static const nir_lower_subgroups_options subgroups_options = {
.ballot_bit_size = 32,
.ballot_components = 1,
.lower_elect = true,
.lower_subgroup_masks = true,
};
if (OPT(nir_opt_uniform_atomics, false))
OPT(nir_lower_subgroups, &subgroups_options);
if (pt->key->divergent_atomics_flags)
OPT(brw_nir_opt_divergent_atomics, pt->key->divergent_atomics_flags);
/* nir_opt_uniform_subgroup can create some operations (e.g.,
* load_subgroup_lt_mask) that need to be lowered again.
*/
if (OPT(nir_opt_uniform_subgroup, &subgroups_options)) {
/* nir_opt_uniform_subgroup may have made some things
* that previously appeared divergent be marked as convergent. This
* allows the elimination of some loops over, say, a TXF instruction
* with a non-uniform texture handle.
*/
brw_nir_optimize(pt);
OPT(nir_lower_subgroups, &subgroups_options);
}
/* A few passes that run after the initial int64 lowering may produce
* new int64 operations. E.g. uniform subgroup may generate a 64-bit mul
* and peephole_select may generate a 64-bit select. So do another
* round at the tail end.
*/
brw_nir_lower_int64(pt);
do {
pt->progress = false;
OPT(nir_opt_algebraic_late);
OPT(nir_opt_algebraic_distribute_src_mods);
if (pt->progress) {
OPT(nir_opt_constant_folding);
OPT(nir_opt_copy_prop);
OPT(nir_opt_dce);
OPT(nir_opt_cse);
}
} while (pt->progress);
/* Deal with EU fusion */
if (devinfo->ver == 12) {
nir_divergence_options options =
nir_divergence_across_subgroups |
nir_divergence_multiple_workgroup_per_compute_subgroup;
nir_foreach_function_impl(impl, nir) {
nir_divergence_analysis_impl(impl, options);
impl->valid_metadata |= nir_metadata_divergence;
}
nir_shader_instructions_pass(nir,
flag_fused_eu_disable_instr,
nir_metadata_all, NULL);
/* We request a special divergence information which is not needed
* after.
*/
nir_foreach_function_impl(impl, nir) {
nir_progress(true, impl, ~nir_metadata_divergence);
}
}
}
void
brw_postprocess_nir_out_of_ssa(brw_pass_tracker *pt,
bool debug_enabled)
{
nir_shader *nir = pt->nir;
/* Run fsign lowering again after the last time brw_nir_optimize is called.
* As is the case with conversion lowering (below), brw_nir_optimize can
* create additional fsign instructions.
*/
if (OPT(brw_nir_lower_fsign))
OPT(nir_opt_dce);
/* Run nir_split_conversions only after the last tiem
* brw_nir_optimize is called. Various optimizations invoked there can
* rematerialize the conversions that the lowering pass eliminates.
*/
const nir_split_conversions_options split_conv_opts = {
.callback = intel_nir_split_conversions_cb,
};
OPT(nir_split_conversions, &split_conv_opts);
/* Do this only after the last opt_gcm. GCM will undo this lowering. */
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
OPT(intel_nir_lower_non_uniform_barycentric_at_sample);
}
OPT(nir_lower_bool_to_int32);
OPT(nir_opt_copy_prop);
OPT(nir_opt_dce);
OPT(nir_lower_locals_to_regs, 32);
nir_validate_ssa_dominance(nir, "before nir_convert_from_ssa");
/* Rerun the divergence analysis before convert_from_ssa as this pass has
* some assert on consistent divergence flags.
*/
OPT(nir_convert_to_lcssa, true, true);
nir_divergence_analysis(nir);
if (unlikely(debug_enabled || pt->archiver)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
if (debug_enabled) {
fprintf(stderr, "NIR (SSA form) for %s shader:\n",
_mesa_shader_stage_to_string(nir->info.stage));
nir_print_shader(nir, stderr);
}
BRW_NIR_SNAPSHOT("ssa");
}
OPT(nir_convert_from_ssa, true, true);
OPT(nir_opt_rematerialize_compares);
OPT(nir_opt_dce);
nir_trivialize_registers(nir);
nir_sweep(nir);
if (unlikely(debug_enabled)) {
fprintf(stderr, "NIR (final form) for %s shader:\n",
_mesa_shader_stage_to_string(nir->info.stage));
nir_print_shader(nir, stderr);
}
BRW_NIR_SNAPSHOT("out");
}
static unsigned
get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
{
if (info->api_subgroup_size) {
/* We have to use the global/required constant size. */
assert(info->api_subgroup_size >= 8 && info->api_subgroup_size <= 32);
return info->api_subgroup_size;
} else if (info->api_subgroup_size_draw_uniform) {
/* It has to be uniform across all invocations but can vary per stage
* if we want. This gives us a bit more freedom.
*
* For compute, brw_nir_apply_key is called per-dispatch-width so this
* is the actual subgroup size and not a maximum. However, we only
* invoke one size of any given compute shader so it's still guaranteed
* to be uniform across invocations.
*/
return max_subgroup_size;
} else {
/* The subgroup size is allowed to be fully varying. For geometry
* stages, we know it's always 8 which is max_subgroup_size so we can
* return that. For compute, brw_nir_apply_key is called once per
* dispatch-width so max_subgroup_size is the real subgroup size.
*
* For fragment, we return 0 and let it fall through to the back-end
* compiler. This means we can't optimize based on subgroup size but
* that's a risk the client took when it asked for a varying subgroup
* size.
*/
return info->stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
}
}
unsigned
brw_nir_api_subgroup_size(const nir_shader *nir,
unsigned hw_subgroup_size)
{
return get_subgroup_size(&nir->info, hw_subgroup_size);
}
void
brw_nir_apply_key(brw_pass_tracker *pt,
const struct brw_base_prog_key *key,
unsigned max_subgroup_size)
{
nir_shader *nir = pt->nir;
pt->progress = false;
const nir_lower_subgroups_options subgroups_options = {
.subgroup_size = get_subgroup_size(&nir->info, max_subgroup_size),
.ballot_bit_size = 32,
.ballot_components = 1,
.lower_subgroup_masks = true,
};
OPT(nir_lower_subgroups, &subgroups_options);
if (key->limit_trig_input_range)
OPT(brw_nir_limit_trig_input_range_workaround);
if (pt->progress)
brw_nir_optimize(pt);
}
enum brw_conditional_mod
brw_cmod_for_nir_comparison(nir_op op)
{
switch (op) {
case nir_op_flt:
case nir_op_flt32:
case nir_op_ilt:
case nir_op_ilt32:
case nir_op_ult:
case nir_op_ult32:
return BRW_CONDITIONAL_L;
case nir_op_fge:
case nir_op_fge32:
case nir_op_ige:
case nir_op_ige32:
case nir_op_uge:
case nir_op_uge32:
return BRW_CONDITIONAL_GE;
case nir_op_feq:
case nir_op_feq32:
case nir_op_ieq:
case nir_op_ieq32:
case nir_op_b32all_fequal2:
case nir_op_b32all_iequal2:
case nir_op_b32all_fequal3:
case nir_op_b32all_iequal3:
case nir_op_b32all_fequal4:
case nir_op_b32all_iequal4:
return BRW_CONDITIONAL_Z;
case nir_op_fneu:
case nir_op_fneu32:
case nir_op_ine:
case nir_op_ine32:
case nir_op_b32any_fnequal2:
case nir_op_b32any_inequal2:
case nir_op_b32any_fnequal3:
case nir_op_b32any_inequal3:
case nir_op_b32any_fnequal4:
case nir_op_b32any_inequal4:
return BRW_CONDITIONAL_NZ;
default:
UNREACHABLE("Unsupported NIR comparison op");
}
}
enum lsc_opcode
lsc_op_for_nir_intrinsic(const nir_intrinsic_instr *intrin)
{
switch (intrin->intrinsic) {
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_ssbo_intel:
case nir_intrinsic_load_shared:
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_block_intel:
case nir_intrinsic_load_global_constant:
case nir_intrinsic_load_global_constant_uniform_block_intel:
case nir_intrinsic_load_shared_block_intel:
case nir_intrinsic_load_shared_uniform_block_intel:
case nir_intrinsic_load_ssbo_block_intel:
case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_load_ubo_uniform_block_intel:
case nir_intrinsic_load_scratch:
case nir_intrinsic_load_shader_indirect_data_intel:
return LSC_OP_LOAD;
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_ssbo_intel:
case nir_intrinsic_store_shared:
case nir_intrinsic_store_global:
case nir_intrinsic_store_global_block_intel:
case nir_intrinsic_store_shared_block_intel:
case nir_intrinsic_store_ssbo_block_intel:
case nir_intrinsic_store_scratch:
return LSC_OP_STORE;
case nir_intrinsic_image_load:
case nir_intrinsic_bindless_image_load:
return nir_intrinsic_image_dim(intrin) == GLSL_SAMPLER_DIM_MS ?
LSC_OP_LOAD_CMASK_MSRT :
LSC_OP_LOAD_CMASK;
case nir_intrinsic_image_store:
case nir_intrinsic_bindless_image_store:
return nir_intrinsic_image_dim(intrin) == GLSL_SAMPLER_DIM_MS ?
LSC_OP_STORE_CMASK_MSRT :
LSC_OP_STORE_CMASK;
default:
assert(nir_intrinsic_has_atomic_op(intrin));
break;
}
switch (nir_intrinsic_atomic_op(intrin)) {
case nir_atomic_op_iadd: {
unsigned src_idx;
switch (intrin->intrinsic) {
case nir_intrinsic_image_atomic:
case nir_intrinsic_bindless_image_atomic:
src_idx = 3;
break;
case nir_intrinsic_ssbo_atomic:
src_idx = 2;
break;
case nir_intrinsic_shared_atomic:
case nir_intrinsic_global_atomic:
src_idx = 1;
break;
default:
UNREACHABLE("Invalid add atomic opcode");
}
if (nir_src_is_const(intrin->src[src_idx])) {
int64_t add_val = nir_src_as_int(intrin->src[src_idx]);
if (add_val == 1)
return LSC_OP_ATOMIC_INC;
else if (add_val == -1)
return LSC_OP_ATOMIC_DEC;
}
return LSC_OP_ATOMIC_ADD;
}
case nir_atomic_op_imin: return LSC_OP_ATOMIC_MIN;
case nir_atomic_op_umin: return LSC_OP_ATOMIC_UMIN;
case nir_atomic_op_imax: return LSC_OP_ATOMIC_MAX;
case nir_atomic_op_umax: return LSC_OP_ATOMIC_UMAX;
case nir_atomic_op_iand: return LSC_OP_ATOMIC_AND;
case nir_atomic_op_ior: return LSC_OP_ATOMIC_OR;
case nir_atomic_op_ixor: return LSC_OP_ATOMIC_XOR;
case nir_atomic_op_xchg: return LSC_OP_ATOMIC_STORE;
case nir_atomic_op_cmpxchg: return LSC_OP_ATOMIC_CMPXCHG;
case nir_atomic_op_fmin: return LSC_OP_ATOMIC_FMIN;
case nir_atomic_op_fmax: return LSC_OP_ATOMIC_FMAX;
case nir_atomic_op_fcmpxchg: return LSC_OP_ATOMIC_FCMPXCHG;
case nir_atomic_op_fadd: return LSC_OP_ATOMIC_FADD;
default:
UNREACHABLE("Unsupported NIR atomic intrinsic");
}
}
enum brw_reg_type
brw_type_for_base_type(enum glsl_base_type base_type)
{
switch (base_type) {
case GLSL_TYPE_UINT: return BRW_TYPE_UD;
case GLSL_TYPE_INT: return BRW_TYPE_D;
case GLSL_TYPE_FLOAT: return BRW_TYPE_F;
case GLSL_TYPE_FLOAT16: return BRW_TYPE_HF;
case GLSL_TYPE_BFLOAT16: return BRW_TYPE_BF;
case GLSL_TYPE_FLOAT_E4M3FN: return BRW_TYPE_HF8;
case GLSL_TYPE_FLOAT_E5M2: return BRW_TYPE_BF8;
case GLSL_TYPE_DOUBLE: return BRW_TYPE_DF;
case GLSL_TYPE_UINT16: return BRW_TYPE_UW;
case GLSL_TYPE_INT16: return BRW_TYPE_W;
case GLSL_TYPE_UINT8: return BRW_TYPE_UB;
case GLSL_TYPE_INT8: return BRW_TYPE_B;
case GLSL_TYPE_UINT64: return BRW_TYPE_UQ;
case GLSL_TYPE_INT64: return BRW_TYPE_Q;
default:
UNREACHABLE("invalid base type");
}
}
enum brw_reg_type
brw_type_for_nir_type(const struct intel_device_info *devinfo,
nir_alu_type type)
{
switch (type) {
case nir_type_uint:
case nir_type_uint32:
return BRW_TYPE_UD;
case nir_type_bool:
case nir_type_int:
case nir_type_bool32:
case nir_type_int32:
return BRW_TYPE_D;
case nir_type_float:
case nir_type_float32:
return BRW_TYPE_F;
case nir_type_float16:
return BRW_TYPE_HF;
case nir_type_float64:
return BRW_TYPE_DF;
case nir_type_int64:
return BRW_TYPE_Q;
case nir_type_uint64:
return BRW_TYPE_UQ;
case nir_type_int16:
return BRW_TYPE_W;
case nir_type_uint16:
return BRW_TYPE_UW;
case nir_type_int8:
return BRW_TYPE_B;
case nir_type_uint8:
return BRW_TYPE_UB;
default:
UNREACHABLE("unknown type");
}
return BRW_TYPE_F;
}
nir_def *
brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load,
nir_def *base_addr, unsigned off)
{
assert(load->intrinsic == nir_intrinsic_load_push_data_intel);
unsigned bit_size = load->def.bit_size;
assert(bit_size >= 8 && bit_size % 8 == 0);
nir_def *sysval;
if (nir_src_is_const(load->src[0])) {
uint64_t offset = off +
nir_intrinsic_base(load) +
nir_src_as_uint(load->src[0]);
/* Things should be component-aligned. */
assert(offset % (bit_size / 8) == 0);
unsigned suboffset = offset % 64;
uint64_t aligned_offset = offset - suboffset;
/* Load two just in case we go over a 64B boundary */
nir_def *data[2];
for (unsigned i = 0; i < 2; i++) {
nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
data[i] = nir_load_global_constant_uniform_block_intel(
b, 16, 32, addr,
.access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
.align_mul = 64);
}
sysval = nir_extract_bits(b, data, 2, suboffset * 8,
load->num_components, bit_size);
} else {
nir_def *offset32 =
nir_iadd_imm(b, load->src[0].ssa,
off + nir_intrinsic_base(load));
nir_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset32));
sysval = nir_load_global_constant(b, load->num_components, bit_size, addr);
}
return sysval;
}
const struct glsl_type *
brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
{
const struct glsl_type *type = var->interface_type;
if (!type) {
type = var->type;
if (nir_is_arrayed_io(var, nir->info.stage)) {
assert(glsl_type_is_array(type));
type = glsl_get_array_element(type);
}
}
return type;
}
bool
brw_nir_uses_inline_data(nir_shader *shader)
{
nir_foreach_function_impl(impl, shader) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_inline_data_intel)
continue;
return true;
}
}
}
return false;
}
/**
* Move load_interpolated_input with simple (payload-based) barycentric modes
* to the top of the program so we don't emit multiple PLNs for the same input.
*
* This works around CSE not being able to handle non-dominating cases
* such as:
*
* if (...) {
* interpolate input
* } else {
* interpolate the same exact input
* }
*
* This should be replaced by global value numbering someday.
*/
bool
brw_nir_move_interpolation_to_top(nir_shader *nir)
{
bool progress = false;
nir_foreach_function_impl(impl, nir) {
nir_block *top = fragment_top_block_or_after_wa_18019110168(impl);
nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
bool impl_progress = false;
for (nir_block *block = nir_block_cf_tree_next(top);
block != NULL;
block = nir_block_cf_tree_next(block)) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
continue;
nir_intrinsic_instr *bary_intrinsic =
nir_def_as_intrinsic(intrin->src[0].ssa);
nir_intrinsic_op op = bary_intrinsic->intrinsic;
/* Leave interpolateAtSample/Offset() where they are. */
if (op == nir_intrinsic_load_barycentric_at_sample ||
op == nir_intrinsic_load_barycentric_at_offset)
continue;
nir_instr *move[3] = {
&bary_intrinsic->instr,
nir_def_instr(intrin->src[1].ssa),
instr
};
for (unsigned i = 0; i < ARRAY_SIZE(move); i++)
nir_instr_move(cursor, move[i]);
impl_progress = true;
}
}
progress = progress || impl_progress;
nir_progress(impl_progress, impl, nir_metadata_control_flow);
}
return progress;
}
static bool
filter_simd(const nir_instr *instr, UNUSED const void *options)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
case nir_intrinsic_load_simd_width_intel:
case nir_intrinsic_load_subgroup_id:
return true;
default:
return false;
}
}
static nir_def *
lower_simd(nir_builder *b, nir_instr *instr, void *options)
{
uintptr_t simd_width = (uintptr_t)options;
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
case nir_intrinsic_load_simd_width_intel:
return nir_imm_int(b, simd_width);
case nir_intrinsic_load_subgroup_id:
/* If the whole workgroup fits in one thread, we can lower subgroup_id
* to a constant zero.
*/
if (!b->shader->info.workgroup_size_variable) {
unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
b->shader->info.workgroup_size[1] *
b->shader->info.workgroup_size[2];
if (local_workgroup_size <= simd_width)
return nir_imm_int(b, 0);
}
return NULL;
default:
return NULL;
}
}
bool
brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
{
return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
(void *)(uintptr_t)dispatch_width);
}
nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader *shader,
nir_variable_mode mode,
int location)
{
nir_variable *best_var = NULL;
unsigned last_size = 0;
nir_foreach_variable_with_modes(var, shader, mode) {
if (var->data.location != location)
continue;
unsigned new_size = glsl_count_dword_slots(var->type, false);
if (new_size > last_size) {
best_var = var;
last_size = new_size;
}
}
return best_var;
}
struct brw_quick_pressure_state {
uint8_t *convergent_size;
uint8_t *divergent_size;
struct u_sparse_bitset live;
unsigned curr_convergent_size;
unsigned curr_divergent_size;
};
static inline bool
record_def_size(nir_def *def, void *v_state)
{
struct brw_quick_pressure_state *state = v_state;
unsigned num_components = def->num_components;
/* Texturing has return length reduction */
if (nir_def_is_tex(def))
num_components = util_last_bit(nir_def_components_read(def));
/* Assume tightly packed */
unsigned size = DIV_ROUND_UP(num_components * def->bit_size, 32);
nir_op alu_op =
nir_def_is_alu(def) ?
nir_def_as_alu(def)->op : nir_num_opcodes;
/* Assume these are handled via source modifiers */
if (alu_op == nir_op_fneg || alu_op == nir_op_ineg ||
alu_op == nir_op_fabs || alu_op == nir_op_iabs)
size = 0;
if (nir_def_is_unused(def))
size = 0;
if (def->divergent) {
state->convergent_size[def->index] = 0;
state->divergent_size[def->index] = size;
} else {
state->convergent_size[def->index] = size;
state->divergent_size[def->index] = 0;
}
return true;
}
static bool
set_src_live(nir_src *src, void *v_state)
{
struct brw_quick_pressure_state *state = v_state;
/* undefined variables are never live */
if (nir_src_is_undef(*src))
return true;
if (!u_sparse_bitset_test(&state->live, src->ssa->index)) {
u_sparse_bitset_set(&state->live, src->ssa->index);
/* This value just became live, add its size */
state->curr_convergent_size += state->convergent_size[src->ssa->index];
state->curr_divergent_size += state->divergent_size[src->ssa->index];
}
return true;
}
static bool
set_def_dead(nir_def *def, void *v_state)
{
struct brw_quick_pressure_state *state = v_state;
if (u_sparse_bitset_test(&state->live, def->index)) {
u_sparse_bitset_clear(&state->live, def->index);
/* This value just became dead, subtract its size */
state->curr_convergent_size -= state->convergent_size[def->index];
state->curr_divergent_size -= state->divergent_size[def->index];
}
return true;
}
static void
quick_pressure_estimate(nir_shader *nir,
unsigned *out_convergent_size,
unsigned *out_divergent_size)
{
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
nir_metadata_require(impl, nir_metadata_divergence |
nir_metadata_live_defs);
struct brw_quick_pressure_state state = {
.convergent_size = calloc(impl->ssa_alloc, sizeof(uint8_t)),
.divergent_size = calloc(impl->ssa_alloc, sizeof(uint8_t)),
};
u_sparse_bitset_init(&state.live, impl->ssa_alloc, NULL);
unsigned max_convergent_size = 0, max_divergent_size = 0;
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
nir_foreach_def(instr, record_def_size, &state);
}
state.curr_convergent_size = 0;
state.curr_divergent_size = 0;
/* Start with sizes for anything live-out from the block */
U_SPARSE_BITSET_FOREACH_SET(&block->live_out, i) {
state.curr_convergent_size += state.convergent_size[i];
state.curr_divergent_size += state.divergent_size[i];
}
/* Walk backwards, add source sizes on first sight, subtract on def */
u_sparse_bitset_dup(&state.live, &block->live_out);
nir_foreach_instr_reverse(instr, block) {
if (instr->type == nir_instr_type_phi)
break;
nir_foreach_def(instr, set_def_dead, &state);
nir_foreach_src(instr, set_src_live, &state);
max_convergent_size =
MAX2(max_convergent_size, state.curr_convergent_size);
max_divergent_size =
MAX2(max_divergent_size, state.curr_divergent_size);
}
}
*out_convergent_size = max_convergent_size;
*out_divergent_size = max_divergent_size;
free(state.convergent_size);
free(state.divergent_size);
u_sparse_bitset_free(&state.live);
}
/**
* This pass performs a quick/rough estimate of register pressure in
* SIMD8/16/32 modes, based on how many convergent and divergent values
* exist in the SSA NIR program. Divergent values scale up with SIMD
* width, while convergent ones do not.
*
* This is fundamentally inaccurate, and can't model everything properly.
* We try to err toward underestimating the register pressure. The hope
* is to use this for things like "is it worth even trying to compile a
* SIMD<X> shader, or will it ultimately fail?" If a lower bound on the
* pressure is too high, we can skip all the CPU overhead from invoking
* the backend compiler to try. If it's close though, we'd rather say
* to go ahead and try it rather than lose out on potential benefits of
* larger SIMD sizes.
*/
void
brw_nir_quick_pressure_estimate(nir_shader *nir,
const struct intel_device_info *devinfo,
unsigned simd_estimate[3])
{
unsigned convergent_size, divergent_size;
quick_pressure_estimate(nir, &convergent_size, &divergent_size);
/* Xe2 starts at SIMD16, rather than SIMD8 */
simd_estimate[0] = 0;
unsigned base_simd = devinfo->ver >= 20 ? 1 : 0;
for (unsigned i = base_simd; i < 3; i++) {
simd_estimate[i] = DIV_ROUND_UP(convergent_size, 8 << base_simd) +
divergent_size * (1 << (i - base_simd));
}
}
unsigned
brw_nir_pack_vs_input(nir_shader *nir, struct brw_vs_prog_data *prog_data)
{
struct vf_attribute {
unsigned reg_offset;
uint8_t component_mask;
bool is_64bit:1;
bool is_used:1;
} attributes[MAX_HW_VERT_ATTRIB] = {};
/* IO lowering is going to break dmat inputs into a location each, so we
* need to reproduce the 64bit nature of the variable into each slot.
*/
nir_foreach_shader_in_variable(var, nir) {
const bool is_64bit = glsl_type_is_64bit(var->type);
const uint32_t slots = glsl_count_vec4_slots(var->type, true, false);
for (uint32_t i = 0; i < slots; i++)
attributes[var->data.location + i].is_64bit = is_64bit;
}
/* First mark all used inputs */
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_input)
continue;
assert(intrin->def.bit_size == 32);
const struct nir_io_semantics io =
nir_intrinsic_io_semantics(intrin);
attributes[io.location].is_used = true;
/* SKL PRMs, Vol 2a: Command Reference: Instructions,
* 3DSTATE_VF_COMPONENT_PACKING:
*
* "Software shall enable all components (XYZW) for any and all
* VERTEX_ELEMENTs associated with a 256-bit SURFACE_FORMAT.
* It is INVALID to disable any components in these cases."
*
* Enable this XYZW for any > 128-bit format.
*/
if (nir->info.dual_slot_inputs & BITFIELD64_BIT(io.location)) {
attributes[io.location].component_mask |= 0xff;
} else {
const uint8_t mask =
nir_component_mask(intrin->num_components) <<
nir_intrinsic_component(intrin);
attributes[io.location].component_mask |= mask;
}
}
}
}
/* SKL PRMs, Vol 2a: Command Reference: Instructions,
* 3DSTATE_VF_COMPONENT_PACKING:
*
* "At least one component of one "valid" Vertex Element must be
* enabled."
*/
if (nir->info.inputs_read == 0) {
if (prog_data->no_vf_slot_compaction) {
attributes[VERT_ATTRIB_GENERIC0].is_used = true;
attributes[VERT_ATTRIB_GENERIC0].component_mask = 0x1;
} else if (!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW) &&
!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) &&
!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) &&
!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) &&
!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID) &&
!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID)) {
attributes[VERT_ATTRIB_GENERIC0].is_used = true;
attributes[VERT_ATTRIB_GENERIC0].component_mask = 0x1;
}
}
/* Compute the register offsets */
unsigned reg_offset = 0;
unsigned vertex_element = 0;
for (unsigned a = 0; a < ARRAY_SIZE(attributes); a++) {
if (!attributes[a].is_used)
continue;
/* SKL PRMs, Vol 2a: Command Reference: Instructions,
* 3DSTATE_VF_COMPONENT_PACKING:
*
* "No enable bits are provided for Vertex Elements [32-33],
* and therefore no packing is performed on these elements (if
* Valid, all 4 components are stored)."
*/
if (vertex_element >= 32 ||
(prog_data->no_vf_slot_compaction && a >= VERT_ATTRIB_GENERIC(32)))
attributes[a].component_mask = 0xf;
attributes[a].reg_offset = reg_offset;
reg_offset += util_bitcount(attributes[a].component_mask);
vertex_element++;
}
/* Remap inputs */
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_input)
continue;
struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin);
unsigned slot = attributes[io.location].reg_offset / 4;
unsigned slot_component =
attributes[io.location].reg_offset % 4 +
util_bitcount(attributes[io.location].component_mask &
BITFIELD_MASK(io.high_dvec2 * 4 +
nir_intrinsic_component(intrin)));
slot += slot_component / 4;
slot_component %= 4;
nir_intrinsic_set_base(intrin, slot);
nir_intrinsic_set_component(intrin, slot_component);
/* The code above generates load_input with
* "component + num_component > 4", which is theoretically illegal.
*/
io.no_validate = 1;
nir_intrinsic_set_io_semantics(intrin, io);
}
}
}
/* Generate the packing array, we start from the first application
* attribute : VERT_ATTRIB_GENERIC0
*/
unsigned vf_element_count = 0;
for (unsigned a = VERT_ATTRIB_GENERIC0; a < ARRAY_SIZE(attributes) && vf_element_count < 32; a++) {
/* Consider all attributes used when no slot compaction is active */
if (!attributes[a].is_used && !prog_data->no_vf_slot_compaction)
continue;
uint32_t mask;
/* Stores masks in attributes[a].component_mask are in terms of 32-bit
* components, but the HW depending on the format will interpret
* prog_data->vf_component_packing[] bits as either a 32-bit or 64-bit
* component. So we need to only consider every other bit.
*/
if (attributes[a].is_64bit) {
mask = 0;
u_foreach_bit(b, attributes[a].component_mask)
mask |= BITFIELD_BIT(b / 2);
} else {
mask = attributes[a].component_mask;
}
/* We should only have 4bits enabled max */
assert((mask & ~0xfu) == 0);
prog_data->vf_component_packing[vf_element_count / 8] |=
mask << (4 * (vf_element_count % 8));
vf_element_count++;
}
nir_validate_shader(nir, __func__);
return reg_offset;
}