jay: split up jay_from_nir.c

Big monolithic file, split it up into the relevant pieces.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40960>
Alyssa Rosenzweig 2026-04-14 11:25:11 -04:00 committed by Marge Bot
parent 6925d9ee23
commit 4eb838eb48
6 changed files with 1149 additions and 1094 deletions

jay_from_nir.c: file diff suppressed because it is too large

jay_insert_fp_mode.c

@@ -0,0 +1,85 @@
/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "jay_builder.h"
#include "jay_ir.h"
static void
set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired)
{
/* Only touch cr0 if we are changing bits */
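/* Writing (*cr0 ^ desired) as the XOR immediate flips exactly the bits that
* differ between the current and desired state, so e.g. changing only the
* rounding mode leaves the denorm-preserve bits alone.
*/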
if ((*cr0) != desired) {
jay_builder b = jay_init_builder(f, cursor);
jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired);
*cr0 = desired;
}
}
void
jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes)
{
/* First, work out the global float control mode for the shader */
uint32_t global = 0x0;
/* Initially fp16 denorms are flushed-to-zero, handle preserve. */
if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) {
global |= BRW_CR0_FP16_DENORM_PRESERVE;
}
/* Initially fp32 denorms are flushed-to-zero, handle preserve.
*
* TODO: Optimize this, we have a dispatch bit.
*/
if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) {
global |= BRW_CR0_FP32_DENORM_PRESERVE;
}
/* Initially fp64 denorms are flushed to zero, handle preserve. */
if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) {
global |= BRW_CR0_FP64_DENORM_PRESERVE;
}
/* By default, we are in round-to-even mode. Note we do not permit setting
* round mode separately by bitsize, but this is OK for current APIs. The
* Vulkan driver sets roundingModeIndependence = NONE.
*
* TODO: Optimize this, there is a command buffer bit for it.
*/
if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) ||
((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) ||
((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) {
global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT);
}
uint32_t cr0 = 0;
jay_function *entrypoint = jay_shader_get_entrypoint(shader);
set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global);
/* Now handle per-instruction deltas to the global mode */
jay_foreach_function(shader, func) {
jay_foreach_block(func, block) {
uint32_t current = cr0;
jay_foreach_inst_in_block(block, I) {
uint32_t required = cr0;
enum jay_rounding_mode round =
(I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND;
if (round != JAY_ROUND) {
required &= ~BRW_CR0_RND_MODE_MASK;
required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT);
}
if (jay_type_is_any_float(I->type)) {
set_cr0(func, jay_before_inst(I), &current, required);
}
}
/* Restore to global state on block boundaries */
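/* Successor blocks can then assume cr0 holds the shader-wide default,
* regardless of which predecessor ran last.
*/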
if (jay_num_successors(block) > 0) {
set_cr0(func, jay_after_block(block), &current, cr0);
}
}
}
}

jay_nir.c

@@ -0,0 +1,462 @@
/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/brw/brw_eu.h"
#include "compiler/brw/brw_eu_defines.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/brw/brw_private.h"
#include "compiler/intel_nir.h"
#include "jay_private.h"
#include "nir.h"
#include "nir_builder.h"
/*
* NIR-to-Jay translation relies on a careful indexing of defs: every 32-bit word has
* its own index. Vectors/64-bit use contiguous indices. We therefore run a
* modified version of nir_index_ssa_defs right before translating NIR->Jay.
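* For example, a 32-bit vec4 takes four consecutive indices, a 64-bit scalar
* takes two, and an 8- or 16-bit scalar still takes one, since sub-32-bit
* sizes round up to a full word in index_ssa_def_cb below.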
*/
static bool
index_ssa_def_cb(nir_def *def, void *state)
{
unsigned *index = (unsigned *) state;
def->index = *index;
*index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32);
return true;
}
static void
nj_index_ssa_defs(nir_shader *nir)
{
nir_foreach_function_impl(impl, nir) {
/* The zero index means null in Jay, so start SSA indices at 1 */
unsigned index = 1;
nir_foreach_block_unstructured(block, impl) {
nir_foreach_instr(instr, block)
nir_foreach_def(instr, index_ssa_def_cb, &index);
}
impl->ssa_alloc = index;
}
}
static bool
lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
{
if (intr->intrinsic != nir_intrinsic_load_helper_invocation)
return false;
/* TODO: Is this right for multisampling? */
b->cursor = nir_before_instr(&intr->instr);
nir_def *active =
nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b)));
nir_def_replace(&intr->def, active);
return true;
}
static bool
lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
if (intr->intrinsic != nir_intrinsic_load_frag_coord &&
intr->intrinsic != nir_intrinsic_load_pixel_coord)
return false;
b->cursor = nir_before_instr(&intr->instr);
nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b));
if (intr->intrinsic == nir_intrinsic_load_frag_coord) {
c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)),
nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b),
nir_frcp(b, nir_load_frag_coord_w_rcp(b)));
}
nir_def_replace(&intr->def, c);
return true;
}
static bool
jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
b->cursor = nir_after_instr(&intr->instr);
unsigned *simd_width = simd_;
/* mask & -mask isolates the lowest set bit in the mask. */
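/* e.g. 0b01100 & -0b01100 = 0b00100, so after inverse_ballot only the
* lowest-numbered live invocation sees true, which is exactly elect().
*/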
if (intr->intrinsic == nir_intrinsic_elect) {
nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b));
mask = nir_iand(b, mask, nir_ineg(b, mask));
nir_def_replace(&intr->def, nir_inverse_ballot(b, mask));
return true;
}
/* Ballots must match the SIMD size */
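/* e.g. a 64-bit ballot in a SIMD16 shader becomes a 16-bit def here and is
* then zero-extended back to 64 bits for its existing users.
*/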
if (intr->intrinsic == nir_intrinsic_ballot ||
intr->intrinsic == nir_intrinsic_ballot_relaxed) {
unsigned old_bitsize = intr->def.bit_size;
intr->def.bit_size = *simd_width;
nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize);
nir_def_rewrite_uses_after(&intr->def, u2uN);
return true;
}
/* Note: we don't treat read_invocation specially because there's little
* benefit and doing so would require expensive uniformizing in some cases.
*/
if (intr->intrinsic != nir_intrinsic_shuffle &&
intr->intrinsic != nir_intrinsic_read_invocation)
return false;
nir_def *data = intr->src[0].ssa;
assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized");
nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4);
nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B));
return true;
}
struct frag_out_ctx {
nir_def *colour[8], *depth, *stencil, *sample_mask;
};
static bool
collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_)
{
struct frag_out_ctx *ctx = ctx_;
if (intr->intrinsic != nir_intrinsic_store_output)
return false;
unsigned wrmask = nir_intrinsic_write_mask(intr);
assert(nir_intrinsic_component(intr) == 0 && "component should be lowered");
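/* wrmask + 1 being a power of two means the write mask is a contiguous run
* of components starting at x, e.g. 0x1, 0x3, 0x7 or 0xf.
*/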
assert(util_is_power_of_two_nonzero(wrmask + 1) &&
"complex writemasks should be lowered");
/* TODO: Optimize with write mask? */
gl_frag_result loc = nir_intrinsic_io_semantics(intr).location;
assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo");
nir_def **out;
if (loc == FRAG_RESULT_COLOR) {
out = &ctx->colour[0];
} else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
out = &ctx->colour[loc - FRAG_RESULT_DATA0];
} else if (loc == FRAG_RESULT_DEPTH) {
out = &ctx->depth;
} else if (loc == FRAG_RESULT_STENCIL) {
UNREACHABLE("todo");
out = &ctx->stencil;
} else if (loc == FRAG_RESULT_SAMPLE_MASK) {
UNREACHABLE("todo");
out = &ctx->sample_mask;
} else {
UNREACHABLE("invalid location");
}
assert((*out) == NULL && "each location written exactly once");
*out = intr->src[0].ssa;
nir_instr_remove(&intr->instr);
return true;
}
static void
append_payload(nir_builder *b,
nir_def **payload,
unsigned *len,
unsigned max_len,
nir_def *value)
{
if (value != NULL) {
for (unsigned i = 0; i < value->num_components; ++i) {
payload[*len] = nir_channel(b, value, i);
(*len)++;
assert((*len) <= max_len);
}
}
}
static void
insert_rt_store(nir_builder *b,
const struct intel_device_info *devinfo,
signed target,
bool last,
nir_def *colour,
nir_def *src0_alpha,
nir_def *depth,
nir_def *stencil,
nir_def *sample_mask,
unsigned dispatch_width)
{
bool null_rt = target < 0;
target = MAX2(target, 0);
if (!colour) {
colour = nir_undef(b, 4, 32);
}
colour = nir_pad_vec4(b, colour);
if (null_rt) {
/* Even if we don't write a RT, we still need to write alpha for
* alpha-to-coverage and alpha testing. Optimize the other channels out.
*/
colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32),
nir_channel(b, colour, 3), 3);
}
/* TODO: Not sure I like this. We'll see what 2src looks like. */
unsigned op = dispatch_width == 32 ?
XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
uint64_t desc =
brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */);
uint64_t ex_desc = 0;
if (devinfo->ver >= 20) {
ex_desc = target << 21 |
null_rt << 20 |
(src0_alpha ? (1 << 15) : 0) |
(stencil ? (1 << 14) : 0) |
(depth ? (1 << 13) : 0) |
(sample_mask ? (1 << 12) : 0);
} else if (devinfo->ver >= 11) {
/* Set the "Render Target Index" and "Src0 Alpha Present" fields
* in the extended message descriptor, in lieu of using a header.
*/
ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0);
}
/* Build the payload */
nir_def *payload[8] = { NULL };
unsigned len = 0;
append_payload(b, payload, &len, ARRAY_SIZE(payload), colour);
append_payload(b, payload, &len, ARRAY_SIZE(payload), depth);
/* TODO */
nir_def *disable = b->shader->info.fs.uses_discard ?
nir_is_helper_invocation(b, 1) :
nir_imm_false(b);
nir_store_render_target_intel(b, nir_vec(b, payload, len),
nir_imm_ivec2(b, desc, ex_desc), disable,
.eot = last);
}
static void
lower_fragment_outputs(nir_function_impl *impl,
const struct intel_device_info *devinfo,
unsigned nr_color_regions,
unsigned dispatch_width)
{
struct frag_out_ctx ctx = { { NULL } };
nir_function_intrinsics_pass(impl, collect_fragment_output,
nir_metadata_control_flow, &ctx);
nir_builder b_ = nir_builder_at(nir_after_impl(impl));
nir_builder *b = &b_;
assert(nr_color_regions <= ARRAY_SIZE(ctx.colour));
signed first = -1;
for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) {
if (ctx.colour[i]) {
first = i;
break;
}
}
/* Do the later render targets first */
for (unsigned i = first + 1; i < nr_color_regions; ++i) {
if (ctx.colour[i]) {
insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL,
NULL, dispatch_width);
}
}
/* Finally do render target zero, attaching all the sideband things and
* setting the LastRT bit. This needs to exist even if nothing is written
* since it also signals end-of-thread.
*/
insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true,
first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth,
ctx.stencil, ctx.sample_mask, dispatch_width);
}
unsigned
jay_process_nir(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key)
{
enum mesa_shader_stage stage = nir->info.stage;
struct brw_compiler compiler = { .devinfo = devinfo };
unsigned nr_packed_regs = 0;
brw_pass_tracker pt_ = {
.nir = nir,
.key = &key->base,
.dispatch_width = 0,
.compiler = &compiler,
.archiver = NULL, //params->base.archiver,
}, *pt = &pt_;
BRW_NIR_SNAPSHOT("first");
prog_data->base.ray_queries = nir->info.ray_queries;
prog_data->base.stage = stage;
// TODO: Make the driver do this?
// prog_data->base.source_hash = params->source_hash;
prog_data->base.total_shared = nir->info.shared_size;
/* TODO: Real heuristic */
bool do_simd32 = INTEL_SIMD(FS, 32);
do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT;
unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16;
if (stage == MESA_SHADER_VERTEX) {
/* We only expect slot compaction to be disabled when using device
* generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS
* programming. This should always be enabled together with VF component
* packing to minimize the size of the payload.
*/
assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing);
/* When using Primitive Replication for multiview, each view gets its own
* position slot.
*/
const uint32_t pos_slots =
(nir->info.per_view_outputs & VARYING_BIT_POS) ?
MAX2(1, util_bitcount(key->base.view_mask)) :
1;
/* Only position is allowed to be per-view */
assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS));
brw_compute_vue_map(devinfo, &prog_data->vue.vue_map,
nir->info.outputs_written, key->base.vue_layout,
pos_slots);
brw_nir_apply_key(pt, &key->base, simd_width);
prog_data->vs.inputs_read = nir->info.inputs_read;
prog_data->vs.double_inputs_read = nir->info.vs.double_inputs;
prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction;
brw_nir_lower_vs_inputs(nir);
brw_nir_lower_vue_outputs(nir);
BRW_NIR_SNAPSHOT("after_lower_io");
memset(prog_data->vs.vf_component_packing, 0,
sizeof(prog_data->vs.vf_component_packing));
if (key->vs.vf_component_packing) {
nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs);
}
/* Get constant offsets out of the way for proper clip/cull handling */
BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
BRW_NIR_PASS(nir_opt_constant_folding);
BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo,
&prog_data->vue.vue_map, 0, 0);
} else if (stage == MESA_SHADER_FRAGMENT) {
assert(key->fs.mesh_input == INTEL_NEVER && "todo");
assert(!key->fs.force_dual_color_blend && "todo");
brw_nir_apply_key(pt, &key->base, 32);
brw_nir_lower_fs_inputs(nir, devinfo, &key->fs);
brw_nir_lower_fs_outputs(nir);
NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
if (!brw_can_coherent_fb_fetch(devinfo))
NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs);
NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord,
nir_metadata_control_flow, NULL);
NIR_PASS(_, nir, nir_opt_barycentric, true);
lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo,
key->fs.nr_color_regions, simd_width);
NIR_PASS(_, nir, nir_lower_helper_writes, true);
NIR_PASS(_, nir, nir_lower_is_helper_invocation);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation,
nir_metadata_control_flow, NULL);
if (key->fs.alpha_to_coverage != INTEL_NEVER) {
/* Run constant fold optimization in order to get the correct source
* offset to determine render target 0 store instruction in
* emit_alpha_to_coverage pass.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage);
}
// TODO
// NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
if (!brw_fs_prog_key_is_dynamic(&key->fs)) {
uint32_t f = 0;
if (key->fs.multisample_fbo == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO;
if (key->fs.alpha_to_coverage == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE;
if (key->fs.provoking_vertex_last == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST;
if (key->fs.persample_interp == INTEL_ALWAYS) {
f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH |
INTEL_FS_CONFIG_PERSAMPLE_INTERP;
}
NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel,
f);
}
} else {
brw_nir_apply_key(pt, &key->base, simd_width);
}
brw_postprocess_nir_opts(pt);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd,
nir_metadata_control_flow, &simd_width);
NIR_PASS(_, nir, nir_opt_algebraic_late);
NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16);
/* Late postprocess while remaining in SSA */
/* Run fsign lowering again after the last time brw_nir_optimize is called.
* As is the case with conversion lowering (below), brw_nir_optimize can
* create additional fsign instructions.
*/
NIR_PASS(_, nir, jay_nir_lower_fsign);
NIR_PASS(_, nir, jay_nir_lower_bool);
NIR_PASS(_, nir, nir_opt_cse);
NIR_PASS(_, nir, nir_opt_dce);
NIR_PASS(_, nir, jay_nir_opt_sel_zero);
/* Run nir_split_conversions only after the last time
* brw_nir_optimize is called. Various optimizations invoked there can
* rematerialize the conversions that the lowering pass eliminates.
*/
const nir_split_conversions_options split_conv_opts = {
.callback = intel_nir_split_conversions_cb,
};
NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts);
/* Do this only after the last opt_gcm. GCM will undo this lowering. */
if (stage == MESA_SHADER_FRAGMENT) {
NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample);
}
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
NIR_PASS(_, nir, nir_opt_copy_prop);
NIR_PASS(_, nir, nir_opt_dce);
/* Run divergence analysis at the end */
nir_sweep(nir);
nj_index_ssa_defs(nir);
nir_divergence_analysis(nir);
jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
return simd_width;
}

jay_private.h

@@ -22,6 +22,16 @@ bool jay_nir_lower_bool(nir_shader *nir);
bool jay_nir_opt_sel_zero(nir_shader *nir);
bool jay_nir_lower_fsign(nir_shader *nir);
void jay_populate_prog_data(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key,
unsigned nr_packed_regs);
unsigned jay_process_nir(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key);
void jay_compute_liveness(jay_function *f);
void jay_calculate_register_demands(jay_function *f);
@@ -63,6 +73,7 @@ void jay_lower_post_ra(jay_shader *s);
void jay_lower_spill(jay_function *func);
void jay_lower_simd_width(jay_shader *s);
void jay_lower_scoreboard(jay_shader *s);
void jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes);
struct jay_shader_bin *
jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size);

jay_prog_data.c

@@ -0,0 +1,581 @@
/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/brw/brw_compiler.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/intel_nir.h"
#include "jay_private.h"
#include "nir.h"
static inline enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_fs_prog_key *key,
nir_intrinsic_instr *intr)
{
const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr);
/* Barycentric modes don't make sense for flat inputs. */
assert(mode != INTERP_MODE_FLAT);
unsigned bary;
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_at_offset:
/* When per sample interpolation is dynamic, assume sample interpolation.
* We'll dynamically remap things so that the FS payload is not affected.
*/
bary = key->persample_interp == INTEL_SOMETIMES ?
INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
break;
case nir_intrinsic_load_barycentric_centroid:
bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
break;
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_load_barycentric_at_sample:
bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
break;
default:
UNREACHABLE("invalid intrinsic");
}
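/* The nonperspective barycentric modes follow their perspective counterparts
* in enum intel_barycentric_mode, three entries apart, which is what makes
* the fixed offset below work.
*/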
if (mode == INTERP_MODE_NOPERSPECTIVE)
bary += 3;
return (enum intel_barycentric_mode) bary;
}
struct fs_info_ctx {
const struct brw_fs_prog_key *key;
struct brw_fs_prog_data *prog_data;
const struct intel_device_info *devinfo;
};
static bool
gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct fs_info_ctx *ctx = data;
struct brw_fs_prog_data *prog_data = ctx->prog_data;
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
prog_data->barycentric_interp_modes |=
1 << brw_barycentric_mode(ctx->key, intr);
break;
case nir_intrinsic_load_barycentric_at_sample:
case nir_intrinsic_load_barycentric_at_offset: {
unsigned mode = brw_barycentric_mode(ctx->key, intr);
prog_data->barycentric_interp_modes |= 1 << mode;
prog_data->uses_sample_offsets |=
mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE ||
mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE;
if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS)
prog_data->uses_npc_bary_coefficients = true;
else
prog_data->uses_pc_bary_coefficients = true;
break;
}
case nir_intrinsic_load_frag_coord_z:
prog_data->uses_src_depth = true;
break;
case nir_intrinsic_load_frag_coord_w_rcp:
prog_data->uses_src_w = true;
break;
case nir_intrinsic_load_sample_mask_in:
/* TODO: Sample masks are broken and discards are broken and simd32
* layouts are broken too. XXX.
*/
// prog_data->uses_sample_mask = true;
break;
case nir_intrinsic_load_pixel_coord_intel:
BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
break;
default:
break;
}
return false;
}
static void
brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data,
const nir_shader *shader)
{
prog_data->flat_inputs = 0;
nir_foreach_shader_in_variable(var, shader) {
if (var->data.interpolation != INTERP_MODE_FLAT ||
var->data.per_primitive)
continue;
unsigned slots = glsl_count_attribute_slots(var->type, false);
for (unsigned s = 0; s < slots; s++) {
int input_index = prog_data->urb_setup[var->data.location + s];
if (input_index >= 0)
prog_data->flat_inputs |= 1 << input_index;
}
}
}
static uint8_t
computed_depth_mode(const nir_shader *shader)
{
if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
switch (shader->info.fs.depth_layout) {
case FRAG_DEPTH_LAYOUT_NONE:
case FRAG_DEPTH_LAYOUT_ANY:
return BRW_PSCDEPTH_ON;
case FRAG_DEPTH_LAYOUT_GREATER:
return BRW_PSCDEPTH_ON_GE;
case FRAG_DEPTH_LAYOUT_LESS:
return BRW_PSCDEPTH_ON_LE;
case FRAG_DEPTH_LAYOUT_UNCHANGED:
/* We initially set this to OFF, but having the shader write the
* depth means we allocate register space in the SEND message. The
* difference between the SEND register count and the OFF state
* programming makes the HW hang.
*
* Removing the depth writes also leads to test failures. So use
* LesserThanOrEqual, which fits writing the same value
* (unchanged/equal).
*
*/
return BRW_PSCDEPTH_ON_LE;
}
}
return BRW_PSCDEPTH_OFF;
}
/*
* Build up an array of indices into the urb_setup array that
* references the active entries of the urb_setup array.
* Used to accelerate walking the active entries of the urb_setup array
* on each upload.
*/
static void
brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data)
{
/* TODO(mesh): Review usage of this in the context of Mesh, we may want to
* skip per-primitive attributes here.
*/
/* Make sure uint8_t is sufficient */
static_assert(VARYING_SLOT_MAX <= 0xff);
uint8_t index = 0;
for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
if (fs_prog_data->urb_setup[attr] >= 0) {
fs_prog_data->urb_setup_attribs[index++] = attr;
}
}
fs_prog_data->urb_setup_attribs_count = index;
}
static void
calculate_urb_setup(const struct intel_device_info *devinfo,
const struct brw_fs_prog_key *key,
struct brw_fs_prog_data *prog_data,
nir_shader *nir,
const struct brw_mue_map *mue_map,
int *per_primitive_offsets)
{
memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
int urb_next = 0; /* in vec4s */
/* Figure out where the PrimitiveID lives, either in the per-vertex block
* or in the per-primitive block or both.
*/
const uint64_t per_vert_primitive_id =
key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID;
const uint64_t per_prim_primitive_id =
key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID;
const uint64_t inputs_read =
nir->info.inputs_read &
(~nir->info.per_primitive_inputs | per_vert_primitive_id);
const uint64_t per_primitive_header_bits =
VARYING_BIT_PRIMITIVE_SHADING_RATE |
VARYING_BIT_LAYER |
VARYING_BIT_VIEWPORT |
VARYING_BIT_CULL_PRIMITIVE;
const uint64_t per_primitive_inputs =
nir->info.inputs_read &
(nir->info.per_primitive_inputs | per_prim_primitive_id) &
~per_primitive_header_bits;
struct intel_vue_map vue_map;
uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX;
if (mue_map != NULL) {
memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map));
memcpy(per_primitive_offsets, mue_map->per_primitive_offsets,
sizeof(mue_map->per_primitive_offsets));
if (!mue_map->wa_18019110168_active) {
u_foreach_bit64(location, per_primitive_inputs) {
assert(per_primitive_offsets[location] != -1);
first_read_offset =
MIN2(first_read_offset,
(uint32_t) per_primitive_offsets[location]);
per_primitive_stride =
MAX2((uint32_t) per_primitive_offsets[location] + 16,
per_primitive_stride);
}
} else {
first_read_offset = per_primitive_stride = 0;
}
} else {
brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout,
1 /* pos_slots, TODO */);
brw_compute_per_primitive_map(per_primitive_offsets,
&per_primitive_stride, &first_read_offset,
0, nir, nir_var_shader_in,
per_primitive_inputs,
true /* separate_shader */);
}
if (per_primitive_stride > first_read_offset) {
first_read_offset = ROUND_DOWN_TO(first_read_offset, 32);
/* Remove the first few unused registers */
for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) {
if (per_primitive_offsets[i] == -1)
continue;
per_primitive_offsets[i] -= first_read_offset;
}
prog_data->num_per_primitive_inputs =
2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32);
} else {
prog_data->num_per_primitive_inputs = 0;
}
/* Now do the per-vertex stuff (what used to be legacy pipeline) */
/* If Mesh is involved, we cannot do any packing. Documentation doesn't say
* anything about this but 3DSTATE_SBE_SWIZ does not appear to work when
* using Mesh.
*/
if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) {
/* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do
* arbitrary rearrangement of the first 16 varying inputs, so we can put
* them wherever we want. Just put them in order.
*
* This is useful because it means that (a) inputs not used by the
* fragment shader won't take up valuable register space, and (b) we
* won't have to recompile the fragment shader if it gets paired with a
* different vertex (or geometry) shader.
*/
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
if (inputs_read & BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
}
} else {
/* We have enough input varyings that the SF/SBE pipeline stage can't
* arbitrarily rearrange them to suit our whim; we have to put them in
* an order that matches the output of the previous pipeline stage
* (geometry or vertex shader).
*/
int first_slot = 0;
for (int i = 0; i < vue_map.num_slots; i++) {
int varying = vue_map.slot_to_varying[i];
if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) {
first_slot = ROUND_DOWN_TO(i, 2);
break;
}
}
for (int slot = first_slot; slot < vue_map.num_slots; slot++) {
int varying = vue_map.slot_to_varying[slot];
if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) {
prog_data->urb_setup[varying] = slot - first_slot;
}
}
urb_next = vue_map.num_slots - first_slot;
}
prog_data->num_varying_inputs = urb_next;
prog_data->inputs = inputs_read;
prog_data->per_primitive_inputs = per_primitive_inputs;
brw_compute_urb_setup_index(prog_data);
}
static void
populate_fs_prog_data(nir_shader *shader,
const struct intel_device_info *devinfo,
const struct brw_fs_prog_key *key,
struct brw_fs_prog_data *prog_data,
const struct brw_mue_map *mue_map,
int *per_primitive_offsets)
{
struct fs_info_ctx ctx = {
.key = key,
.prog_data = prog_data,
.devinfo = devinfo,
};
nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx);
prog_data->uses_kill = shader->info.fs.uses_discard;
prog_data->uses_omask =
!key->ignore_sample_mask_out &&
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
prog_data->max_polygons = 1;
prog_data->computed_depth_mode = computed_depth_mode(shader);
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
prog_data->sample_shading = shader->info.fs.uses_sample_shading;
prog_data->api_sample_shading = key->api_sample_shading;
prog_data->min_sample_shading = key->min_sample_shading;
assert(key->multisample_fbo != INTEL_NEVER ||
key->persample_interp == INTEL_NEVER);
prog_data->persample_dispatch = key->persample_interp;
if (prog_data->sample_shading)
prog_data->persample_dispatch = INTEL_ALWAYS;
/* We can only persample dispatch if we have a multisample FBO */
prog_data->persample_dispatch =
MIN2(prog_data->persample_dispatch, key->multisample_fbo);
/* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
* persample_dispatch & multisample_fbo are not dynamic, Anv should be able
* to definitively tell whether alpha_to_coverage is on or off.
*/
prog_data->alpha_to_coverage = key->alpha_to_coverage;
assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER);
prog_data->mesh_input = key->mesh_input;
assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER);
prog_data->provoking_vertex_last = key->provoking_vertex_last;
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
*
* "MSDISPMODE_PERSAMPLE is required in order to select
* POSOFFSET_SAMPLE"
*
* So we can only really get sample positions if we are doing real
* per-sample dispatch. If we need gl_SamplePosition and we don't have
* persample dispatch, we hard-code it to 0.5.
*/
prog_data->uses_pos_offset =
prog_data->persample_dispatch != INTEL_NEVER &&
(BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
BITSET_TEST(shader->info.system_values_read,
SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
prog_data->inner_coverage = shader->info.fs.inner_coverage;
/* From the BDW PRM documentation for 3DSTATE_WM:
*
* "MSDISPMODE_PERSAMPLE is required in order to select Perspective
* Sample or Non- perspective Sample barycentric coordinates."
*
* So cleanup any potentially set sample barycentric mode when not in per
* sample dispatch.
*/
if (prog_data->persample_dispatch == INTEL_NEVER) {
prog_data->barycentric_interp_modes &=
~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
}
if (devinfo->ver >= 20) {
prog_data->vertex_attributes_bypass =
brw_needs_vertex_attributes_bypass(shader);
}
prog_data->uses_nonperspective_interp_modes =
(prog_data->barycentric_interp_modes &
INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
prog_data->uses_npc_bary_coefficients;
/* The current VK_EXT_graphics_pipeline_library specification requires
* coarse to be specified at compile time. But per-sample interpolation can be
* dynamic. So we should never be in a situation where coarse &
* persample_interp are both respectively true & INTEL_ALWAYS.
*
* Coarse will be dynamically turned off when persample_interp is active.
*/
assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);
prog_data->coarse_pixel_dispatch =
intel_sometimes_invert(prog_data->persample_dispatch);
if (!key->coarse_pixel ||
/* DG2 should support this, but Wa_22012766191 says there are issues
* with CPS 1x1 + MSAA + FS writing to oMask.
*/
(devinfo->verx10 < 200 &&
(prog_data->uses_omask || prog_data->uses_sample_mask)) ||
prog_data->sample_shading ||
(prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
prog_data->computed_stencil ||
devinfo->ver < 11) {
prog_data->coarse_pixel_dispatch = INTEL_NEVER;
}
/* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
* Message Descriptor :
*
* "Message Type. Specifies the type of message being sent when
* pixel-rate evaluation is requested :
*
* Format = U2
* 0: Per Message Offset (eval_snapped with immediate offset)
* 1: Sample Position Offset (eval_sindex)
* 2: Centroid Position Offset (eval_centroid)
* 3: Per Slot Offset (eval_snapped with register offset)
*
* Message Type. Specifies the type of message being sent when
* coarse-rate evaluation is requested :
*
* Format = U2
* 0: Coarse to Pixel Mapping Message (internal message)
* 1: Reserved
* 2: Coarse Centroid Position (eval_centroid)
* 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
*
* The Sample Position Offset is marked as reserved for coarse rate
* evaluation and leads to hangs if we try to use it. So disable coarse
* pixel shading if we have any intrinsic that will result in a pixel
* interpolater message at sample.
*/
if (intel_nir_pulls_at_sample(shader))
prog_data->coarse_pixel_dispatch = INTEL_NEVER;
/* We choose to always enable VMask prior to XeHP, as it would cause
* us to lose out on the eliminate_find_live_channel() optimization.
*/
prog_data->uses_vmask =
devinfo->verx10 < 125 ||
shader->info.fs.needs_coarse_quad_helper_invocations ||
shader->info.uses_wide_subgroup_intrinsics ||
prog_data->coarse_pixel_dispatch != INTEL_NEVER;
prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients;
if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth;
prog_data->uses_src_depth = false;
}
calculate_urb_setup(devinfo, key, prog_data, shader, mue_map,
per_primitive_offsets);
brw_compute_flat_inputs(prog_data, shader);
prog_data->has_side_effects = shader->info.writes_memory;
}
static void
populate_vs_prog_data(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct brw_vs_prog_key *key,
struct brw_vs_prog_data *prog_data,
unsigned nr_packed_regs)
{
unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
BITSET_WORD *sysvals = nir->info.system_values_read;
/* gl_VertexID and gl_InstanceID are system values, but arrive via an
* incoming vertex attribute. So, add an extra slot.
*/
if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) ||
BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) ||
BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) {
nr_attribute_slots++;
}
/* gl_DrawID and IsIndexedDraw share their very own vec4 */
if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) ||
BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
nr_attribute_slots++;
}
const struct {
bool *data;
gl_system_value val;
} bool_sysvals[] = {
{ &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW },
{ &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX },
{ &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE },
{ &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE },
{ &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID },
{ &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID },
};
for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) {
*bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val);
}
unsigned nr_attribute_regs;
if (key->vf_component_packing) {
prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8);
nr_attribute_regs = nr_packed_regs;
} else {
prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
nr_attribute_regs = 4 * nr_attribute_slots;
}
/* Since vertex shaders reuse the same VUE entry for inputs and outputs
* (overwriting the original contents), we need to make sure the size is
* the larger of the two.
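* For example, 6 unpacked attribute slots give 24 attribute regs (6 vec4s);
* with a 10-slot VUE map, vue_entries = MAX2(6, 10) = 10 and the entry size
* rounds up to DIV_ROUND_UP(10, 4) = 3.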
*/
const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4),
prog_data->base.vue_map.num_slots);
prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
}
void
jay_populate_prog_data(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key,
unsigned nr_packed_regs)
{
if (nir->info.stage == MESA_SHADER_VERTEX) {
populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs,
nr_packed_regs);
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
int per_primitive_offsets[VARYING_SLOT_MAX];
memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets));
populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs,
NULL /* TODO: mue_map */, per_primitive_offsets);
} else if (mesa_shader_stage_is_compute(nir->info.stage)) {
prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr;
prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr;
}
if (nir->info.stage == MESA_SHADER_VERTEX ||
nir->info.stage == MESA_SHADER_TESS_EVAL ||
nir->info.stage == MESA_SHADER_GEOMETRY ||
nir->info.stage == MESA_SHADER_MESH) {
uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size);
uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size,
nir->info.cull_distance_array_size);
if (nir->info.stage == MESA_SHADER_MESH) {
prog_data->mesh.clip_distance_mask = clip_mask;
prog_data->mesh.cull_distance_mask = cull_mask;
} else {
prog_data->vue.clip_distance_mask = clip_mask;
prog_data->vue.cull_distance_mask = cull_mask;
}
}
}

meson.build

@@ -50,16 +50,19 @@ libintel_compiler_jay_files = files(
'jay_assign_flags.c',
'jay_from_nir.c',
'jay_ir.h',
'jay_insert_fp_mode.c',
'jay_liveness.c',
'jay_lower_post_ra.c',
'jay_lower_pre_ra.c',
'jay_lower_scoreboard.c',
'jay_lower_spill.c',
'jay_nir.c',
'jay_opt_dead_code.c',
'jay_opt_control_flow.c',
'jay_opt_propagate.c',
'jay_print.c',
'jay_private.h',
'jay_prog_data.c',
'jay_repair_ssa.c',
'jay_register_allocate.c',
'jay_simd_width.c',