mirror of https://gitlab.freedesktop.org/mesa/mesa.git
jay: split up jay_from_nir.c

Big monolithic file, split it up into the relevant pieces.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40960>

parent 6925d9ee23
commit 4eb838eb48

6 changed files with 1149 additions and 1094 deletions
File diff suppressed because it is too large
85 src/intel/compiler/jay/jay_insert_fp_mode.c Normal file
@@ -0,0 +1,85 @@
/*
 * Copyright 2026 Intel Corporation
 * SPDX-License-Identifier: MIT
 */
#include "jay_builder.h"
#include "jay_ir.h"

static void
set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired)
{
   /* Only touch cr0 if we are changing bits */
   if ((*cr0) != desired) {
      jay_builder b = jay_init_builder(f, cursor);
      jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired);
      *cr0 = desired;
   }
}
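
/* Illustration, not part of the commit: XORing cr0 with (*cr0 ^ desired)
 * flips exactly the bits that differ, so the register ends up holding
 * `desired`:
 *
 *    cr0 ^ (cr0 ^ desired) == desired
 *
 * e.g. cr0 = 0x0010, desired = 0x0030: the toggle mask is 0x0020, and
 * 0x0010 ^ 0x0020 == 0x0030.
 */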

void
jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes)
{
   /* First, work out the global float control mode for the shader */
   uint32_t global = 0x0;

   /* Initially fp16 denorms are flushed-to-zero, handle preserve. */
   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) {
      global |= BRW_CR0_FP16_DENORM_PRESERVE;
   }

   /* Initially fp32 denorms are flushed-to-zero, handle preserve.
    *
    * TODO: Optimize this, we have a dispatch bit.
    */
   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) {
      global |= BRW_CR0_FP32_DENORM_PRESERVE;
   }

   /* Initially fp64 denorms are flushed-to-zero, handle preserve. */
   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) {
      global |= BRW_CR0_FP64_DENORM_PRESERVE;
   }

   /* By default, we are in round-to-even mode. Note we do not permit setting
    * round mode separately by bitsize but this is ok for current APIs. The
    * Vulkan driver sets roundingModeIndependence = NONE.
    *
    * TODO: Optimize this, there is a command buffer bit for it.
    */
   if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) ||
       ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) ||
       ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) {
      global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT);
   }

   uint32_t cr0 = 0;
   jay_function *entrypoint = jay_shader_get_entrypoint(shader);
   set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global);

   /* Now handle per-instruction deltas to the global mode */
   jay_foreach_function(shader, func) {
      jay_foreach_block(func, block) {
         uint32_t current = cr0;

         jay_foreach_inst_in_block(block, I) {
            uint32_t required = cr0;
            enum jay_rounding_mode round =
               (I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND;

            if (round != JAY_ROUND) {
               required &= ~BRW_CR0_RND_MODE_MASK;
               required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT);
            }

            if (jay_type_is_any_float(I->type)) {
               set_cr0(func, jay_before_inst(I), &current, required);
            }
         }

         /* Restore to global state on block boundaries */
         if (jay_num_successors(block) > 0) {
            set_cr0(func, jay_after_block(block), &current, cr0);
         }
      }
   }
}
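
For orientation, a minimal sketch of how a driver might invoke this pass, assuming the FLOAT_CONTROLS_* flags mirror the SPIR-V float-controls execution modes used above (the shader pointer and the exact flag combination are hypothetical):

   /* Preserve fp16 denorms and select round-toward-zero, in a shader that
    * uses 16- and 32-bit floats. */
   uint32_t api = FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32;
   jay_insert_fp_mode(shader, api, 16 | 32);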
462 src/intel/compiler/jay/jay_nir.c Normal file
@@ -0,0 +1,462 @@
/*
 * Copyright 2026 Intel Corporation
 * SPDX-License-Identifier: MIT
 */
#include "compiler/brw/brw_eu.h"
#include "compiler/brw/brw_eu_defines.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/brw/brw_private.h"
#include "compiler/intel_nir.h"
#include "jay_private.h"
#include "nir.h"
#include "nir_builder.h"

/*
 * NIR-to-Jay relies on a careful indexing of defs: every 32-bit word has
 * its own index. Vectors/64-bit use contiguous indices. We therefore run a
 * modified version of nir_index_ssa_defs right before translating NIR->Jay.
 */
static bool
index_ssa_def_cb(nir_def *def, void *state)
{
   unsigned *index = (unsigned *) state;
   def->index = *index;
   *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32);
   return true;
}

static void
nj_index_ssa_defs(nir_shader *nir)
{
   nir_foreach_function_impl(impl, nir) {
      /* The zero index means null in Jay, so start SSA indices at 1 */
      unsigned index = 1;

      nir_foreach_block_unstructured(block, impl) {
         nir_foreach_instr(instr, block)
            nir_foreach_def(instr, index_ssa_def_cb, &index);
      }

      impl->ssa_alloc = index;
   }
}
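
/* Illustration, not part of the commit: the increment above gives each
 * 32-bit word its own index, so
 *
 *    32-bit scalar: DIV_ROUND_UP(1 * MAX2(32, 32), 32) == 1 index
 *    16-bit scalar: DIV_ROUND_UP(1 * MAX2(16, 32), 32) == 1 (a full word)
 *    64-bit scalar: DIV_ROUND_UP(1 * MAX2(64, 32), 32) == 2
 *    32-bit vec4:   DIV_ROUND_UP(4 * MAX2(32, 32), 32) == 4 contiguous indices
 */
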
static bool
lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
{
   if (intr->intrinsic != nir_intrinsic_load_helper_invocation)
      return false;

   /* TODO: Is this right for multisampling? */
   b->cursor = nir_before_instr(&intr->instr);
   nir_def *active =
      nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b)));

   nir_def_replace(&intr->def, active);
   return true;
}

static bool
lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
   if (intr->intrinsic != nir_intrinsic_load_frag_coord &&
       intr->intrinsic != nir_intrinsic_load_pixel_coord)
      return false;

   b->cursor = nir_before_instr(&intr->instr);
   nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b));

   if (intr->intrinsic == nir_intrinsic_load_frag_coord) {
      c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)),
                   nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b),
                   nir_frcp(b, nir_load_frag_coord_w_rcp(b)));
   }

   nir_def_replace(&intr->def, c);
   return true;
}

static bool
jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
   b->cursor = nir_after_instr(&intr->instr);
   unsigned *simd_width = simd_;

   /* mask & -mask isolates the lowest set bit in the mask. */
   if (intr->intrinsic == nir_intrinsic_elect) {
      nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b));
      mask = nir_iand(b, mask, nir_ineg(b, mask));
      nir_def_replace(&intr->def, nir_inverse_ballot(b, mask));
      return true;
   }
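
   /* Illustration, not part of the commit: for mask = 0b101100, the two's
    * complement negation is ...010100, so mask & -mask == 0b000100; only the
    * lowest set bit survives, i.e. the first active invocation is elected.
    */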

   /* Ballots must match the SIMD size */
   if (intr->intrinsic == nir_intrinsic_ballot ||
       intr->intrinsic == nir_intrinsic_ballot_relaxed) {
      unsigned old_bitsize = intr->def.bit_size;
      intr->def.bit_size = *simd_width;
      nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize);
      nir_def_rewrite_uses_after(&intr->def, u2uN);
      return true;
   }

   /* Note: we don't treat read_invocation specially because there's little
    * benefit, and doing so would require expensive uniformizing in some
    * cases.
    */
   if (intr->intrinsic != nir_intrinsic_shuffle &&
       intr->intrinsic != nir_intrinsic_read_invocation)
      return false;

   nir_def *data = intr->src[0].ssa;
   assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized");

   nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4);
   nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B));
   return true;
}

struct frag_out_ctx {
   nir_def *colour[8], *depth, *stencil, *sample_mask;
};

static bool
collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_)
{
   struct frag_out_ctx *ctx = ctx_;
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   unsigned wrmask = nir_intrinsic_write_mask(intr);
   assert(nir_intrinsic_component(intr) == 0 && "component should be lowered");
   assert(util_is_power_of_two_nonzero(wrmask + 1) &&
          "complex writemasks should be lowered");

   /* TODO: Optimize with write mask? */

   gl_frag_result loc = nir_intrinsic_io_semantics(intr).location;
   assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo");
   nir_def **out;
   if (loc == FRAG_RESULT_COLOR) {
      out = &ctx->colour[0];
   } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
      out = &ctx->colour[loc - FRAG_RESULT_DATA0];
   } else if (loc == FRAG_RESULT_DEPTH) {
      out = &ctx->depth;
   } else if (loc == FRAG_RESULT_STENCIL) {
      UNREACHABLE("todo");
      out = &ctx->stencil;
   } else if (loc == FRAG_RESULT_SAMPLE_MASK) {
      UNREACHABLE("todo");
      out = &ctx->sample_mask;
   } else {
      UNREACHABLE("invalid location");
   }

   assert((*out) == NULL && "each location written exactly once");
   *out = intr->src[0].ssa;

   nir_instr_remove(&intr->instr);
   return true;
}

static void
append_payload(nir_builder *b,
               nir_def **payload,
               unsigned *len,
               unsigned max_len,
               nir_def *value)
{
   if (value != NULL) {
      for (unsigned i = 0; i < value->num_components; ++i) {
         payload[*len] = nir_channel(b, value, i);
         (*len)++;
         assert((*len) <= max_len);
      }
   }
}

static void
insert_rt_store(nir_builder *b,
                const struct intel_device_info *devinfo,
                signed target,
                bool last,
                nir_def *colour,
                nir_def *src0_alpha,
                nir_def *depth,
                nir_def *stencil,
                nir_def *sample_mask,
                unsigned dispatch_width)
{
   bool null_rt = target < 0;
   target = MAX2(target, 0);

   if (!colour) {
      colour = nir_undef(b, 4, 32);
   }

   colour = nir_pad_vec4(b, colour);

   if (null_rt) {
      /* Even if we don't write a RT, we still need to write alpha for
       * alpha-to-coverage and alpha testing. Optimize the other channels out.
       */
      colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32),
                                     nir_channel(b, colour, 3), 3);
   }

   /* TODO: Not sure I like this. We'll see what 2src looks like. */
   unsigned op = dispatch_width == 32 ?
      XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
      BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   uint64_t desc =
      brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */);

   uint64_t ex_desc = 0;
   if (devinfo->ver >= 20) {
      ex_desc = target << 21 |
                null_rt << 20 |
                (src0_alpha ? (1 << 15) : 0) |
                (stencil ? (1 << 14) : 0) |
                (depth ? (1 << 13) : 0) |
                (sample_mask ? (1 << 12) : 0);
   } else if (devinfo->ver >= 11) {
      /* Set the "Render Target Index" and "Src0 Alpha Present" fields
       * in the extended message descriptor, in lieu of using a header.
       */
      ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0);
   }
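
   /* Illustration, not part of the commit: on ver >= 20, writing render
    * target 1 with source depth but no stencil, oMask, or src0 alpha yields
    * ex_desc = (1 << 21) | (1 << 13).
    */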

   /* Build the payload */
   nir_def *payload[8] = { NULL };
   unsigned len = 0;
   append_payload(b, payload, &len, ARRAY_SIZE(payload), colour);
   append_payload(b, payload, &len, ARRAY_SIZE(payload), depth);
   /* TODO */

   nir_def *disable = b->shader->info.fs.uses_discard ?
      nir_is_helper_invocation(b, 1) :
      nir_imm_false(b);

   nir_store_render_target_intel(b, nir_vec(b, payload, len),
                                 nir_imm_ivec2(b, desc, ex_desc), disable,
                                 .eot = last);
}

static void
lower_fragment_outputs(nir_function_impl *impl,
                       const struct intel_device_info *devinfo,
                       unsigned nr_color_regions,
                       unsigned dispatch_width)
{
   struct frag_out_ctx ctx = { { NULL } };
   nir_function_intrinsics_pass(impl, collect_fragment_output,
                                nir_metadata_control_flow, &ctx);
   nir_builder b_ = nir_builder_at(nir_after_impl(impl));
   nir_builder *b = &b_;
   assert(nr_color_regions <= ARRAY_SIZE(ctx.colour));

   signed first = -1;
   for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) {
      if (ctx.colour[i]) {
         first = i;
         break;
      }
   }

   /* Do the later render targets first */
   for (unsigned i = first + 1; i < nr_color_regions; ++i) {
      if (ctx.colour[i]) {
         insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL,
                         NULL, dispatch_width);
      }
   }

   /* Finally do render target zero, attaching all the sideband things and
    * setting the LastRT bit. This needs to exist even if nothing is written
    * since it also signals end-of-thread.
    */
   insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true,
                   first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth,
                   ctx.stencil, ctx.sample_mask, dispatch_width);
}

unsigned
jay_process_nir(const struct intel_device_info *devinfo,
                nir_shader *nir,
                union brw_any_prog_data *prog_data,
                union brw_any_prog_key *key)
{
   enum mesa_shader_stage stage = nir->info.stage;
   struct brw_compiler compiler = { .devinfo = devinfo };
   unsigned nr_packed_regs = 0;

   brw_pass_tracker pt_ = {
      .nir = nir,
      .key = &key->base,
      .dispatch_width = 0,
      .compiler = &compiler,
      .archiver = NULL, //params->base.archiver,
   }, *pt = &pt_;

   BRW_NIR_SNAPSHOT("first");

   prog_data->base.ray_queries = nir->info.ray_queries;
   prog_data->base.stage = stage;
   // TODO: Make the driver do this?
   // prog_data->base.source_hash = params->source_hash;
   prog_data->base.total_shared = nir->info.shared_size;

   /* TODO: Real heuristic */
   bool do_simd32 = INTEL_SIMD(FS, 32);
   do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT;
   unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16;

   if (stage == MESA_SHADER_VERTEX) {
      /* We only expect slot compaction to be disabled when using device
       * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS
       * programming. This should always be enabled together with VF component
       * packing to minimize the size of the payload.
       */
      assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing);

      /* When using Primitive Replication for multiview, each view gets its own
       * position slot.
       */
      const uint32_t pos_slots =
         (nir->info.per_view_outputs & VARYING_BIT_POS) ?
         MAX2(1, util_bitcount(key->base.view_mask)) :
         1;

      /* Only position is allowed to be per-view */
      assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS));

      brw_compute_vue_map(devinfo, &prog_data->vue.vue_map,
                          nir->info.outputs_written, key->base.vue_layout,
                          pos_slots);

      brw_nir_apply_key(pt, &key->base, simd_width);

      prog_data->vs.inputs_read = nir->info.inputs_read;
      prog_data->vs.double_inputs_read = nir->info.vs.double_inputs;
      prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction;

      brw_nir_lower_vs_inputs(nir);
      brw_nir_lower_vue_outputs(nir);
      BRW_NIR_SNAPSHOT("after_lower_io");

      memset(prog_data->vs.vf_component_packing, 0,
             sizeof(prog_data->vs.vf_component_packing));
      if (key->vs.vf_component_packing) {
         nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs);
      }

      /* Get constant offsets out of the way for proper clip/cull handling */
      BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
      BRW_NIR_PASS(nir_opt_constant_folding);
      BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo,
                   &prog_data->vue.vue_map, 0, 0);
   } else if (stage == MESA_SHADER_FRAGMENT) {
      assert(key->fs.mesh_input == INTEL_NEVER && "todo");
      assert(!key->fs.force_dual_color_blend && "todo");
      brw_nir_apply_key(pt, &key->base, 32);
      brw_nir_lower_fs_inputs(nir, devinfo, &key->fs);
      brw_nir_lower_fs_outputs(nir);
      NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);

      if (!brw_can_coherent_fb_fetch(devinfo))
         NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs);

      NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord);
      NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord,
               nir_metadata_control_flow, NULL);
      NIR_PASS(_, nir, nir_opt_barycentric, true);

      lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo,
                             key->fs.nr_color_regions, simd_width);
      NIR_PASS(_, nir, nir_lower_helper_writes, true);
      NIR_PASS(_, nir, nir_lower_is_helper_invocation);
      NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation,
               nir_metadata_control_flow, NULL);

      if (key->fs.alpha_to_coverage != INTEL_NEVER) {
         /* Run constant fold optimization in order to get the correct source
          * offset to determine render target 0 store instruction in
          * emit_alpha_to_coverage pass.
          */
         NIR_PASS(_, nir, nir_opt_constant_folding);
         NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage);
      }

      // TODO
      // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);

      if (!brw_fs_prog_key_is_dynamic(&key->fs)) {
         uint32_t f = 0;

         if (key->fs.multisample_fbo == INTEL_ALWAYS)
            f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO;

         if (key->fs.alpha_to_coverage == INTEL_ALWAYS)
            f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE;

         if (key->fs.provoking_vertex_last == INTEL_ALWAYS)
            f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST;

         if (key->fs.persample_interp == INTEL_ALWAYS) {
            f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH |
                 INTEL_FS_CONFIG_PERSAMPLE_INTERP;
         }

         NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel,
                  f);
      }
   } else {
      brw_nir_apply_key(pt, &key->base, simd_width);
   }

   brw_postprocess_nir_opts(pt);

   NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd,
            nir_metadata_control_flow, &simd_width);
   NIR_PASS(_, nir, nir_opt_algebraic_late);
   NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16);

   /* Late postprocess while remaining in SSA */

   /* Run fsign lowering again after the last time brw_nir_optimize is called.
    * As is the case with conversion lowering (below), brw_nir_optimize can
    * create additional fsign instructions.
    */
   NIR_PASS(_, nir, jay_nir_lower_fsign);
   NIR_PASS(_, nir, jay_nir_lower_bool);
   NIR_PASS(_, nir, nir_opt_cse);
   NIR_PASS(_, nir, nir_opt_dce);
   NIR_PASS(_, nir, jay_nir_opt_sel_zero);

   /* Run nir_split_conversions only after the last time
    * brw_nir_optimize is called. Various optimizations invoked there can
    * rematerialize the conversions that the lowering pass eliminates.
    */
   const nir_split_conversions_options split_conv_opts = {
      .callback = intel_nir_split_conversions_cb,
   };
   NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts);

   /* Do this only after the last opt_gcm. GCM will undo this lowering. */
   if (stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample);
   }

   NIR_PASS(_, nir, nir_opt_constant_folding);
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
   NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
   NIR_PASS(_, nir, nir_opt_copy_prop);
   NIR_PASS(_, nir, nir_opt_dce);

   /* Run divergence analysis at the end */
   nir_sweep(nir);
   nj_index_ssa_defs(nir);
   nir_divergence_analysis(nir);

   jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
   return simd_width;
}

@@ -22,6 +22,16 @@ bool jay_nir_lower_bool(nir_shader *nir);
bool jay_nir_opt_sel_zero(nir_shader *nir);
bool jay_nir_lower_fsign(nir_shader *nir);

void jay_populate_prog_data(const struct intel_device_info *devinfo,
                            nir_shader *nir,
                            union brw_any_prog_data *prog_data,
                            union brw_any_prog_key *key,
                            unsigned nr_packed_regs);
unsigned jay_process_nir(const struct intel_device_info *devinfo,
                         nir_shader *nir,
                         union brw_any_prog_data *prog_data,
                         union brw_any_prog_key *key);

void jay_compute_liveness(jay_function *f);
void jay_calculate_register_demands(jay_function *f);

@@ -63,6 +73,7 @@ void jay_lower_post_ra(jay_shader *s);
void jay_lower_spill(jay_function *func);
void jay_lower_simd_width(jay_shader *s);
void jay_lower_scoreboard(jay_shader *s);
void jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes);

struct jay_shader_bin *
jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size);
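
For orientation, a minimal sketch of the driver-facing flow these declarations suggest, assuming devinfo and nir are already in scope and with all other setup elided (the jay_from_nir.c entry points are not shown in this diff):

   union brw_any_prog_data prog_data = {0};
   union brw_any_prog_key key = {0};

   /* Lowers and optimizes the NIR in place, fills out prog_data via
    * jay_populate_prog_data(), and returns the chosen SIMD width. */
   unsigned simd_width = jay_process_nir(devinfo, nir, &prog_data, &key);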
581 src/intel/compiler/jay/jay_prog_data.c Normal file
@@ -0,0 +1,581 @@
/*
 * Copyright 2026 Intel Corporation
 * SPDX-License-Identifier: MIT
 */
#include "compiler/brw/brw_compiler.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/intel_nir.h"
#include "jay_private.h"
#include "nir.h"

static inline enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_fs_prog_key *key,
                     nir_intrinsic_instr *intr)
{
   const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr);

   /* Barycentric modes don't make sense for flat inputs. */
   assert(mode != INTERP_MODE_FLAT);

   unsigned bary;
   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_at_offset:
      /* When per sample interpolation is dynamic, assume sample interpolation.
       * We'll dynamically remap things so that the FS payload is not affected.
       */
      bary = key->persample_interp == INTEL_SOMETIMES ?
         INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
         INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
      break;
   default:
      UNREACHABLE("invalid intrinsic");
   }

   if (mode == INTERP_MODE_NOPERSPECTIVE)
      bary += 3;

   return (enum intel_barycentric_mode) bary;
}
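
/* Note, illustration only: the "bary += 3" above assumes the
 * intel_barycentric_mode enum places each nonperspective mode exactly 3
 * after its perspective counterpart, e.g.
 * INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL + 3 ==
 * INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL.
 */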

struct fs_info_ctx {
   const struct brw_fs_prog_key *key;
   struct brw_fs_prog_data *prog_data;
   const struct intel_device_info *devinfo;
};

static bool
gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   struct fs_info_ctx *ctx = data;
   struct brw_fs_prog_data *prog_data = ctx->prog_data;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
      prog_data->barycentric_interp_modes |=
         1 << brw_barycentric_mode(ctx->key, intr);
      break;

   case nir_intrinsic_load_barycentric_at_sample:
   case nir_intrinsic_load_barycentric_at_offset: {
      unsigned mode = brw_barycentric_mode(ctx->key, intr);
      prog_data->barycentric_interp_modes |= 1 << mode;
      prog_data->uses_sample_offsets |=
         mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE ||
         mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE;

      if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS)
         prog_data->uses_npc_bary_coefficients = true;
      else
         prog_data->uses_pc_bary_coefficients = true;
      break;
   }

   case nir_intrinsic_load_frag_coord_z:
      prog_data->uses_src_depth = true;
      break;

   case nir_intrinsic_load_frag_coord_w_rcp:
      prog_data->uses_src_w = true;
      break;

   case nir_intrinsic_load_sample_mask_in:
      /* TODO: Sample masks are broken and discards are broken and simd32
       * layouts are broken too. XXX.
       */
      // prog_data->uses_sample_mask = true;
      break;

   case nir_intrinsic_load_pixel_coord_intel:
      BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
      break;

   default:
      break;
   }

   return false;
}

static void
brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data,
                        const nir_shader *shader)
{
   prog_data->flat_inputs = 0;

   nir_foreach_shader_in_variable(var, shader) {
      if (var->data.interpolation != INTERP_MODE_FLAT ||
          var->data.per_primitive)
         continue;

      unsigned slots = glsl_count_attribute_slots(var->type, false);
      for (unsigned s = 0; s < slots; s++) {
         int input_index = prog_data->urb_setup[var->data.location + s];

         if (input_index >= 0)
            prog_data->flat_inputs |= 1 << input_index;
      }
   }
}

static uint8_t
computed_depth_mode(const nir_shader *shader)
{
   if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      switch (shader->info.fs.depth_layout) {
      case FRAG_DEPTH_LAYOUT_NONE:
      case FRAG_DEPTH_LAYOUT_ANY:
         return BRW_PSCDEPTH_ON;
      case FRAG_DEPTH_LAYOUT_GREATER:
         return BRW_PSCDEPTH_ON_GE;
      case FRAG_DEPTH_LAYOUT_LESS:
         return BRW_PSCDEPTH_ON_LE;
      case FRAG_DEPTH_LAYOUT_UNCHANGED:
         /* We initially set this to OFF, but having the shader write the
          * depth means we allocate register space in the SEND message. The
          * difference between the SEND register count and the OFF state
          * programming makes the HW hang.
          *
          * Removing the depth writes also leads to test failures. So use
          * LesserThanOrEqual, which fits writing the same value
          * (unchanged/equal).
          */
         return BRW_PSCDEPTH_ON_LE;
      }
   }
   return BRW_PSCDEPTH_OFF;
}

/*
 * Build up an array of indices into the urb_setup array that
 * references the active entries of the urb_setup array.
 * Used to accelerate walking the active entries of the urb_setup array
 * on each upload.
 */
static void
brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data)
{
   /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
    * skip per-primitive attributes here.
    */

   /* Make sure uint8_t is sufficient */
   static_assert(VARYING_SLOT_MAX <= 0xff);
   uint8_t index = 0;
   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      if (fs_prog_data->urb_setup[attr] >= 0) {
         fs_prog_data->urb_setup_attribs[index++] = attr;
      }
   }
   fs_prog_data->urb_setup_attribs_count = index;
}

static void
calculate_urb_setup(const struct intel_device_info *devinfo,
                    const struct brw_fs_prog_key *key,
                    struct brw_fs_prog_data *prog_data,
                    nir_shader *nir,
                    const struct brw_mue_map *mue_map,
                    int *per_primitive_offsets)
{
   memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
   int urb_next = 0; /* in vec4s */

   /* Figure out where the PrimitiveID lives, either in the per-vertex block
    * or in the per-primitive block or both.
    */
   const uint64_t per_vert_primitive_id =
      key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID;
   const uint64_t per_prim_primitive_id =
      key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID;
   const uint64_t inputs_read =
      nir->info.inputs_read &
      (~nir->info.per_primitive_inputs | per_vert_primitive_id);
   const uint64_t per_primitive_header_bits =
      VARYING_BIT_PRIMITIVE_SHADING_RATE |
      VARYING_BIT_LAYER |
      VARYING_BIT_VIEWPORT |
      VARYING_BIT_CULL_PRIMITIVE;
   const uint64_t per_primitive_inputs =
      nir->info.inputs_read &
      (nir->info.per_primitive_inputs | per_prim_primitive_id) &
      ~per_primitive_header_bits;
   struct intel_vue_map vue_map;
   uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX;

   if (mue_map != NULL) {
      memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map));
      memcpy(per_primitive_offsets, mue_map->per_primitive_offsets,
             sizeof(mue_map->per_primitive_offsets));

      if (!mue_map->wa_18019110168_active) {
         u_foreach_bit64(location, per_primitive_inputs) {
            assert(per_primitive_offsets[location] != -1);

            first_read_offset =
               MIN2(first_read_offset,
                    (uint32_t) per_primitive_offsets[location]);
            per_primitive_stride =
               MAX2((uint32_t) per_primitive_offsets[location] + 16,
                    per_primitive_stride);
         }
      } else {
         first_read_offset = per_primitive_stride = 0;
      }
   } else {
      brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout,
                          1 /* pos_slots, TODO */);
      brw_compute_per_primitive_map(per_primitive_offsets,
                                    &per_primitive_stride, &first_read_offset,
                                    0, nir, nir_var_shader_in,
                                    per_primitive_inputs,
                                    true /* separate_shader */);
   }

   if (per_primitive_stride > first_read_offset) {
      first_read_offset = ROUND_DOWN_TO(first_read_offset, 32);

      /* Remove the first few unused registers */
      for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) {
         if (per_primitive_offsets[i] == -1)
            continue;
         per_primitive_offsets[i] -= first_read_offset;
      }

      prog_data->num_per_primitive_inputs =
         2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32);
   } else {
      prog_data->num_per_primitive_inputs = 0;
   }

   /* Now do the per-vertex stuff (what used to be the legacy pipeline) */

   /* If Mesh is involved, we cannot do any packing. Documentation doesn't say
    * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when
    * using Mesh.
    */
   if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) {
      /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do
       * arbitrary rearrangement of the first 16 varying inputs, so we can put
       * them wherever we want. Just put them in order.
       *
       * This is useful because it means that (a) inputs not used by the
       * fragment shader won't take up valuable register space, and (b) we
       * won't have to recompile the fragment shader if it gets paired with a
       * different vertex (or geometry) shader.
       */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         if (inputs_read & BITFIELD64_BIT(i)) {
            prog_data->urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* We have enough input varyings that the SF/SBE pipeline stage can't
       * arbitrarily rearrange them to suit our whim; we have to put them in
       * an order that matches the output of the previous pipeline stage
       * (geometry or vertex shader).
       */
      int first_slot = 0;
      for (int i = 0; i < vue_map.num_slots; i++) {
         int varying = vue_map.slot_to_varying[i];
         if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) {
            first_slot = ROUND_DOWN_TO(i, 2);
            break;
         }
      }

      for (int slot = first_slot; slot < vue_map.num_slots; slot++) {
         int varying = vue_map.slot_to_varying[slot];
         if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) {
            prog_data->urb_setup[varying] = slot - first_slot;
         }
      }
      urb_next = vue_map.num_slots - first_slot;
   }

   prog_data->num_varying_inputs = urb_next;
   prog_data->inputs = inputs_read;
   prog_data->per_primitive_inputs = per_primitive_inputs;

   brw_compute_urb_setup_index(prog_data);
}

static void
populate_fs_prog_data(nir_shader *shader,
                      const struct intel_device_info *devinfo,
                      const struct brw_fs_prog_key *key,
                      struct brw_fs_prog_data *prog_data,
                      const struct brw_mue_map *mue_map,
                      int *per_primitive_offsets)
{
   struct fs_info_ctx ctx = {
      .key = key,
      .prog_data = prog_data,
      .devinfo = devinfo,
   };
   nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx);

   prog_data->uses_kill = shader->info.fs.uses_discard;
   prog_data->uses_omask =
      !key->ignore_sample_mask_out &&
      (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
   prog_data->max_polygons = 1;
   prog_data->computed_depth_mode = computed_depth_mode(shader);
   prog_data->computed_stencil =
      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);

   prog_data->sample_shading = shader->info.fs.uses_sample_shading;
   prog_data->api_sample_shading = key->api_sample_shading;
   prog_data->min_sample_shading = key->min_sample_shading;

   assert(key->multisample_fbo != INTEL_NEVER ||
          key->persample_interp == INTEL_NEVER);

   prog_data->persample_dispatch = key->persample_interp;
   if (prog_data->sample_shading)
      prog_data->persample_dispatch = INTEL_ALWAYS;

   /* We can only persample dispatch if we have a multisample FBO */
   prog_data->persample_dispatch =
      MIN2(prog_data->persample_dispatch, key->multisample_fbo);

   /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
    * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
    * to definitively tell whether alpha_to_coverage is on or off.
    */
   prog_data->alpha_to_coverage = key->alpha_to_coverage;

   assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER);
   prog_data->mesh_input = key->mesh_input;

   assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER);
   prog_data->provoking_vertex_last = key->provoking_vertex_last;

   /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
    *
    *    "MSDISPMODE_PERSAMPLE is required in order to select
    *    POSOFFSET_SAMPLE"
    *
    * So we can only really get sample positions if we are doing real
    * per-sample dispatch. If we need gl_SamplePosition and we don't have
    * persample dispatch, we hard-code it to 0.5.
    */
   prog_data->uses_pos_offset =
      prog_data->persample_dispatch != INTEL_NEVER &&
      (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
       BITSET_TEST(shader->info.system_values_read,
                   SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));

   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
   prog_data->inner_coverage = shader->info.fs.inner_coverage;

   /* From the BDW PRM documentation for 3DSTATE_WM:
    *
    *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
    *    Sample or Non-perspective Sample barycentric coordinates."
    *
    * So clean up any potentially set sample barycentric mode when not in
    * per-sample dispatch.
    */
   if (prog_data->persample_dispatch == INTEL_NEVER) {
      prog_data->barycentric_interp_modes &=
         ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
   }

   if (devinfo->ver >= 20) {
      prog_data->vertex_attributes_bypass =
         brw_needs_vertex_attributes_bypass(shader);
   }

   prog_data->uses_nonperspective_interp_modes =
      (prog_data->barycentric_interp_modes &
       INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
      prog_data->uses_npc_bary_coefficients;

   /* The current VK_EXT_graphics_pipeline_library specification requires
    * coarse to be specified at compile time. But per-sample interpolation can
    * be dynamic. So we should never be in a situation where coarse &
    * persample_interp are both respectively true & INTEL_ALWAYS.
    *
    * Coarse will be dynamically turned off when persample_interp is active.
    */
   assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);

   prog_data->coarse_pixel_dispatch =
      intel_sometimes_invert(prog_data->persample_dispatch);
   if (!key->coarse_pixel ||
       /* DG2 should support this, but Wa_22012766191 says there are issues
        * with CPS 1x1 + MSAA + FS writing to oMask.
        */
       (devinfo->verx10 < 200 &&
        (prog_data->uses_omask || prog_data->uses_sample_mask)) ||
       prog_data->sample_shading ||
       (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
       prog_data->computed_stencil ||
       devinfo->ver < 11) {
      prog_data->coarse_pixel_dispatch = INTEL_NEVER;
   }

   /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
    * Message Descriptor:
    *
    *    "Message Type. Specifies the type of message being sent when
    *    pixel-rate evaluation is requested:
    *
    *    Format = U2
    *    0: Per Message Offset (eval_snapped with immediate offset)
    *    1: Sample Position Offset (eval_sindex)
    *    2: Centroid Position Offset (eval_centroid)
    *    3: Per Slot Offset (eval_snapped with register offset)
    *
    *    Message Type. Specifies the type of message being sent when
    *    coarse-rate evaluation is requested:
    *
    *    Format = U2
    *    0: Coarse to Pixel Mapping Message (internal message)
    *    1: Reserved
    *    2: Coarse Centroid Position (eval_centroid)
    *    3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
    *
    * The Sample Position Offset is marked as reserved for coarse rate
    * evaluation and leads to hangs if we try to use it. So disable coarse
    * pixel shading if we have any intrinsic that will result in a pixel
    * interpolater message at sample.
    */
   if (intel_nir_pulls_at_sample(shader))
      prog_data->coarse_pixel_dispatch = INTEL_NEVER;

   /* We choose to always enable VMask prior to XeHP, as it would cause
    * us to lose out on the eliminate_find_live_channel() optimization.
    */
   prog_data->uses_vmask =
      devinfo->verx10 < 125 ||
      shader->info.fs.needs_coarse_quad_helper_invocations ||
      shader->info.uses_wide_subgroup_intrinsics ||
      prog_data->coarse_pixel_dispatch != INTEL_NEVER;

   prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients;

   if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
      prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth;
      prog_data->uses_src_depth = false;
   }

   calculate_urb_setup(devinfo, key, prog_data, shader, mue_map,
                       per_primitive_offsets);
   brw_compute_flat_inputs(prog_data, shader);

   prog_data->has_side_effects = shader->info.writes_memory;
}

static void
populate_vs_prog_data(nir_shader *nir,
                      const struct intel_device_info *devinfo,
                      const struct brw_vs_prog_key *key,
                      struct brw_vs_prog_data *prog_data,
                      unsigned nr_packed_regs)
{
   unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
   BITSET_WORD *sysvals = nir->info.system_values_read;

   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
    * incoming vertex attribute. So, add an extra slot.
    */
   if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) ||
       BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) ||
       BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
       BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) {
      nr_attribute_slots++;
   }

   /* gl_DrawID and IsIndexedDraw share their very own vec4 */
   if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) ||
       BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
      nr_attribute_slots++;
   }

   const struct {
      bool *data;
      gl_system_value val;
   } bool_sysvals[] = {
      { &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW },
      { &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX },
      { &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE },
      { &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE },
      { &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID },
      { &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID },
   };

   for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) {
      *bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val);
   }

   unsigned nr_attribute_regs;
   if (key->vf_component_packing) {
      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8);
      nr_attribute_regs = nr_packed_regs;
   } else {
      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
      nr_attribute_regs = 4 * nr_attribute_slots;
   }

   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
    * (overwriting the original contents), we need to make sure the size is
    * the larger of the two.
    */
   const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4),
                                     prog_data->base.vue_map.num_slots);
   prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
   prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
}
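
/* Illustration, not part of the commit: with 5 unpacked attribute slots,
 * nr_attribute_regs = 20, i.e. DIV_ROUND_UP(20, 4) = 5 input vec4s; with 8
 * VUE output slots, vue_entries = MAX2(5, 8) = 8 and urb_entry_size =
 * DIV_ROUND_UP(8, 4) = 2 allocation units of four vec4 slots each.
 */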

void
jay_populate_prog_data(const struct intel_device_info *devinfo,
                       nir_shader *nir,
                       union brw_any_prog_data *prog_data,
                       union brw_any_prog_key *key,
                       unsigned nr_packed_regs)
{
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs,
                            nr_packed_regs);
   } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      int per_primitive_offsets[VARYING_SLOT_MAX];
      memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets));

      populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs,
                            NULL /* TODO: mue_map */, per_primitive_offsets);
   } else if (mesa_shader_stage_is_compute(nir->info.stage)) {
      prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr;
      prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr;
   }

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY ||
       nir->info.stage == MESA_SHADER_MESH) {

      uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size);
      uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size,
                                          nir->info.cull_distance_array_size);

      if (nir->info.stage == MESA_SHADER_MESH) {
         prog_data->mesh.clip_distance_mask = clip_mask;
         prog_data->mesh.cull_distance_mask = cull_mask;
      } else {
         prog_data->vue.clip_distance_mask = clip_mask;
         prog_data->vue.cull_distance_mask = cull_mask;
      }
   }
}

@@ -50,16 +50,19 @@ libintel_compiler_jay_files = files(
  'jay_assign_flags.c',
  'jay_from_nir.c',
  'jay_ir.h',
  'jay_insert_fp_mode.c',
  'jay_liveness.c',
  'jay_lower_post_ra.c',
  'jay_lower_pre_ra.c',
  'jay_lower_scoreboard.c',
  'jay_lower_spill.c',
  'jay_nir.c',
  'jay_opt_dead_code.c',
  'jay_opt_control_flow.c',
  'jay_opt_propagate.c',
  'jay_print.c',
  'jay_private.h',
  'jay_prog_data.c',
  'jay_repair_ssa.c',
  'jay_register_allocate.c',
  'jay_simd_width.c',