/* * Copyright 2026 Intel Corporation * SPDX-License-Identifier: MIT */ #include "compiler/brw/brw_eu.h" #include "compiler/brw/brw_eu_defines.h" #include "compiler/brw/brw_nir.h" #include "compiler/brw/brw_private.h" #include "compiler/intel_nir.h" #include "jay_private.h" #include "nir.h" #include "nir_builder.h" #include "nir_intrinsics.h" /* * Jay-to-NIR relies on a careful indexing of defs: every 32-bit word has * its own index. Vectors/64-bit use contiguous indices. We therefore run a * modified version of nir_index_ssa_defs right before translating NIR->Jay. */ static bool index_ssa_def_cb(nir_def *def, void *state) { unsigned *index = (unsigned *) state; def->index = *index; *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32); return true; } static void nj_index_ssa_defs(nir_shader *nir) { nir_foreach_function_impl(impl, nir) { /* The zero index means null in Jay, so start SSA indices at 1 */ unsigned index = 1; nir_foreach_block_unstructured(block, impl) { nir_foreach_instr(instr, block) nir_foreach_def(instr, index_ssa_def_cb, &index); } impl->ssa_alloc = index; } } static bool lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_) { if (intr->intrinsic != nir_intrinsic_load_helper_invocation) return false; /* TODO: Is this right for multisampling? */ b->cursor = nir_before_instr(&intr->instr); nir_def *active = nir_inot(b, nir_inverse_ballot(b, nir_load_dispatch_mask_intel(b))); nir_def_replace(&intr->def, active); return true; } static bool lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) { if (intr->intrinsic != nir_intrinsic_load_frag_coord && intr->intrinsic != nir_intrinsic_load_pixel_coord) return false; b->cursor = nir_before_instr(&intr->instr); nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b)); if (intr->intrinsic == nir_intrinsic_load_frag_coord) { c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)), nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b), nir_frcp(b, nir_load_frag_coord_w_rcp(b))); } nir_def_replace(&intr->def, c); return true; } static bool jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) { b->cursor = nir_after_instr(&intr->instr); unsigned *simd_width = simd_; /* mask & -mask isolates the lowest set bit in the mask. */ if (intr->intrinsic == nir_intrinsic_elect) { nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b)); mask = nir_iand(b, mask, nir_ineg(b, mask)); nir_def_replace(&intr->def, nir_inverse_ballot(b, mask)); return true; } /* Ballots must match the SIMD size */ if (intr->intrinsic == nir_intrinsic_ballot || intr->intrinsic == nir_intrinsic_ballot_relaxed) { unsigned old_bitsize = intr->def.bit_size; intr->def.bit_size = *simd_width; nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize); nir_def_rewrite_uses_after(&intr->def, u2uN); return true; } /* Just a constant */ if (intr->intrinsic == nir_intrinsic_load_simd_width_intel) { nir_def_replace(&intr->def, nir_imm_int(b, *simd_width)); return true; } /* Note: we don't treat read_invocation specially because there's little * benefit but doing so would require expensive uniformizing in some cases. */ if (intr->intrinsic != nir_intrinsic_shuffle && intr->intrinsic != nir_intrinsic_read_invocation) return false; nir_def *data = intr->src[0].ssa; assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized"); nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4); nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B)); return true; } struct frag_out_ctx { nir_def *colour[8], *depth, *stencil, *sample_mask; }; static bool collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_) { struct frag_out_ctx *ctx = ctx_; if (intr->intrinsic != nir_intrinsic_store_output) return false; unsigned wrmask = nir_intrinsic_write_mask(intr); assert(nir_intrinsic_component(intr) == 0 && "component should be lowered"); assert(util_is_power_of_two_nonzero(wrmask + 1) && "complex writemasks should be lowered"); /* TODO: Optimize with write mask? */ gl_frag_result loc = nir_intrinsic_io_semantics(intr).location; assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo"); assert(loc != FRAG_RESULT_DUAL_SRC_BLEND && "todo"); nir_def **out; if (loc == FRAG_RESULT_COLOR) { out = &ctx->colour[0]; } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) { out = &ctx->colour[loc - FRAG_RESULT_DATA0]; } else if (loc == FRAG_RESULT_DEPTH) { out = &ctx->depth; } else if (loc == FRAG_RESULT_STENCIL) { out = &ctx->stencil; } else if (loc == FRAG_RESULT_SAMPLE_MASK) { out = &ctx->sample_mask; } else { UNREACHABLE("invalid location"); } assert((*out) == NULL && "each location written exactly once"); *out = intr->src[0].ssa; nir_instr_remove(&intr->instr); return true; } static void insert_rt_store(nir_builder *b, signed target, nir_def *colour, nir_def *src0_colour, nir_def *depth, nir_def *stencil, nir_def *sample_mask) { bool null_rt = target < 0; target = MAX2(target, 0); colour = nir_pad_vec4(b, colour ?: nir_undef(b, 4, 32)); if (null_rt) { /* Even if we don't write a RT, we still need to write alpha for * alpha-to-coverage and alpha testing. Optimize the other channels out. */ colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32), nir_channel(b, colour, 3), 3); } nir_def *src0_alpha = nir_channel_or_undef(b, src0_colour ?: colour, 3); nir_def *disable = b->shader->info.fs.uses_discard ? nir_is_helper_invocation(b, 1) : nir_imm_false(b); nir_store_render_target_intel(b, colour, src0_alpha, sample_mask, depth, stencil, disable, .target = target); } static void lower_fragment_outputs(nir_function_impl *impl, const struct intel_device_info *devinfo, unsigned nr_color_regions, unsigned dispatch_width) { struct frag_out_ctx ctx = { { NULL } }; nir_function_intrinsics_pass(impl, collect_fragment_output, nir_metadata_control_flow, &ctx); nir_builder b_ = nir_builder_at(nir_after_impl(impl)); nir_builder *b = &b_; assert(nr_color_regions <= ARRAY_SIZE(ctx.colour)); signed last = -1; for (signed i = nr_color_regions - 1; i >= 0; --i) { if (ctx.colour[i]) { last = i; break; } } nir_def *undef = nir_undef(b, 1, 32); for (signed i = 0; i < last; ++i) { if (ctx.colour[i]) { insert_rt_store(b, i, ctx.colour[i], i > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef, ctx.stencil ?: undef, ctx.sample_mask ?: undef); } } insert_rt_store(b, last, last >= 0 ? ctx.colour[last] : NULL, last > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef, ctx.stencil ?: undef, ctx.sample_mask ?: undef); } unsigned jay_process_nir(const struct intel_device_info *devinfo, nir_shader *nir, union brw_any_prog_data *prog_data, union brw_any_prog_key *key) { enum mesa_shader_stage stage = nir->info.stage; struct brw_compiler compiler = { .devinfo = devinfo }; unsigned nr_packed_regs = 0; brw_pass_tracker pt_ = { .nir = nir, .key = &key->base, .dispatch_width = 0, .compiler = &compiler, .archiver = NULL, //params->base.archiver, }, *pt = &pt_; BRW_NIR_SNAPSHOT("first"); prog_data->base.ray_queries = nir->info.ray_queries; prog_data->base.stage = stage; // TODO: Make the driver do this? // prog_data->base.source_hash = params->source_hash; prog_data->base.total_shared = nir->info.shared_size; /* TODO: Real heuristic */ bool do_simd32 = INTEL_SIMD(FS, 32); do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT; unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16; if (stage == MESA_SHADER_VERTEX) { /* We only expect slot compaction to be disabled when using device * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS * programming. This should always be enabled together with VF component * packing to minimize the size of the payload. */ assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing); /* When using Primitive Replication for multiview, each view gets its own * position slot. */ const uint32_t pos_slots = (nir->info.per_view_outputs & VARYING_BIT_POS) ? MAX2(1, util_bitcount(key->base.view_mask)) : 1; /* Only position is allowed to be per-view */ assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS)); brw_compute_vue_map(devinfo, &prog_data->vue.vue_map, nir->info.outputs_written, key->base.vue_layout, pos_slots); brw_nir_apply_key(pt, &key->base, simd_width); prog_data->vs.inputs_read = nir->info.inputs_read; prog_data->vs.double_inputs_read = nir->info.vs.double_inputs; prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction; brw_nir_lower_vs_inputs(nir); brw_nir_lower_vue_outputs(nir); BRW_NIR_SNAPSHOT("after_lower_io"); memset(prog_data->vs.vf_component_packing, 0, sizeof(prog_data->vs.vf_component_packing)); if (key->vs.vf_component_packing) { nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs); } /* Get constant offsets out of the way for proper clip/cull handling */ BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); BRW_NIR_PASS(nir_opt_constant_folding); BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo, &prog_data->vue.vue_map, 0, 0); } else if (stage == MESA_SHADER_FRAGMENT) { assert(key->fs.mesh_input == INTEL_NEVER && "todo"); brw_nir_apply_key(pt, &key->base, 32); brw_nir_lower_fs_inputs(nir, devinfo, &key->fs); brw_nir_lower_fs_outputs(nir); NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); if (!brw_can_coherent_fb_fetch(devinfo)) NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs); NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord); NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord, nir_metadata_control_flow, NULL); NIR_PASS(_, nir, nir_opt_barycentric, true); lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, key->fs.nr_color_regions, simd_width); NIR_PASS(_, nir, nir_lower_helper_writes, true); NIR_PASS(_, nir, nir_lower_is_helper_invocation); NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation, nir_metadata_control_flow, NULL); if (key->fs.alpha_to_coverage != INTEL_NEVER) { /* Run constant fold optimization in order to get the correct source * offset to determine render target 0 store instruction in * emit_alpha_to_coverage pass. */ NIR_PASS(_, nir, nir_opt_constant_folding); NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage); } // TODO // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); /* Do this before lower_fs_config_intel so that the pass has the right * information. */ jay_populate_prog_data(devinfo, nir, prog_data, key, 0); NIR_PASS(_, nir, brw_nir_lower_fs_config_intel, &key->fs, &prog_data->fs); } else { brw_nir_apply_key(pt, &key->base, simd_width); } brw_postprocess_nir_opts(pt); NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd, nir_metadata_control_flow, &simd_width); NIR_PASS(_, nir, nir_opt_algebraic_late); NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16); /* Late postprocess while remaining in SSA */ /* Run fsign lowering again after the last time brw_nir_optimize is called. * As is the case with conversion lowering (below), brw_nir_optimize can * create additional fsign instructions. */ NIR_PASS(_, nir, jay_nir_lower_fsign); NIR_PASS(_, nir, jay_nir_lower_bool); NIR_PASS(_, nir, nir_opt_cse); NIR_PASS(_, nir, nir_opt_dce); NIR_PASS(_, nir, jay_nir_opt_sel_zero); /* Run nir_split_conversions only after the last tiem * brw_nir_optimize is called. Various optimizations invoked there can * rematerialize the conversions that the lowering pass eliminates. */ const nir_split_conversions_options split_conv_opts = { .callback = intel_nir_split_conversions_cb, }; NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts); /* Do this only after the last opt_gcm. GCM will undo this lowering. */ if (stage == MESA_SHADER_FRAGMENT) { NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample); } NIR_PASS(_, nir, nir_opt_constant_folding); NIR_PASS(_, nir, nir_lower_load_const_to_scalar); NIR_PASS(_, nir, nir_lower_all_phis_to_scalar); NIR_PASS(_, nir, nir_opt_copy_prop); NIR_PASS(_, nir, nir_opt_dce); /* Jay requires LCSSA for correctness reading convergent loop-dependent * values outside of a divergent loop. Converting to LCSSA inserts the * required divergent 1-source phi after the loop. */ NIR_PASS(_, nir, nir_convert_to_lcssa, true, true); /* Run divergence analysis at the end */ nir_sweep(nir); nj_index_ssa_defs(nir); nir_divergence_analysis(nir); if (stage != MESA_SHADER_FRAGMENT) jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs); return simd_width; }