From 4eb838eb48a8e489978c6d6fbf1a636da807c434 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 14 Apr 2026 11:25:11 -0400 Subject: [PATCH] jay: split up jay_from_nir.c Big monolithic file, split it up into the relevant pieces. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/jay/jay_from_nir.c | 1101 +------------------ src/intel/compiler/jay/jay_insert_fp_mode.c | 85 ++ src/intel/compiler/jay/jay_nir.c | 462 ++++++++ src/intel/compiler/jay/jay_private.h | 11 + src/intel/compiler/jay/jay_prog_data.c | 581 ++++++++++ src/intel/compiler/jay/meson.build | 3 + 6 files changed, 1149 insertions(+), 1094 deletions(-) create mode 100644 src/intel/compiler/jay/jay_insert_fp_mode.c create mode 100644 src/intel/compiler/jay/jay_nir.c create mode 100644 src/intel/compiler/jay/jay_prog_data.c diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c index a194586a86d..aed974af0de 100644 --- a/src/intel/compiler/jay/jay_from_nir.c +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -7,7 +7,6 @@ #include "compiler/brw/brw_eu.h" #include "compiler/brw/brw_eu_defines.h" #include "compiler/brw/brw_nir.h" -#include "compiler/brw/brw_private.h" #include "compiler/brw/brw_sampler.h" #include "compiler/intel_nir.h" #include "compiler/intel_shader_enums.h" @@ -28,7 +27,6 @@ #include "jay_private.h" #include "nir.h" #include "nir_builder.h" -#include "nir_builder_opcodes.h" #include "nir_defines.h" #include "nir_intrinsics.h" #include "nir_intrinsics_indices.h" @@ -2310,85 +2308,6 @@ jay_emit_eot(struct nir_to_jay_state *nj) } } -static void -set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired) -{ - /* Only touch cr0 if we are changing bits */ - if ((*cr0) != desired) { - jay_builder b = jay_init_builder(f, cursor); - jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired); - *cr0 = desired; - } -} - -static void -jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes) -{ - /* First, work out the global float control mode for the shader */ - uint32_t global = 0x0; - - /* Initially fp16 denorms are flushed-to-zero, handle preserve. */ - if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) { - global |= BRW_CR0_FP16_DENORM_PRESERVE; - } - - /* Initially fp32 denorms are flushed-to-zero, handle preserve. - * - * TODO: Optimize this, we have a dispatch bit. - */ - if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) { - global |= BRW_CR0_FP32_DENORM_PRESERVE; - } - - /* Initially fp64 denorms are flushed to zero, handle preserve. */ - if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) { - global |= BRW_CR0_FP64_DENORM_PRESERVE; - } - - /* By default, we are in round-to-even mode. Note we do not permit setting - * round mode separately by bitsize but this is ok for current APIs. The - * Vulkan driver sets roundingModeIndependence = NONE. - * - * TODO: Optimize this, there is a command buffer bit for it. 
- */ - if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) || - ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) || - ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) { - global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT); - } - - uint32_t cr0 = 0; - jay_function *entrypoint = jay_shader_get_entrypoint(shader); - set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global); - - /* Now handle per-instruction deltas to the global mode */ - jay_foreach_function(shader, func) { - jay_foreach_block(func, block) { - uint32_t current = cr0; - - jay_foreach_inst_in_block(block, I) { - uint32_t required = cr0; - enum jay_rounding_mode round = - (I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND; - - if (round != JAY_ROUND) { - required &= ~BRW_CR0_RND_MODE_MASK; - required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT); - } - - if (jay_type_is_any_float(I->type)) { - set_cr0(func, jay_before_inst(I), &current, required); - } - } - - /* Restore to global state on block boundaries */ - if (jay_num_successors(block) > 0) { - set_cr0(func, jay_after_block(block), &current, cr0); - } - } - } -} - struct payload_builder { jay_builder *b; unsigned offsets[JAY_NUM_SSA_FILES]; @@ -2467,542 +2386,6 @@ setup_compute_payload(struct nir_to_jay_state *nj, struct payload_builder *p) read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); } -static inline enum intel_barycentric_mode -brw_barycentric_mode(const struct brw_fs_prog_key *key, - nir_intrinsic_instr *intr) -{ - const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr); - - /* Barycentric modes don't make sense for flat inputs. */ - assert(mode != INTERP_MODE_FLAT); - - unsigned bary; - switch (intr->intrinsic) { - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_at_offset: - /* When per sample interpolation is dynamic, assume sample interpolation. - * We'll dynamically remap things so that the FS payload is not affected. - */ - bary = key->persample_interp == INTEL_SOMETIMES ?
- INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE : - INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL; - break; - case nir_intrinsic_load_barycentric_centroid: - bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID; - break; - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_sample: - bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE; - break; - default: - UNREACHABLE("invalid intrinsic"); - } - - if (mode == INTERP_MODE_NOPERSPECTIVE) - bary += 3; - - return (enum intel_barycentric_mode) bary; -} - -struct fs_info_ctx { - const struct brw_fs_prog_key *key; - struct brw_fs_prog_data *prog_data; - const struct intel_device_info *devinfo; -}; - -static bool -gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data) -{ - struct fs_info_ctx *ctx = data; - struct brw_fs_prog_data *prog_data = ctx->prog_data; - - switch (intr->intrinsic) { - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - prog_data->barycentric_interp_modes |= - 1 << brw_barycentric_mode(ctx->key, intr); - break; - - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: { - unsigned mode = brw_barycentric_mode(ctx->key, intr); - prog_data->barycentric_interp_modes |= 1 << mode; - prog_data->uses_sample_offsets |= - mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE || - mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE; - - if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) - prog_data->uses_npc_bary_coefficients = true; - else - prog_data->uses_pc_bary_coefficients = true; - break; - } - - case nir_intrinsic_load_frag_coord_z: - prog_data->uses_src_depth = true; - break; - - case nir_intrinsic_load_frag_coord_w_rcp: - prog_data->uses_src_w = true; - break; - - case nir_intrinsic_load_sample_mask_in: - /* TODO: Sample masks are broken and discards are broken and simd32 - * layouts are broken too. XXX. - */ - // prog_data->uses_sample_mask = true; - break; - - case nir_intrinsic_load_pixel_coord_intel: - BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); - break; - - default: - break; - } - - return false; -} - -static void -brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data, - const nir_shader *shader) -{ - prog_data->flat_inputs = 0; - - nir_foreach_shader_in_variable(var, shader) { - if (var->data.interpolation != INTERP_MODE_FLAT || - var->data.per_primitive) - continue; - - unsigned slots = glsl_count_attribute_slots(var->type, false); - for (unsigned s = 0; s < slots; s++) { - int input_index = prog_data->urb_setup[var->data.location + s]; - - if (input_index >= 0) - prog_data->flat_inputs |= 1 << input_index; - } - } -} - -static uint8_t -computed_depth_mode(const nir_shader *shader) -{ - if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { - switch (shader->info.fs.depth_layout) { - case FRAG_DEPTH_LAYOUT_NONE: - case FRAG_DEPTH_LAYOUT_ANY: - return BRW_PSCDEPTH_ON; - case FRAG_DEPTH_LAYOUT_GREATER: - return BRW_PSCDEPTH_ON_GE; - case FRAG_DEPTH_LAYOUT_LESS: - return BRW_PSCDEPTH_ON_LE; - case FRAG_DEPTH_LAYOUT_UNCHANGED: - /* We initially set this to OFF, but having the shader write the - * depth means we allocate register space in the SEND message. The - * difference between the SEND register count and the OFF state - * programming makes the HW hang. - * - * Removing the depth writes also leads to test failures. So use - * LesserThanOrEqual, which fits writing the same value - * (unchanged/equal). 
- * - */ - return BRW_PSCDEPTH_ON_LE; - } - } - return BRW_PSCDEPTH_OFF; -} - -/* - * Build up an array of indices into the urb_setup array that - * references the active entries of the urb_setup array. - * Used to accelerate walking the active entries of the urb_setup array - * on each upload. - */ -static void -brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data) -{ - /* TODO(mesh): Review usage of this in the context of Mesh, we may want to - * skip per-primitive attributes here. - */ - - /* Make sure uint8_t is sufficient */ - static_assert(VARYING_SLOT_MAX <= 0xff); - uint8_t index = 0; - for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { - if (fs_prog_data->urb_setup[attr] >= 0) { - fs_prog_data->urb_setup_attribs[index++] = attr; - } - } - fs_prog_data->urb_setup_attribs_count = index; -} - -static void -calculate_urb_setup(const struct intel_device_info *devinfo, - const struct brw_fs_prog_key *key, - struct brw_fs_prog_data *prog_data, - nir_shader *nir, - const struct brw_mue_map *mue_map, - int *per_primitive_offsets) -{ - memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); - int urb_next = 0; /* in vec4s */ - - /* Figure out where the PrimitiveID lives, either in the per-vertex block - * or in the per-primitive block or both. - */ - const uint64_t per_vert_primitive_id = - key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; - const uint64_t per_prim_primitive_id = - key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID; - const uint64_t inputs_read = - nir->info.inputs_read & - (~nir->info.per_primitive_inputs | per_vert_primitive_id); - const uint64_t per_primitive_header_bits = - VARYING_BIT_PRIMITIVE_SHADING_RATE | - VARYING_BIT_LAYER | - VARYING_BIT_VIEWPORT | - VARYING_BIT_CULL_PRIMITIVE; - const uint64_t per_primitive_inputs = - nir->info.inputs_read & - (nir->info.per_primitive_inputs | per_prim_primitive_id) & - ~per_primitive_header_bits; - struct intel_vue_map vue_map; - uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; - - if (mue_map != NULL) { - memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); - memcpy(per_primitive_offsets, mue_map->per_primitive_offsets, - sizeof(mue_map->per_primitive_offsets)); - - if (!mue_map->wa_18019110168_active) { - u_foreach_bit64(location, per_primitive_inputs) { - assert(per_primitive_offsets[location] != -1); - - first_read_offset = - MIN2(first_read_offset, - (uint32_t) per_primitive_offsets[location]); - per_primitive_stride = - MAX2((uint32_t) per_primitive_offsets[location] + 16, - per_primitive_stride); - } - } else { - first_read_offset = per_primitive_stride = 0; - } - } else { - brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout, - 1 /* pos_slots, TODO */); - brw_compute_per_primitive_map(per_primitive_offsets, - &per_primitive_stride, &first_read_offset, - 0, nir, nir_var_shader_in, - per_primitive_inputs, - true /* separate_shader */); - } - - if (per_primitive_stride > first_read_offset) { - first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); - - /* Remove the first few unused registers */ - for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { - if (per_primitive_offsets[i] == -1) - continue; - per_primitive_offsets[i] -= first_read_offset; - } - - prog_data->num_per_primitive_inputs = - 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); - } else { - prog_data->num_per_primitive_inputs = 0; - } - - /* Now do the per-vertex stuff (what used to be legacy pipeline) */ - - /* If Mesh is involved, we cannot do any 
packing. Documentation doesn't say - * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when - * using Mesh. - */ - if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) { - /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do - * arbitrary rearrangement of the first 16 varying inputs, so we can put - * them wherever we want. Just put them in order. - * - * This is useful because it means that (a) inputs not used by the - * fragment shader won't take up valuable register space, and (b) we - * won't have to recompile the fragment shader if it gets paired with a - * different vertex (or geometry) shader. - */ - for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (inputs_read & BITFIELD64_BIT(i)) { - prog_data->urb_setup[i] = urb_next++; - } - } - } else { - /* We have enough input varyings that the SF/SBE pipeline stage can't - * arbitrarily rearrange them to suit our whim; we have to put them in - * an order that matches the output of the previous pipeline stage - * (geometry or vertex shader). - */ - int first_slot = 0; - for (int i = 0; i < vue_map.num_slots; i++) { - int varying = vue_map.slot_to_varying[i]; - if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) { - first_slot = ROUND_DOWN_TO(i, 2); - break; - } - } - - for (int slot = first_slot; slot < vue_map.num_slots; slot++) { - int varying = vue_map.slot_to_varying[slot]; - if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) { - prog_data->urb_setup[varying] = slot - first_slot; - } - } - urb_next = vue_map.num_slots - first_slot; - } - - prog_data->num_varying_inputs = urb_next; - prog_data->inputs = inputs_read; - prog_data->per_primitive_inputs = per_primitive_inputs; - - brw_compute_urb_setup_index(prog_data); -} - -static void -populate_fs_prog_data(nir_shader *shader, - const struct intel_device_info *devinfo, - const struct brw_fs_prog_key *key, - struct brw_fs_prog_data *prog_data, - const struct brw_mue_map *mue_map, - int *per_primitive_offsets) -{ - struct fs_info_ctx ctx = { - .key = key, - .prog_data = prog_data, - .devinfo = devinfo, - }; - nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx); - - prog_data->uses_kill = shader->info.fs.uses_discard; - prog_data->uses_omask = - !key->ignore_sample_mask_out && - (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); - prog_data->max_polygons = 1; - prog_data->computed_depth_mode = computed_depth_mode(shader); - prog_data->computed_stencil = - shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); - - prog_data->sample_shading = shader->info.fs.uses_sample_shading; - prog_data->api_sample_shading = key->api_sample_shading; - prog_data->min_sample_shading = key->min_sample_shading; - - assert(key->multisample_fbo != INTEL_NEVER || - key->persample_interp == INTEL_NEVER); - - prog_data->persample_dispatch = key->persample_interp; - if (prog_data->sample_shading) - prog_data->persample_dispatch = INTEL_ALWAYS; - - /* We can only persample dispatch if we have a multisample FBO */ - prog_data->persample_dispatch = - MIN2(prog_data->persample_dispatch, key->multisample_fbo); - - /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If - * persample_dispatch & multisample_fbo are not dynamic, Anv should be able - * to definitively tell whether alpha_to_coverage is on or off. 
- */ - prog_data->alpha_to_coverage = key->alpha_to_coverage; - - assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER); - prog_data->mesh_input = key->mesh_input; - - assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER); - prog_data->provoking_vertex_last = key->provoking_vertex_last; - - /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: - * - * "MSDISPMODE_PERSAMPLE is required in order to select - * POSOFFSET_SAMPLE" - * - * So we can only really get sample positions if we are doing real - * per-sample dispatch. If we need gl_SamplePosition and we don't have - * persample dispatch, we hard-code it to 0.5. - */ - prog_data->uses_pos_offset = - prog_data->persample_dispatch != INTEL_NEVER && - (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) || - BITSET_TEST(shader->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); - - prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; - prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; - prog_data->inner_coverage = shader->info.fs.inner_coverage; - - /* From the BDW PRM documentation for 3DSTATE_WM: - * - * "MSDISPMODE_PERSAMPLE is required in order to select Perspective - * Sample or Non- perspective Sample barycentric coordinates." - * - * So cleanup any potentially set sample barycentric mode when not in per - * sample dispatch. - */ - if (prog_data->persample_dispatch == INTEL_NEVER) { - prog_data->barycentric_interp_modes &= - ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE); - } - - if (devinfo->ver >= 20) { - prog_data->vertex_attributes_bypass = - brw_needs_vertex_attributes_bypass(shader); - } - - prog_data->uses_nonperspective_interp_modes = - (prog_data->barycentric_interp_modes & - INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) || - prog_data->uses_npc_bary_coefficients; - - /* The current VK_EXT_graphics_pipeline_library specification requires - * coarse to specified at compile time. But per sample interpolation can be - * dynamic. So we should never be in a situation where coarse & - * persample_interp are both respectively true & INTEL_ALWAYS. - * - * Coarse will dynamically turned off when persample_interp is active. - */ - assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS); - - prog_data->coarse_pixel_dispatch = - intel_sometimes_invert(prog_data->persample_dispatch); - if (!key->coarse_pixel || - /* DG2 should support this, but Wa_22012766191 says there are issues - * with CPS 1x1 + MSAA + FS writing to oMask. - */ - (devinfo->verx10 < 200 && - (prog_data->uses_omask || prog_data->uses_sample_mask)) || - prog_data->sample_shading || - (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || - prog_data->computed_stencil || - devinfo->ver < 11) { - prog_data->coarse_pixel_dispatch = INTEL_NEVER; - } - - /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, - * Message Descriptor : - * - * "Message Type. Specifies the type of message being sent when - * pixel-rate evaluation is requested : - * - * Format = U2 - * 0: Per Message Offset (eval_snapped with immediate offset) - * 1: Sample Position Offset (eval_sindex) - * 2: Centroid Position Offset (eval_centroid) - * 3: Per Slot Offset (eval_snapped with register offset) - * - * Message Type. 
Specifies the type of message being sent when - * coarse-rate evaluation is requested : - * - * Format = U2 - * 0: Coarse to Pixel Mapping Message (internal message) - * 1: Reserved - * 2: Coarse Centroid Position (eval_centroid) - * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" - * - * The Sample Position Offset is marked as reserved for coarse rate - * evaluation and leads to hangs if we try to use it. So disable coarse - * pixel shading if we have any intrinsic that will result in a pixel - * interpolater message at sample. - */ - if (intel_nir_pulls_at_sample(shader)) - prog_data->coarse_pixel_dispatch = INTEL_NEVER; - - /* We choose to always enable VMask prior to XeHP, as it would cause - * us to lose out on the eliminate_find_live_channel() optimization. - */ - prog_data->uses_vmask = - devinfo->verx10 < 125 || - shader->info.fs.needs_coarse_quad_helper_invocations || - shader->info.uses_wide_subgroup_intrinsics || - prog_data->coarse_pixel_dispatch != INTEL_NEVER; - - prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients; - - if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) { - prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth; - prog_data->uses_src_depth = false; - } - - calculate_urb_setup(devinfo, key, prog_data, shader, mue_map, - per_primitive_offsets); - brw_compute_flat_inputs(prog_data, shader); -} - -static void -populate_vs_prog_data(nir_shader *nir, - const struct intel_device_info *devinfo, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *prog_data, - unsigned nr_packed_regs, - bool debug) -{ - unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read); - BITSET_WORD *sysvals = nir->info.system_values_read; - - /* gl_VertexID and gl_InstanceID are system values, but arrive via an - * incoming vertex attribute. So, add an extra slot. - */ - if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) || - BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) || - BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) || - BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) { - nr_attribute_slots++; - } - - /* gl_DrawID and IsIndexedDraw share its very own vec4 */ - if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) || - BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) { - nr_attribute_slots++; - } - - const struct { - bool *data; - gl_system_value val; - } bool_sysvals[] = { - { &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW }, - { &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX }, - { &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE }, - { &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE }, - { &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID }, - { &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID }, - }; - - for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) { - *bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val); - } - - unsigned nr_attribute_regs; - if (key->vf_component_packing) { - prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8); - nr_attribute_regs = nr_packed_regs; - } else { - prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); - nr_attribute_regs = 4 * nr_attribute_slots; - } - - /* Since vertex shaders reuse the same VUE entry for inputs and outputs - * (overwriting the original contents), we need to make sure the size is - * the larger of the two. 
- */ - const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4), - prog_data->base.vue_map.num_slots); - prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; - - if (unlikely(debug)) { - fprintf(stderr, "VS Output "); - brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX); - } -} - static void setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p) { @@ -3196,275 +2579,6 @@ jay_gather_stats(const jay_shader *s, struct genisa_stats *stats) stats->sends -= (s->spills + s->fills); } -/* - * Jay-to-NIR relies on a careful indexing of defs: every 32-bit word has - * its own index. Vectors/64-bit use contiguous indices. We therefore run a - * modified version of nir_index_ssa_defs right before translating NIR->Jay. - */ -static bool -index_ssa_def_cb(nir_def *def, void *state) -{ - unsigned *index = (unsigned *) state; - def->index = *index; - *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32); - return true; -} - -static void -nj_index_ssa_defs(nir_shader *nir) -{ - nir_foreach_function_impl(impl, nir) { - /* The zero index means null in Jay, so start SSA indices at 1 */ - unsigned index = 1; - - nir_foreach_block_unstructured(block, impl) { - nir_foreach_instr(instr, block) - nir_foreach_def(instr, index_ssa_def_cb, &index); - } - - impl->ssa_alloc = index; - } -} - -static bool -lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_) -{ - if (intr->intrinsic != nir_intrinsic_load_helper_invocation) - return false; - - /* TODO: Is this right for multisampling? */ - b->cursor = nir_before_instr(&intr->instr); - nir_def *active = - nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b))); - - nir_def_replace(&intr->def, active); - return true; -} - -static bool -lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) -{ - if (intr->intrinsic != nir_intrinsic_load_frag_coord && - intr->intrinsic != nir_intrinsic_load_pixel_coord) - return false; - - b->cursor = nir_before_instr(&intr->instr); - nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b)); - - if (intr->intrinsic == nir_intrinsic_load_frag_coord) { - c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)), - nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b), - nir_frcp(b, nir_load_frag_coord_w_rcp(b))); - } - - nir_def_replace(&intr->def, c); - return true; -} - -static bool -jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) -{ - b->cursor = nir_after_instr(&intr->instr); - unsigned *simd_width = simd_; - - /* mask & -mask isolates the lowest set bit in the mask. */ - if (intr->intrinsic == nir_intrinsic_elect) { - nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b)); - mask = nir_iand(b, mask, nir_ineg(b, mask)); - nir_def_replace(&intr->def, nir_inverse_ballot(b, mask)); - return true; - } - - /* Ballots must match the SIMD size */ - if (intr->intrinsic == nir_intrinsic_ballot || - intr->intrinsic == nir_intrinsic_ballot_relaxed) { - unsigned old_bitsize = intr->def.bit_size; - intr->def.bit_size = *simd_width; - nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize); - nir_def_rewrite_uses_after(&intr->def, u2uN); - return true; - } - - /* Note: we don't treat read_invocation specially because there's little - * benefit but doing so would require expensive uniformizing in some cases. 
- */ - if (intr->intrinsic != nir_intrinsic_shuffle && - intr->intrinsic != nir_intrinsic_read_invocation) - return false; - - nir_def *data = intr->src[0].ssa; - assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized"); - - nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4); - nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B)); - return true; -} - -struct frag_out_ctx { - nir_def *colour[8], *depth, *stencil, *sample_mask; -}; - -static bool -collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_) -{ - struct frag_out_ctx *ctx = ctx_; - if (intr->intrinsic != nir_intrinsic_store_output) - return false; - - unsigned wrmask = nir_intrinsic_write_mask(intr); - assert(nir_intrinsic_component(intr) == 0 && "component should be lowered"); - assert(util_is_power_of_two_nonzero(wrmask + 1) && - "complex writemasks should be lowered"); - - /* TODO: Optimize with write mask? */ - - gl_frag_result loc = nir_intrinsic_io_semantics(intr).location; - assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo"); - nir_def **out; - if (loc == FRAG_RESULT_COLOR) { - out = &ctx->colour[0]; - } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) { - out = &ctx->colour[loc - FRAG_RESULT_DATA0]; - } else if (loc == FRAG_RESULT_DEPTH) { - out = &ctx->depth; - } else if (loc == FRAG_RESULT_STENCIL) { - UNREACHABLE("todo"); - out = &ctx->stencil; - } else if (loc == FRAG_RESULT_SAMPLE_MASK) { - UNREACHABLE("todo"); - out = &ctx->sample_mask; - } else { - UNREACHABLE("invalid location"); - } - - assert((*out) == NULL && "each location written exactly once"); - *out = intr->src[0].ssa; - - nir_instr_remove(&intr->instr); - return true; -} - -static void -append_payload(nir_builder *b, - nir_def **payload, - unsigned *len, - unsigned max_len, - nir_def *value) -{ - if (value != NULL) { - for (unsigned i = 0; i < value->num_components; ++i) { - payload[*len] = nir_channel(b, value, i); - (*len)++; - assert((*len) <= max_len); - } - } -} - -static void -insert_rt_store(nir_builder *b, - const struct intel_device_info *devinfo, - signed target, - bool last, - nir_def *colour, - nir_def *src0_alpha, - nir_def *depth, - nir_def *stencil, - nir_def *sample_mask, - unsigned dispatch_width) -{ - bool null_rt = target < 0; - target = MAX2(target, 0); - - if (!colour) { - colour = nir_undef(b, 4, 32); - } - - colour = nir_pad_vec4(b, colour); - - if (null_rt) { - /* Even if we don't write a RT, we still need to write alpha for - * alpha-to-coverage and alpha testing. Optimize the other channels out. - */ - colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32), - nir_channel(b, colour, 3), 3); - } - - /* TODO: Not sure I like this. We'll see what 2src looks like. */ - unsigned op = dispatch_width == 32 ? - XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE : - BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; - uint64_t desc = - brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */); - - uint64_t ex_desc = 0; - if (devinfo->ver >= 20) { - ex_desc = target << 21 | - null_rt << 20 | - (src0_alpha ? (1 << 15) : 0) | - (stencil ? (1 << 14) : 0) | - (depth ? (1 << 13) : 0) | - (sample_mask ? (1 << 12) : 0); - } else if (devinfo->ver >= 11) { - /* Set the "Render Target Index" and "Src0 Alpha Present" fields - * in the extended message descriptor, in lieu of using a header. - */ - ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? 
(1 << 15) : 0); - } - - /* Build the payload */ - nir_def *payload[8] = { NULL }; - unsigned len = 0; - append_payload(b, payload, &len, ARRAY_SIZE(payload), colour); - append_payload(b, payload, &len, ARRAY_SIZE(payload), depth); - /* TODO */ - - nir_def *disable = b->shader->info.fs.uses_discard ? - nir_is_helper_invocation(b, 1) : - nir_imm_false(b); - - nir_store_render_target_intel(b, nir_vec(b, payload, len), - nir_imm_ivec2(b, desc, ex_desc), disable, - .eot = last); -} - -static void -lower_fragment_outputs(nir_function_impl *impl, - const struct intel_device_info *devinfo, - unsigned nr_color_regions, - unsigned dispatch_width) -{ - struct frag_out_ctx ctx = { { NULL } }; - nir_function_intrinsics_pass(impl, collect_fragment_output, - nir_metadata_control_flow, &ctx); - nir_builder b_ = nir_builder_at(nir_after_impl(impl)); - nir_builder *b = &b_; - assert(nr_color_regions <= ARRAY_SIZE(ctx.colour)); - - signed first = -1; - for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) { - if (ctx.colour[i]) { - first = i; - break; - } - } - - /* Do the later render targets first */ - for (unsigned i = first + 1; i < nr_color_regions; ++i) { - if (ctx.colour[i]) { - insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL, - NULL, dispatch_width); - } - } - - /* Finally do render target zero attaching all the sideband things and - * setting the LastRT bit. This needs to exist even if nothing is written - * since it also signals end-of-thread. - */ - insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true, - first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth, - ctx.stencil, ctx.sample_mask, dispatch_width); -} - struct jay_shader_bin * jay_compile(const struct intel_device_info *devinfo, void *mem_ctx, @@ -3473,177 +2587,8 @@ jay_compile(const struct intel_device_info *devinfo, union brw_any_prog_key *key) { jay_debug = debug_get_option_jay_debug(); - enum mesa_shader_stage stage = nir->info.stage; - bool debug = INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage)); - struct brw_compiler compiler = { .devinfo = devinfo }; - unsigned nr_packed_regs = 0; - - brw_pass_tracker pt_ = { - .nir = nir, - .key = &key->base, - .dispatch_width = 0, - .compiler = &compiler, - .archiver = NULL, //params->base.archiver, - }, *pt = &pt_; - - BRW_NIR_SNAPSHOT("first"); - - prog_data->base.ray_queries = nir->info.ray_queries; - prog_data->base.stage = stage; - // TODO: Make the driver do this? - // prog_data->base.source_hash = params->source_hash; - prog_data->base.total_shared = nir->info.shared_size; - - /* TODO: Real heuristic */ - bool do_simd32 = INTEL_SIMD(FS, 32); - do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT; - unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16; - - if (stage == MESA_SHADER_VERTEX) { - /* We only expect slot compaction to be disabled when using device - * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS - * programming. This should always be enabled together with VF component - * packing to minimize the size of the payload. - */ - assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing); - - /* When using Primitive Replication for multiview, each view gets its own - * position slot. - */ - const uint32_t pos_slots = - (nir->info.per_view_outputs & VARYING_BIT_POS) ? 
- MAX2(1, util_bitcount(key->base.view_mask)) : - 1; - - /* Only position is allowed to be per-view */ - assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS)); - - brw_compute_vue_map(devinfo, &prog_data->vue.vue_map, - nir->info.outputs_written, key->base.vue_layout, - pos_slots); - - brw_nir_apply_key(pt, &key->base, simd_width); - - prog_data->vs.inputs_read = nir->info.inputs_read; - prog_data->vs.double_inputs_read = nir->info.vs.double_inputs; - prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction; - - brw_nir_lower_vs_inputs(nir); - brw_nir_lower_vue_outputs(nir); - BRW_NIR_SNAPSHOT("after_lower_io"); - - memset(prog_data->vs.vf_component_packing, 0, - sizeof(prog_data->vs.vf_component_packing)); - if (key->vs.vf_component_packing) { - nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs); - } - - /* Get constant offsets out of the way for proper clip/cull handling */ - BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); - BRW_NIR_PASS(nir_opt_constant_folding); - BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo, - &prog_data->vue.vue_map, 0, 0); - } else if (stage == MESA_SHADER_FRAGMENT) { - assert(key->fs.mesh_input == INTEL_NEVER && "todo"); - assert(!key->fs.force_dual_color_blend && "todo"); - brw_nir_apply_key(pt, &key->base, 32); - brw_nir_lower_fs_inputs(nir, devinfo, &key->fs); - brw_nir_lower_fs_outputs(nir); - NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); - - if (!brw_can_coherent_fb_fetch(devinfo)) - NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs); - - NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord); - NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord, - nir_metadata_control_flow, NULL); - NIR_PASS(_, nir, nir_opt_barycentric, true); - - lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, - key->fs.nr_color_regions, simd_width); - NIR_PASS(_, nir, nir_lower_helper_writes, true); - NIR_PASS(_, nir, nir_lower_is_helper_invocation); - NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation, - nir_metadata_control_flow, NULL); - - if (key->fs.alpha_to_coverage != INTEL_NEVER) { - /* Run constant fold optimization in order to get the correct source - * offset to determine render target 0 store instruction in - * emit_alpha_to_coverage pass. - */ - NIR_PASS(_, nir, nir_opt_constant_folding); - NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage); - } - - // TODO - // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); - - if (!brw_fs_prog_key_is_dynamic(&key->fs)) { - uint32_t f = 0; - - if (key->fs.multisample_fbo == INTEL_ALWAYS) - f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO; - - if (key->fs.alpha_to_coverage == INTEL_ALWAYS) - f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE; - - if (key->fs.provoking_vertex_last == INTEL_ALWAYS) - f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST; - - if (key->fs.persample_interp == INTEL_ALWAYS) { - f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH | - INTEL_FS_CONFIG_PERSAMPLE_INTERP; - } - - NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel, - f); - } - } else { - brw_nir_apply_key(pt, &key->base, simd_width); - } - - brw_postprocess_nir_opts(pt); - - NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd, - nir_metadata_control_flow, &simd_width); - NIR_PASS(_, nir, nir_opt_algebraic_late); - NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16); - - /* Late postprocess while remaining in SSA */ - /* Run fsign lowering again after the last time brw_nir_optimize is called. 
- * As is the case with conversion lowering (below), brw_nir_optimize can - * create additional fsign instructions. - */ - NIR_PASS(_, nir, jay_nir_lower_fsign); - NIR_PASS(_, nir, jay_nir_lower_bool); - NIR_PASS(_, nir, nir_opt_cse); - NIR_PASS(_, nir, nir_opt_dce); - NIR_PASS(_, nir, jay_nir_opt_sel_zero); - - /* Run nir_split_conversions only after the last tiem - * brw_nir_optimize is called. Various optimizations invoked there can - * rematerialize the conversions that the lowering pass eliminates. - */ - const nir_split_conversions_options split_conv_opts = { - .callback = intel_nir_split_conversions_cb, - }; - NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts); - - /* Do this only after the last opt_gcm. GCM will undo this lowering. */ - if (stage == MESA_SHADER_FRAGMENT) { - NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample); - } - - NIR_PASS(_, nir, nir_opt_constant_folding); - NIR_PASS(_, nir, nir_lower_load_const_to_scalar); - NIR_PASS(_, nir, nir_lower_all_phis_to_scalar); - NIR_PASS(_, nir, nir_opt_copy_prop); - NIR_PASS(_, nir, nir_opt_dce); - - /* Run divergence analysis at the end */ - nir_sweep(nir); - nj_index_ssa_defs(nir); - nir_divergence_analysis(nir); + bool debug = INTEL_DEBUG(intel_debug_flag_for_shader_stage(nir->info.stage)); + unsigned simd_width = jay_process_nir(devinfo, nir, prog_data, key); if (debug) { /* We can't use nir_print_shader since it reindexes SSA defs. */ @@ -3652,18 +2597,7 @@ jay_compile(const struct intel_device_info *devinfo, fflush(stdout); } - if (stage == MESA_SHADER_VERTEX) { - populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs, - nr_packed_regs, debug); - } else if (stage == MESA_SHADER_FRAGMENT) { - int per_primitive_offsets[VARYING_SLOT_MAX]; - memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); - - populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs, - NULL /* TODO: mue_map */, per_primitive_offsets); - } - - jay_shader *s = jay_new_shader(NULL, stage); + jay_shader *s = jay_new_shader(NULL, nir->info.stage); s->dispatch_width = simd_width; s->scratch_size = align(nir->scratch_size, 4) * s->dispatch_width; s->devinfo = devinfo; @@ -3729,13 +2663,13 @@ jay_compile(const struct intel_device_info *devinfo, jay_gather_stats(s, &bin->stats); bin->stats.code_size = bin->size; - if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) { + if (debug) { if (nir->info.label) { printf("%s - ", nir->info.label); } const char *shader_name = - ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(stage), + ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(s->stage), s->dispatch_width); genisa_stats_fprintf(stdout, shader_name, &bin->stats); } @@ -3743,7 +2677,7 @@ jay_compile(const struct intel_device_info *devinfo, bin->stats.workgroup_memory_size = nir->info.shared_size; bin->stats.dispatch_width = simd_width; - if (stage == MESA_SHADER_FRAGMENT) { + if (s->stage == MESA_SHADER_FRAGMENT) { if (simd_width == 8) { prog_data->fs.dispatch_8 = true; } else if (simd_width == 16) { @@ -3754,13 +2688,10 @@ jay_compile(const struct intel_device_info *devinfo, prog_data->fs.prog_offset_32 = 0; } - prog_data->fs.has_side_effects = nir->info.writes_memory; - } else if (mesa_shader_stage_is_compute(stage)) { + } else if (mesa_shader_stage_is_compute(s->stage)) { unsigned i = simd_width == 8 ? 0 : simd_width == 16 ? 
1 : 2; prog_data->cs.prog_offset[i] = 0; prog_data->cs.prog_mask = BITFIELD_BIT(i); - prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr; - prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr; prog_data->cs.prog_spilled = s->scratch_size > 0; /* XXX */ } @@ -3789,24 +2720,6 @@ jay_compile(const struct intel_device_info *devinfo, util_next_power_of_two(s->scratch_size)); } - if (stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL || - stage == MESA_SHADER_GEOMETRY || - stage == MESA_SHADER_MESH) { - - uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size); - uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size, - nir->info.cull_distance_array_size); - - if (stage == MESA_SHADER_MESH) { - prog_data->mesh.clip_distance_mask = clip_mask; - prog_data->mesh.cull_distance_mask = cull_mask; - } else { - prog_data->vue.clip_distance_mask = clip_mask; - prog_data->vue.cull_distance_mask = cull_mask; - } - } - /* Scratch is allocated in 1KiB increments. */ prog_data->base.total_scratch = align(prog_data->base.total_scratch, 1024); diff --git a/src/intel/compiler/jay/jay_insert_fp_mode.c b/src/intel/compiler/jay/jay_insert_fp_mode.c new file mode 100644 index 00000000000..f7fbc82b31d --- /dev/null +++ b/src/intel/compiler/jay/jay_insert_fp_mode.c @@ -0,0 +1,85 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#include "jay_builder.h" +#include "jay_ir.h" + +static void +set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired) +{ + /* Only touch cr0 if we are changing bits */ + if ((*cr0) != desired) { + jay_builder b = jay_init_builder(f, cursor); + jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired); + *cr0 = desired; + } +} + +void +jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes) +{ + /* First, work out the global float control mode for the shader */ + uint32_t global = 0x0; + + /* Initially fp16 denorms are flushed-to-zero, handle preserve. */ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) { + global |= BRW_CR0_FP16_DENORM_PRESERVE; + } + + /* Initially fp32 denorms are flushed-to-zero, handle preserve. + * + * TODO: Optimize this, we have a dispatch bit. + */ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) { + global |= BRW_CR0_FP32_DENORM_PRESERVE; + } + + /* Initially fp64 denorms are flushed to zero, handle preserve. */ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) { + global |= BRW_CR0_FP64_DENORM_PRESERVE; + } + + /* By default, we are in round-to-even mode. Note we do not permit setting + * round mode separately by bitsize but this is ok for current APIs. The + * Vulkan driver sets roundingModeIndependence = NONE. + * + * TODO: Optimize this, there is a command buffer bit for it. 
+ */ + if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) || + ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) || + ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) { + global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT); + } + + uint32_t cr0 = 0; + jay_function *entrypoint = jay_shader_get_entrypoint(shader); + set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global); + + /* Now handle per-instruction deltas to the global mode */ + jay_foreach_function(shader, func) { + jay_foreach_block(func, block) { + uint32_t current = cr0; + + jay_foreach_inst_in_block(block, I) { + uint32_t required = cr0; + enum jay_rounding_mode round = + (I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND; + + if (round != JAY_ROUND) { + required &= ~BRW_CR0_RND_MODE_MASK; + required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT); + } + + if (jay_type_is_any_float(I->type)) { + set_cr0(func, jay_before_inst(I), &current, required); + } + } + + /* Restore to global state on block boundaries */ + if (jay_num_successors(block) > 0) { + set_cr0(func, jay_after_block(block), &current, cr0); + } + } + } +} diff --git a/src/intel/compiler/jay/jay_nir.c b/src/intel/compiler/jay/jay_nir.c new file mode 100644 index 00000000000..b50b98248fd --- /dev/null +++ b/src/intel/compiler/jay/jay_nir.c @@ -0,0 +1,462 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "compiler/brw/brw_nir.h" +#include "compiler/brw/brw_private.h" +#include "compiler/intel_nir.h" +#include "jay_private.h" +#include "nir.h" +#include "nir_builder.h" + +/* + * Jay-to-NIR relies on a careful indexing of defs: every 32-bit word has + * its own index. Vectors/64-bit use contiguous indices. We therefore run a + * modified version of nir_index_ssa_defs right before translating NIR->Jay. + */ +static bool +index_ssa_def_cb(nir_def *def, void *state) +{ + unsigned *index = (unsigned *) state; + def->index = *index; + *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32); + return true; +} + +static void +nj_index_ssa_defs(nir_shader *nir) +{ + nir_foreach_function_impl(impl, nir) { + /* The zero index means null in Jay, so start SSA indices at 1 */ + unsigned index = 1; + + nir_foreach_block_unstructured(block, impl) { + nir_foreach_instr(instr, block) + nir_foreach_def(instr, index_ssa_def_cb, &index); + } + + impl->ssa_alloc = index; + } +} + +static bool +lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_) +{ + if (intr->intrinsic != nir_intrinsic_load_helper_invocation) + return false; + + /* TODO: Is this right for multisampling?
*/ + b->cursor = nir_before_instr(&intr->instr); + nir_def *active = + nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b))); + + nir_def_replace(&intr->def, active); + return true; +} + +static bool +lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) +{ + if (intr->intrinsic != nir_intrinsic_load_frag_coord && + intr->intrinsic != nir_intrinsic_load_pixel_coord) + return false; + + b->cursor = nir_before_instr(&intr->instr); + nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b)); + + if (intr->intrinsic == nir_intrinsic_load_frag_coord) { + c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)), + nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b), + nir_frcp(b, nir_load_frag_coord_w_rcp(b))); + } + + nir_def_replace(&intr->def, c); + return true; +} + +static bool +jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) +{ + b->cursor = nir_after_instr(&intr->instr); + unsigned *simd_width = simd_; + + /* mask & -mask isolates the lowest set bit in the mask. */ + if (intr->intrinsic == nir_intrinsic_elect) { + nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b)); + mask = nir_iand(b, mask, nir_ineg(b, mask)); + nir_def_replace(&intr->def, nir_inverse_ballot(b, mask)); + return true; + } + + /* Ballots must match the SIMD size */ + if (intr->intrinsic == nir_intrinsic_ballot || + intr->intrinsic == nir_intrinsic_ballot_relaxed) { + unsigned old_bitsize = intr->def.bit_size; + intr->def.bit_size = *simd_width; + nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize); + nir_def_rewrite_uses_after(&intr->def, u2uN); + return true; + } + + /* Note: we don't treat read_invocation specially because there's little + * benefit but doing so would require expensive uniformizing in some cases. + */ + if (intr->intrinsic != nir_intrinsic_shuffle && + intr->intrinsic != nir_intrinsic_read_invocation) + return false; + + nir_def *data = intr->src[0].ssa; + assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized"); + + nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4); + nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B)); + return true; +} + +struct frag_out_ctx { + nir_def *colour[8], *depth, *stencil, *sample_mask; +}; + +static bool +collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_) +{ + struct frag_out_ctx *ctx = ctx_; + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + unsigned wrmask = nir_intrinsic_write_mask(intr); + assert(nir_intrinsic_component(intr) == 0 && "component should be lowered"); + assert(util_is_power_of_two_nonzero(wrmask + 1) && + "complex writemasks should be lowered"); + + /* TODO: Optimize with write mask? 
*/ + + gl_frag_result loc = nir_intrinsic_io_semantics(intr).location; + assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo"); + nir_def **out; + if (loc == FRAG_RESULT_COLOR) { + out = &ctx->colour[0]; + } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) { + out = &ctx->colour[loc - FRAG_RESULT_DATA0]; + } else if (loc == FRAG_RESULT_DEPTH) { + out = &ctx->depth; + } else if (loc == FRAG_RESULT_STENCIL) { + UNREACHABLE("todo"); + out = &ctx->stencil; + } else if (loc == FRAG_RESULT_SAMPLE_MASK) { + UNREACHABLE("todo"); + out = &ctx->sample_mask; + } else { + UNREACHABLE("invalid location"); + } + + assert((*out) == NULL && "each location written exactly once"); + *out = intr->src[0].ssa; + + nir_instr_remove(&intr->instr); + return true; +} + +static void +append_payload(nir_builder *b, + nir_def **payload, + unsigned *len, + unsigned max_len, + nir_def *value) +{ + if (value != NULL) { + for (unsigned i = 0; i < value->num_components; ++i) { + payload[*len] = nir_channel(b, value, i); + (*len)++; + assert((*len) <= max_len); + } + } +} + +static void +insert_rt_store(nir_builder *b, + const struct intel_device_info *devinfo, + signed target, + bool last, + nir_def *colour, + nir_def *src0_alpha, + nir_def *depth, + nir_def *stencil, + nir_def *sample_mask, + unsigned dispatch_width) +{ + bool null_rt = target < 0; + target = MAX2(target, 0); + + if (!colour) { + colour = nir_undef(b, 4, 32); + } + + colour = nir_pad_vec4(b, colour); + + if (null_rt) { + /* Even if we don't write a RT, we still need to write alpha for + * alpha-to-coverage and alpha testing. Optimize the other channels out. + */ + colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32), + nir_channel(b, colour, 3), 3); + } + + /* TODO: Not sure I like this. We'll see what 2src looks like. */ + unsigned op = dispatch_width == 32 ? + XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE : + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + uint64_t desc = + brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */); + + uint64_t ex_desc = 0; + if (devinfo->ver >= 20) { + ex_desc = target << 21 | + null_rt << 20 | + (src0_alpha ? (1 << 15) : 0) | + (stencil ? (1 << 14) : 0) | + (depth ? (1 << 13) : 0) | + (sample_mask ? (1 << 12) : 0); + } else if (devinfo->ver >= 11) { + /* Set the "Render Target Index" and "Src0 Alpha Present" fields + * in the extended message descriptor, in lieu of using a header. + */ + ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0); + } + + /* Build the payload */ + nir_def *payload[8] = { NULL }; + unsigned len = 0; + append_payload(b, payload, &len, ARRAY_SIZE(payload), colour); + append_payload(b, payload, &len, ARRAY_SIZE(payload), depth); + /* TODO */ + + nir_def *disable = b->shader->info.fs.uses_discard ? 
+ nir_is_helper_invocation(b, 1) : + nir_imm_false(b); + + nir_store_render_target_intel(b, nir_vec(b, payload, len), + nir_imm_ivec2(b, desc, ex_desc), disable, + .eot = last); +} + +static void +lower_fragment_outputs(nir_function_impl *impl, + const struct intel_device_info *devinfo, + unsigned nr_color_regions, + unsigned dispatch_width) +{ + struct frag_out_ctx ctx = { { NULL } }; + nir_function_intrinsics_pass(impl, collect_fragment_output, + nir_metadata_control_flow, &ctx); + nir_builder b_ = nir_builder_at(nir_after_impl(impl)); + nir_builder *b = &b_; + assert(nr_color_regions <= ARRAY_SIZE(ctx.colour)); + + signed first = -1; + for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) { + if (ctx.colour[i]) { + first = i; + break; + } + } + + /* Do the later render targets first */ + for (unsigned i = first + 1; i < nr_color_regions; ++i) { + if (ctx.colour[i]) { + insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL, + NULL, dispatch_width); + } + } + + /* Finally do render target zero attaching all the sideband things and + * setting the LastRT bit. This needs to exist even if nothing is written + * since it also signals end-of-thread. + */ + insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true, + first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth, + ctx.stencil, ctx.sample_mask, dispatch_width); +} + +unsigned +jay_process_nir(const struct intel_device_info *devinfo, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key) +{ + enum mesa_shader_stage stage = nir->info.stage; + struct brw_compiler compiler = { .devinfo = devinfo }; + unsigned nr_packed_regs = 0; + + brw_pass_tracker pt_ = { + .nir = nir, + .key = &key->base, + .dispatch_width = 0, + .compiler = &compiler, + .archiver = NULL, //params->base.archiver, + }, *pt = &pt_; + + BRW_NIR_SNAPSHOT("first"); + + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.stage = stage; + // TODO: Make the driver do this? + // prog_data->base.source_hash = params->source_hash; + prog_data->base.total_shared = nir->info.shared_size; + + /* TODO: Real heuristic */ + bool do_simd32 = INTEL_SIMD(FS, 32); + do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT; + unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16; + + if (stage == MESA_SHADER_VERTEX) { + /* We only expect slot compaction to be disabled when using device + * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS + * programming. This should always be enabled together with VF component + * packing to minimize the size of the payload. + */ + assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing); + + /* When using Primitive Replication for multiview, each view gets its own + * position slot. + */ + const uint32_t pos_slots = + (nir->info.per_view_outputs & VARYING_BIT_POS) ? 
+ MAX2(1, util_bitcount(key->base.view_mask)) : + 1; + + /* Only position is allowed to be per-view */ + assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS)); + + brw_compute_vue_map(devinfo, &prog_data->vue.vue_map, + nir->info.outputs_written, key->base.vue_layout, + pos_slots); + + brw_nir_apply_key(pt, &key->base, simd_width); + + prog_data->vs.inputs_read = nir->info.inputs_read; + prog_data->vs.double_inputs_read = nir->info.vs.double_inputs; + prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction; + + brw_nir_lower_vs_inputs(nir); + brw_nir_lower_vue_outputs(nir); + BRW_NIR_SNAPSHOT("after_lower_io"); + + memset(prog_data->vs.vf_component_packing, 0, + sizeof(prog_data->vs.vf_component_packing)); + if (key->vs.vf_component_packing) { + nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs); + } + + /* Get constant offsets out of the way for proper clip/cull handling */ + BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + BRW_NIR_PASS(nir_opt_constant_folding); + BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo, + &prog_data->vue.vue_map, 0, 0); + } else if (stage == MESA_SHADER_FRAGMENT) { + assert(key->fs.mesh_input == INTEL_NEVER && "todo"); + assert(!key->fs.force_dual_color_blend && "todo"); + brw_nir_apply_key(pt, &key->base, 32); + brw_nir_lower_fs_inputs(nir, devinfo, &key->fs); + brw_nir_lower_fs_outputs(nir); + NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); + + if (!brw_can_coherent_fb_fetch(devinfo)) + NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs); + + NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord, + nir_metadata_control_flow, NULL); + NIR_PASS(_, nir, nir_opt_barycentric, true); + + lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, + key->fs.nr_color_regions, simd_width); + NIR_PASS(_, nir, nir_lower_helper_writes, true); + NIR_PASS(_, nir, nir_lower_is_helper_invocation); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation, + nir_metadata_control_flow, NULL); + + if (key->fs.alpha_to_coverage != INTEL_NEVER) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. + */ + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage); + } + + // TODO + // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); + + if (!brw_fs_prog_key_is_dynamic(&key->fs)) { + uint32_t f = 0; + + if (key->fs.multisample_fbo == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO; + + if (key->fs.alpha_to_coverage == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE; + + if (key->fs.provoking_vertex_last == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST; + + if (key->fs.persample_interp == INTEL_ALWAYS) { + f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH | + INTEL_FS_CONFIG_PERSAMPLE_INTERP; + } + + NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel, + f); + } + } else { + brw_nir_apply_key(pt, &key->base, simd_width); + } + + brw_postprocess_nir_opts(pt); + + NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd, + nir_metadata_control_flow, &simd_width); + NIR_PASS(_, nir, nir_opt_algebraic_late); + NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16); + + /* Late postprocess while remaining in SSA */ + /* Run fsign lowering again after the last time brw_nir_optimize is called. 
+ * As is the case with conversion lowering (below), brw_nir_optimize can
+ * create additional fsign instructions.
+ */
+ NIR_PASS(_, nir, jay_nir_lower_fsign);
+ NIR_PASS(_, nir, jay_nir_lower_bool);
+ NIR_PASS(_, nir, nir_opt_cse);
+ NIR_PASS(_, nir, nir_opt_dce);
+ NIR_PASS(_, nir, jay_nir_opt_sel_zero);
+
+ /* Run nir_split_conversions only after the last time
+ * brw_nir_optimize is called. Various optimizations invoked there can
+ * rematerialize the conversions that the lowering pass eliminates.
+ */
+ const nir_split_conversions_options split_conv_opts = {
+ .callback = intel_nir_split_conversions_cb,
+ };
+ NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts);
+
+ /* Do this only after the last opt_gcm. GCM will undo this lowering. */
+ if (stage == MESA_SHADER_FRAGMENT) {
+ NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample);
+ }
+
+ NIR_PASS(_, nir, nir_opt_constant_folding);
+ NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
+ NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
+ NIR_PASS(_, nir, nir_opt_copy_prop);
+ NIR_PASS(_, nir, nir_opt_dce);
+
+ /* Run divergence analysis at the end */
+ nir_sweep(nir);
+ nj_index_ssa_defs(nir);
+ nir_divergence_analysis(nir);
+
+ jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
+ return simd_width;
+}
diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h
index 2799eaa7b7b..e0ceaebdaed 100644
--- a/src/intel/compiler/jay/jay_private.h
+++ b/src/intel/compiler/jay/jay_private.h
@@ -22,6 +22,16 @@ bool jay_nir_lower_bool(nir_shader *nir);
 bool jay_nir_opt_sel_zero(nir_shader *nir);
 bool jay_nir_lower_fsign(nir_shader *nir);
+void jay_populate_prog_data(const struct intel_device_info *devinfo,
+ nir_shader *nir,
+ union brw_any_prog_data *prog_data,
+ union brw_any_prog_key *key,
+ unsigned nr_packed_regs);
+unsigned jay_process_nir(const struct intel_device_info *devinfo,
+ nir_shader *nir,
+ union brw_any_prog_data *prog_data,
+ union brw_any_prog_key *key);
+
 void jay_compute_liveness(jay_function *f);
 void jay_calculate_register_demands(jay_function *f);
@@ -63,6 +73,7 @@ void jay_lower_post_ra(jay_shader *s);
 void jay_lower_spill(jay_function *func);
 void jay_lower_simd_width(jay_shader *s);
 void jay_lower_scoreboard(jay_shader *s);
+void jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes);
 struct jay_shader_bin *
 jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size);
diff --git a/src/intel/compiler/jay/jay_prog_data.c b/src/intel/compiler/jay/jay_prog_data.c
new file mode 100644
index 00000000000..bc56c13dae6
--- /dev/null
+++ b/src/intel/compiler/jay/jay_prog_data.c
@@ -0,0 +1,581 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "compiler/brw/brw_compiler.h"
+#include "compiler/brw/brw_nir.h"
+#include "compiler/intel_nir.h"
+#include "jay_private.h"
+#include "nir.h"
+
+static inline enum intel_barycentric_mode
+brw_barycentric_mode(const struct brw_fs_prog_key *key,
+ nir_intrinsic_instr *intr)
+{
+ const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr);
+
+ /* Barycentric modes don't make sense for flat inputs. */
+ assert(mode != INTERP_MODE_FLAT);
+
+ unsigned bary;
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_barycentric_pixel:
+ case nir_intrinsic_load_barycentric_at_offset:
+ /* When per sample interpolation is dynamic, assume sample interpolation.
+ * We'll dynamically remap things so that the FS payload is not affected.
+ */ + bary = key->persample_interp == INTEL_SOMETIMES ? + INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE : + INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + UNREACHABLE("invalid intrinsic"); + } + + if (mode == INTERP_MODE_NOPERSPECTIVE) + bary += 3; + + return (enum intel_barycentric_mode) bary; +} + +struct fs_info_ctx { + const struct brw_fs_prog_key *key; + struct brw_fs_prog_data *prog_data; + const struct intel_device_info *devinfo; +}; + +static bool +gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + struct fs_info_ctx *ctx = data; + struct brw_fs_prog_data *prog_data = ctx->prog_data; + + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + prog_data->barycentric_interp_modes |= + 1 << brw_barycentric_mode(ctx->key, intr); + break; + + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + unsigned mode = brw_barycentric_mode(ctx->key, intr); + prog_data->barycentric_interp_modes |= 1 << mode; + prog_data->uses_sample_offsets |= + mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE || + mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE; + + if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) + prog_data->uses_npc_bary_coefficients = true; + else + prog_data->uses_pc_bary_coefficients = true; + break; + } + + case nir_intrinsic_load_frag_coord_z: + prog_data->uses_src_depth = true; + break; + + case nir_intrinsic_load_frag_coord_w_rcp: + prog_data->uses_src_w = true; + break; + + case nir_intrinsic_load_sample_mask_in: + /* TODO: Sample masks are broken and discards are broken and simd32 + * layouts are broken too. XXX. + */ + // prog_data->uses_sample_mask = true; + break; + + case nir_intrinsic_load_pixel_coord_intel: + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + break; + + default: + break; + } + + return false; +} + +static void +brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_shader_in_variable(var, shader) { + if (var->data.interpolation != INTERP_MODE_FLAT || + var->data.per_primitive) + continue; + + unsigned slots = glsl_count_attribute_slots(var->type, false); + for (unsigned s = 0; s < slots; s++) { + int input_index = prog_data->urb_setup[var->data.location + s]; + + if (input_index >= 0) + prog_data->flat_inputs |= 1 << input_index; + } + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + /* We initially set this to OFF, but having the shader write the + * depth means we allocate register space in the SEND message. The + * difference between the SEND register count and the OFF state + * programming makes the HW hang. + * + * Removing the depth writes also leads to test failures. 
So use + * LesserThanOrEqual, which fits writing the same value + * (unchanged/equal). + * + */ + return BRW_PSCDEPTH_ON_LE; + } + } + return BRW_PSCDEPTH_OFF; +} + +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. + * Used to accelerate walking the active entries of the urb_setup array + * on each upload. + */ +static void +brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data) +{ + /* TODO(mesh): Review usage of this in the context of Mesh, we may want to + * skip per-primitive attributes here. + */ + + /* Make sure uint8_t is sufficient */ + static_assert(VARYING_SLOT_MAX <= 0xff); + uint8_t index = 0; + for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (fs_prog_data->urb_setup[attr] >= 0) { + fs_prog_data->urb_setup_attribs[index++] = attr; + } + } + fs_prog_data->urb_setup_attribs_count = index; +} + +static void +calculate_urb_setup(const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + nir_shader *nir, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); + int urb_next = 0; /* in vec4s */ + + /* Figure out where the PrimitiveID lives, either in the per-vertex block + * or in the per-primitive block or both. + */ + const uint64_t per_vert_primitive_id = + key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t per_prim_primitive_id = + key->mesh_input == INTEL_NEVER ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t inputs_read = + nir->info.inputs_read & + (~nir->info.per_primitive_inputs | per_vert_primitive_id); + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_inputs = + nir->info.inputs_read & + (nir->info.per_primitive_inputs | per_prim_primitive_id) & + ~per_primitive_header_bits; + struct intel_vue_map vue_map; + uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; + + if (mue_map != NULL) { + memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); + memcpy(per_primitive_offsets, mue_map->per_primitive_offsets, + sizeof(mue_map->per_primitive_offsets)); + + if (!mue_map->wa_18019110168_active) { + u_foreach_bit64(location, per_primitive_inputs) { + assert(per_primitive_offsets[location] != -1); + + first_read_offset = + MIN2(first_read_offset, + (uint32_t) per_primitive_offsets[location]); + per_primitive_stride = + MAX2((uint32_t) per_primitive_offsets[location] + 16, + per_primitive_stride); + } + } else { + first_read_offset = per_primitive_stride = 0; + } + } else { + brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout, + 1 /* pos_slots, TODO */); + brw_compute_per_primitive_map(per_primitive_offsets, + &per_primitive_stride, &first_read_offset, + 0, nir, nir_var_shader_in, + per_primitive_inputs, + true /* separate_shader */); + } + + if (per_primitive_stride > first_read_offset) { + first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); + + /* Remove the first few unused registers */ + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1) + continue; + per_primitive_offsets[i] -= first_read_offset; + } + + prog_data->num_per_primitive_inputs = + 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); + } else { + prog_data->num_per_primitive_inputs = 0; + } + + /* Now do the per-vertex 
stuff (what used to be legacy pipeline) */ + + /* If Mesh is involved, we cannot do any packing. Documentation doesn't say + * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when + * using Mesh. + */ + if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) { + /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do + * arbitrary rearrangement of the first 16 varying inputs, so we can put + * them wherever we want. Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with a + * different vertex (or geometry) shader. + */ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them in + * an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). + */ + int first_slot = 0; + for (int i = 0; i < vue_map.num_slots; i++) { + int varying = vue_map.slot_to_varying[i]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = ROUND_DOWN_TO(i, 2); + break; + } + } + + for (int slot = first_slot; slot < vue_map.num_slots; slot++) { + int varying = vue_map.slot_to_varying[slot]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = vue_map.num_slots - first_slot; + } + + prog_data->num_varying_inputs = urb_next; + prog_data->inputs = inputs_read; + prog_data->per_primitive_inputs = per_primitive_inputs; + + brw_compute_urb_setup_index(prog_data); +} + +static void +populate_fs_prog_data(nir_shader *shader, + const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + struct fs_info_ctx ctx = { + .key = key, + .prog_data = prog_data, + .devinfo = devinfo, + }; + nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx); + + prog_data->uses_kill = shader->info.fs.uses_discard; + prog_data->uses_omask = + !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->max_polygons = 1; + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->sample_shading = shader->info.fs.uses_sample_shading; + prog_data->api_sample_shading = key->api_sample_shading; + prog_data->min_sample_shading = key->min_sample_shading; + + assert(key->multisample_fbo != INTEL_NEVER || + key->persample_interp == INTEL_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = INTEL_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = + MIN2(prog_data->persample_dispatch, key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. 
+ */
+ prog_data->alpha_to_coverage = key->alpha_to_coverage;
+
+ assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER);
+ prog_data->mesh_input = key->mesh_input;
+
+ assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER);
+ prog_data->provoking_vertex_last = key->provoking_vertex_last;
+
+ /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select
+ * POSOFFSET_SAMPLE"
+ *
+ * So we can only really get sample positions if we are doing real
+ * per-sample dispatch. If we need gl_SamplePosition and we don't have
+ * persample dispatch, we hard-code it to 0.5.
+ */
+ prog_data->uses_pos_offset =
+ prog_data->persample_dispatch != INTEL_NEVER &&
+ (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
+ BITSET_TEST(shader->info.system_values_read,
+ SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
+
+ prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
+ prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
+ prog_data->inner_coverage = shader->info.fs.inner_coverage;
+
+ /* From the BDW PRM documentation for 3DSTATE_WM:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
+ * Sample or Non- perspective Sample barycentric coordinates."
+ *
+ * So clean up any potentially set sample barycentric mode when not in per
+ * sample dispatch.
+ */
+ if (prog_data->persample_dispatch == INTEL_NEVER) {
+ prog_data->barycentric_interp_modes &=
+ ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
+ }
+
+ if (devinfo->ver >= 20) {
+ prog_data->vertex_attributes_bypass =
+ brw_needs_vertex_attributes_bypass(shader);
+ }
+
+ prog_data->uses_nonperspective_interp_modes =
+ (prog_data->barycentric_interp_modes &
+ INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
+ prog_data->uses_npc_bary_coefficients;
+
+ /* The current VK_EXT_graphics_pipeline_library specification requires
+ * coarse to be specified at compile time. But per sample interpolation can
+ * be dynamic. So we should never be in a situation where coarse &
+ * persample_interp are both respectively true & INTEL_ALWAYS.
+ *
+ * Coarse will be dynamically turned off when persample_interp is active.
+ */
+ assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);
+
+ prog_data->coarse_pixel_dispatch =
+ intel_sometimes_invert(prog_data->persample_dispatch);
+ if (!key->coarse_pixel ||
+ /* DG2 should support this, but Wa_22012766191 says there are issues
+ * with CPS 1x1 + MSAA + FS writing to oMask.
+ */
+ (devinfo->verx10 < 200 &&
+ (prog_data->uses_omask || prog_data->uses_sample_mask)) ||
+ prog_data->sample_shading ||
+ (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
+ prog_data->computed_stencil ||
+ devinfo->ver < 11) {
+ prog_data->coarse_pixel_dispatch = INTEL_NEVER;
+ }
+
+ /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
+ * Message Descriptor :
+ *
+ * "Message Type. Specifies the type of message being sent when
+ * pixel-rate evaluation is requested :
+ *
+ * Format = U2
+ * 0: Per Message Offset (eval_snapped with immediate offset)
+ * 1: Sample Position Offset (eval_sindex)
+ * 2: Centroid Position Offset (eval_centroid)
+ * 3: Per Slot Offset (eval_snapped with register offset)
+ *
+ * Message Type. 
Specifies the type of message being sent when + * coarse-rate evaluation is requested : + * + * Format = U2 + * 0: Coarse to Pixel Mapping Message (internal message) + * 1: Reserved + * 2: Coarse Centroid Position (eval_centroid) + * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" + * + * The Sample Position Offset is marked as reserved for coarse rate + * evaluation and leads to hangs if we try to use it. So disable coarse + * pixel shading if we have any intrinsic that will result in a pixel + * interpolater message at sample. + */ + if (intel_nir_pulls_at_sample(shader)) + prog_data->coarse_pixel_dispatch = INTEL_NEVER; + + /* We choose to always enable VMask prior to XeHP, as it would cause + * us to lose out on the eliminate_find_live_channel() optimization. + */ + prog_data->uses_vmask = + devinfo->verx10 < 125 || + shader->info.fs.needs_coarse_quad_helper_invocations || + shader->info.uses_wide_subgroup_intrinsics || + prog_data->coarse_pixel_dispatch != INTEL_NEVER; + + prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients; + + if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) { + prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth; + prog_data->uses_src_depth = false; + } + + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map, + per_primitive_offsets); + brw_compute_flat_inputs(prog_data, shader); + + prog_data->has_side_effects = shader->info.writes_memory; +} + +static void +populate_vs_prog_data(nir_shader *nir, + const struct intel_device_info *devinfo, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + unsigned nr_packed_regs) +{ + unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read); + BITSET_WORD *sysvals = nir->info.system_values_read; + + /* gl_VertexID and gl_InstanceID are system values, but arrive via an + * incoming vertex attribute. So, add an extra slot. + */ + if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) || + BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) || + BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) || + BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) { + nr_attribute_slots++; + } + + /* gl_DrawID and IsIndexedDraw share its very own vec4 */ + if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) || + BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) { + nr_attribute_slots++; + } + + const struct { + bool *data; + gl_system_value val; + } bool_sysvals[] = { + { &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW }, + { &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX }, + { &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE }, + { &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE }, + { &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID }, + { &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID }, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) { + *bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val); + } + + unsigned nr_attribute_regs; + if (key->vf_component_packing) { + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8); + nr_attribute_regs = nr_packed_regs; + } else { + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); + nr_attribute_regs = 4 * nr_attribute_slots; + } + + /* Since vertex shaders reuse the same VUE entry for inputs and outputs + * (overwriting the original contents), we need to make sure the size is + * the larger of the two. 
+ */ + const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4), + prog_data->base.vue_map.num_slots); + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; +} + +void +jay_populate_prog_data(const struct intel_device_info *devinfo, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key, + unsigned nr_packed_regs) +{ + if (nir->info.stage == MESA_SHADER_VERTEX) { + populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs, + nr_packed_regs); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + int per_primitive_offsets[VARYING_SLOT_MAX]; + memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); + + populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs, + NULL /* TODO: mue_map */, per_primitive_offsets); + } else if (mesa_shader_stage_is_compute(nir->info.stage)) { + prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr; + prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr; + } + + if (nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_TESS_EVAL || + nir->info.stage == MESA_SHADER_GEOMETRY || + nir->info.stage == MESA_SHADER_MESH) { + + uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size); + uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size, + nir->info.cull_distance_array_size); + + if (nir->info.stage == MESA_SHADER_MESH) { + prog_data->mesh.clip_distance_mask = clip_mask; + prog_data->mesh.cull_distance_mask = cull_mask; + } else { + prog_data->vue.clip_distance_mask = clip_mask; + prog_data->vue.cull_distance_mask = cull_mask; + } + } +} diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build index e9c47ada78c..492d04c8bb2 100644 --- a/src/intel/compiler/jay/meson.build +++ b/src/intel/compiler/jay/meson.build @@ -50,16 +50,19 @@ libintel_compiler_jay_files = files( 'jay_assign_flags.c', 'jay_from_nir.c', 'jay_ir.h', + 'jay_insert_fp_mode.c', 'jay_liveness.c', 'jay_lower_post_ra.c', 'jay_lower_pre_ra.c', 'jay_lower_scoreboard.c', 'jay_lower_spill.c', + 'jay_nir.c', 'jay_opt_dead_code.c', 'jay_opt_control_flow.c', 'jay_opt_propagate.c', 'jay_print.c', 'jay_private.h', + 'jay_prog_data.c', 'jay_repair_ssa.c', 'jay_register_allocate.c', 'jay_simd_width.c',