diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 3fd5ece4d81..02e30a1620d 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -1,18 +1,12 @@ /* - * Copyright (C) 2020 Collabora Ltd. + * Copyright (C) 2020,2026 Collabora Ltd. * Copyright (C) 2022 Alyssa Rosenzweig * Copyright (C) 2025 Arm Ltd. * SPDX-License-Identifier: MIT */ -#include "compiler/glsl/glsl_to_nir.h" -#include "compiler/glsl_types.h" #include "compiler/nir/nir_builder.h" -#include "compiler/nir/nir_deref.h" #include "panfrost/compiler/pan_compiler.h" -#include "panfrost/compiler/pan_nir.h" -#include "util/perf/cpu_trace.h" -#include "util/u_debug.h" #include "util/u_qsort.h" #include "bifrost/bi_debug.h" @@ -976,78 +970,6 @@ bi_is_zs(unsigned location) return location == FRAG_RESULT_DEPTH || location == FRAG_RESULT_STENCIL; } -/* Atomics and memory write on the vertex stage have implementation-defined - * behaviors on how many invocations will happen. However for some reasons, - * atomic counters on GL/GLES specs are quite ambigous here and even have tests - * counting how many invocations have been made on VS.... This pass detects - * atomics that result in a direct store output of one specific IDVS stage - * and ensure it's only executed for said stage. - * - * This allows - * "dEQP-GLES31.functional.shaders.opaque_type_indexing.atomic_counter.*" to - * pass under ANGLE. - */ - -static bool -bifrost_nir_lower_vs_atomics_impl(nir_builder *b, nir_intrinsic_instr *intr, - UNUSED void *data) -{ - if (intr->intrinsic != nir_intrinsic_global_atomic) - return false; - - unsigned output_mask = 0; - nir_foreach_use(use, &intr->def) { - nir_instr *parent = nir_src_parent_instr(use); - if (parent->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *parent_intr = nir_instr_as_intrinsic(parent); - if (parent_intr->intrinsic != nir_intrinsic_store_output && - parent_intr->intrinsic != nir_intrinsic_store_per_view_output) - continue; - - nir_io_semantics sem = nir_intrinsic_io_semantics(parent_intr); - output_mask |= BITFIELD_BIT(va_shader_output_from_loc(sem.location)); - } - - /* In case they are not written to any outputs, we default to only output in - * the position stage */ - if (output_mask == 0) - output_mask |= VA_SHADER_OUTPUT_POSITION_BIT; - - /* In case they are not written to both IDVS stages, we just do not try - * lowering it */ - if (((output_mask & VA_SHADER_OUTPUT_VARY_BIT) && - (output_mask & (VA_SHADER_OUTPUT_POSITION_BIT | - VA_SHADER_OUTPUT_ATTRIB_BIT)))) - return false; - - /* In case we know we have only outputs to a certain type, we can make the - * atomic exclusive to this */ - b->cursor = nir_before_instr(&intr->instr); - nir_def *res = nir_undef(b, intr->def.num_components, intr->def.bit_size); - - nir_def *shader_output = nir_load_shader_output_pan(b); - nir_push_if(b, nir_i2b(b, nir_iand_imm(b, shader_output, output_mask))); - nir_instr *new_instr = nir_instr_clone(b->shader, &intr->instr); - nir_intrinsic_instr *new_intr = nir_instr_as_intrinsic(new_instr); - nir_builder_instr_insert(b, new_instr); - nir_pop_if(b, NULL); - - res = nir_if_phi(b, &new_intr->def, res); - nir_def_replace(&intr->def, res); - - return true; -} - -static bool -bifrost_nir_lower_vs_atomics(nir_shader *shader) -{ - assert(shader->info.stage == MESA_SHADER_VERTEX); - return nir_shader_intrinsics_pass(shader, bifrost_nir_lower_vs_atomics_impl, - nir_metadata_none, NULL); -} - static void bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr) { @@ -5252,445 +5174,6 @@ va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out, va_count_stats(ctx, nr_ins, size, &counts, out); } -/* - * Some operations are only available as 32-bit instructions. 64-bit floats are - * unsupported and ints are lowered with nir_lower_int64. Certain 8-bit and - * 16-bit instructions, however, are lowered here. - */ -static unsigned -bi_lower_bit_size(const nir_instr *instr, void *data) -{ - switch (instr->type) { - case nir_instr_type_alu: { - nir_alu_instr *alu = nir_instr_as_alu(instr); - unsigned gpu_id = *((unsigned *)data); - - switch (alu->op) { - case nir_op_fexp2: - case nir_op_flog2: - case nir_op_fpow: - case nir_op_fsin: - case nir_op_fcos: - case nir_op_bit_count: - case nir_op_bitfield_reverse: - return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32; - case nir_op_fround_even: - case nir_op_fceil: - case nir_op_ffloor: - case nir_op_ffract: - case nir_op_ftrunc: - case nir_op_frexp_sig: - case nir_op_frexp_exp: - /* On v11+, FROUND.v2s16 is gone */ - if (pan_arch(gpu_id) < 11) - return 0; - return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32; - case nir_op_iadd: - case nir_op_isub: - case nir_op_iadd_sat: - case nir_op_uadd_sat: - case nir_op_isub_sat: - case nir_op_usub_sat: - case nir_op_ineg: - case nir_op_iabs: - /* On v11+, IABS.v4s8, IADD.v4s8 and ISUB.v4s8 are gone */ - if (pan_arch(gpu_id) < 11) - return 0; - - return (nir_src_bit_size(alu->src[0].src) == 8) ? 16 : 0; - default: - return 0; - } - } - - case nir_instr_type_intrinsic: { - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - /* We only support ballot on 32-bit types. */ - switch (intr->intrinsic) { - case nir_intrinsic_ballot: - case nir_intrinsic_ballot_relaxed: - return (nir_src_bit_size(intr->src[0]) == 32) ? 0 : 32; - default: - return 0; - } - } - - default: - return 0; - } -} - -/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4, - * transcendentals are an exception. Also shifts because of lane size mismatch - * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need - * to be scalarized due to type size. */ - -static uint8_t -bi_vectorize_filter(const nir_instr *instr, const void *data) -{ - unsigned gpu_id = *((unsigned *)data); - - if (instr->type == nir_instr_type_phi) { - unsigned bit_size = nir_instr_as_phi(instr)->def.bit_size; - if (bit_size == 8) - return 4; - if (bit_size == 16) - return 2; - return 1; - } - - /* Do not vectorize all non-ALU instruction */ - if (instr->type != nir_instr_type_alu) - return 0; - - const nir_alu_instr *alu = nir_instr_as_alu(instr); - - switch (alu->op) { - case nir_op_ball_fequal2: - case nir_op_ball_fequal3: - case nir_op_ball_fequal4: - case nir_op_bany_fnequal2: - case nir_op_bany_fnequal3: - case nir_op_bany_fnequal4: - case nir_op_ball_iequal2: - case nir_op_ball_iequal3: - case nir_op_ball_iequal4: - case nir_op_bany_inequal2: - case nir_op_bany_inequal3: - case nir_op_bany_inequal4: return 1; - case nir_op_pack_uvec2_to_uint: - case nir_op_pack_uvec4_to_uint: - return 0; - case nir_op_frcp: - case nir_op_frsq: - case nir_op_ishl: - case nir_op_ishr: - case nir_op_ushr: - case nir_op_b2f16: - case nir_op_f2i16: - case nir_op_f2u16: - case nir_op_f2i8: - case nir_op_f2u8: - case nir_op_f2fmp: - case nir_op_extract_u8: - case nir_op_extract_i8: - case nir_op_extract_u16: - case nir_op_extract_i16: - case nir_op_insert_u16: - return 1; - /* On v11+, we lost all packed F16 conversions */ - case nir_op_f2f16: - case nir_op_f2f16_rtz: - case nir_op_f2f16_rtne: - case nir_op_u2f16: - case nir_op_i2f16: - case nir_op_frexp_sig: - case nir_op_frexp_exp: - if (pan_arch(gpu_id) >= 11) - return 1; - - break; - default: - break; - } - - const uint8_t bit_size = nir_alu_instr_is_comparison(alu) - ? nir_src_bit_size(alu->src[0].src) - : alu->def.bit_size; - if (bit_size == 1) - return 0; - else if (bit_size == 8) - switch (alu->op) { - case nir_op_imul: - case nir_op_i2i8: - case nir_op_u2u8: - return 4; - default: - return 2; - } - else if (bit_size == 16) - return 2; - else - return 1; -} - -/* Ensure we write exactly 4 components */ -static nir_def * -bifrost_nir_valid_channel(nir_builder *b, nir_def *in, unsigned channel, - unsigned first, unsigned mask) -{ - if (!(mask & BITFIELD_BIT(channel))) - channel = first; - - return nir_channel(b, in, channel); -} - -static nir_mem_access_size_align -mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes, - uint8_t bit_size, uint32_t align_mul, - uint32_t align_offset, bool offset_is_const, - enum gl_access_qualifier access, const void *cb_data) -{ - uint32_t align = nir_combined_align(align_mul, align_offset); - assert(util_is_power_of_two_nonzero(align)); - - /* No more than 16 bytes at a time. */ - bytes = MIN2(bytes, 16); - - /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's - * a multiple of 2, use 16-bit loads. Else use 8-bit loads. - * - * But if we're only aligned to 1 byte, use 8-bit loads. If we're only - * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to - * the size. - */ - if ((bytes & 1) || (align == 1)) - bit_size = 8; - else if ((bytes & 2) || (align == 2)) - bit_size = 16; - else if (bit_size >= 32) - bit_size = 32; - - unsigned num_comps = MIN2(bytes / (bit_size / 8), 4); - - /* Push constants require 32-bit loads. */ - if (intrin == nir_intrinsic_load_push_constant) { - if (align_mul >= 4) { - /* If align_mul is bigger than 4 we can use align_offset to find - * the exact number of words we need to read. - */ - num_comps = DIV_ROUND_UP((align_offset % 4) + bytes, 4); - } else { - /* If bytes is aligned on 32-bit, the access might still cross one - * word at the beginning, and one word at the end. If bytes is not - * aligned on 32-bit, the extra two words should cover for both the - * size and offset mis-alignment. - */ - num_comps = (bytes / 4) + 2; - } - - bit_size = MAX2(bit_size, 32); - align = 4; - } else { - align = bit_size / 8; - } - - return (nir_mem_access_size_align){ - .num_components = num_comps, - .bit_size = bit_size, - .align = align, - .shift = nir_mem_access_shift_method_scalar, - }; -} - -static bool -mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size, - unsigned num_components, int64_t hole_size, - nir_intrinsic_instr *low, nir_intrinsic_instr *high, - void *data) -{ - if (hole_size > 0) - return false; - - /* We have a hard limit of at most 4 components */ - if (num_components > 4) - return false; - - const unsigned bytes = num_components * (bit_size / 8); - const unsigned max_bytes = 128u / 8u; /* LOAD.i128 */ - - const unsigned combined_align = nir_combined_align(align_mul, align_offset); - return bytes <= combined_align && bytes <= max_bytes; -} - -static void -bi_optimize_loop_nir(nir_shader *nir, unsigned gpu_id, bool allow_copies) -{ - bool progress; - - do { - progress = false; - - NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp); - NIR_PASS(progress, nir, nir_shrink_vec_array_vars, nir_var_function_temp); - NIR_PASS(progress, nir, nir_opt_deref); - - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_lower_wrmasks); - - if (allow_copies) { - /* Only run this pass in the first call to bi_optimize_nir. Later - * calls assume that we've lowered away any copy_deref instructions - * and we don't want to introduce any more. - */ - NIR_PASS(progress, nir, nir_opt_find_array_copies); - } - - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_dead_write_vars); - NIR_PASS(progress, nir, nir_opt_combine_stores, nir_var_all); - - NIR_PASS(progress, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); - NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, &gpu_id); - NIR_PASS(progress, nir, nir_opt_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_cse); - - nir_opt_peephole_select_options peephole_select_options = { - .limit = 64, - .expensive_alu_ok = true, - }; - NIR_PASS(progress, nir, nir_opt_peephole_select, - &peephole_select_options); - NIR_PASS(progress, nir, nir_opt_idiv_const, 8); - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); - - NIR_PASS(progress, nir, nir_opt_dead_cf); - - bool loop_progress = false; - NIR_PASS(loop_progress, nir, nir_opt_loop); - progress |= loop_progress; - - if (loop_progress) { - /* If nir_opt_loop makes progress, then we need to clean things up - * if we want any hope of nir_opt_if or nir_opt_loop_unroll to make - * progress. - */ - NIR_PASS(progress, nir, nir_opt_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - } - - /* XXX: On Bifrost (G52), this cause a failure on - * "dEQP-VK.graphicsfuzz.spv-composite-phi" and is related to an unknown - * scheduling issue */ - if (pan_arch(gpu_id) >= 9) - NIR_PASS( - progress, nir, nir_opt_if, - nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis); - - NIR_PASS(progress, nir, nir_opt_phi_to_bool); - NIR_PASS(progress, nir, nir_opt_loop_unroll); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_undef); - } while (progress); - - NIR_PASS(_, nir, nir_lower_undef_to_zero); - - NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL); -} - -void -bifrost_optimize_nir(nir_shader *nir, unsigned gpu_id) -{ - bi_optimize_loop_nir(nir, gpu_id, true); -} - -static void -bi_optimize_nir(nir_shader *nir, unsigned gpu_id, nir_variable_mode robust_modes) -{ - NIR_PASS(_, nir, nir_opt_shrink_stores, true); - bi_optimize_loop_nir(nir, gpu_id, false); - - NIR_PASS(_, nir, nir_opt_shrink_vectors, false); - - nir_load_store_vectorize_options vectorize_opts = { - .modes = nir_var_mem_global | - nir_var_mem_shared | - nir_var_mem_ubo | - nir_var_shader_temp, - .callback = mem_vectorize_cb, - .robust_modes = robust_modes, - }; - - /* Only allow vectorization of SSBOs when no robustness2 is configured */ - if (!(robust_modes & nir_var_mem_ssbo)) - vectorize_opts.modes |= nir_var_mem_ssbo; - - NIR_PASS(_, nir, nir_opt_load_store_vectorize, &vectorize_opts); - - /* nir_lower_pack can generate split operations, execute algebraic again to - * handle them */ - NIR_PASS(_, nir, nir_opt_algebraic); - - /* TODO: Why is 64-bit getting rematerialized? - * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ - NIR_PASS(_, nir, nir_lower_int64); - - /* Algebraic can materialize instructions with a bit_size that we need to lower */ - NIR_PASS(_, nir, nir_lower_bit_size, bi_lower_bit_size, &gpu_id); - - /* We need to cleanup after each iteration of late algebraic - * optimizations, since otherwise NIR can produce weird edge cases - * (like fneg of a constant) which we don't handle */ - bool late_algebraic = true; - while (late_algebraic) { - late_algebraic = false; - NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); - NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); - NIR_PASS(_, nir, nir_opt_constant_folding); - NIR_PASS(_, nir, nir_opt_copy_prop); - NIR_PASS(_, nir, nir_opt_dce); - NIR_PASS(_, nir, nir_opt_cse); - } - - /* This opt currently helps on Bifrost but not Valhall */ - if (pan_arch(gpu_id) < 9) - NIR_PASS(_, nir, bifrost_nir_opt_boolean_bitwise); - - NIR_PASS(_, nir, pan_nir_lower_bool_to_bitsize); - NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); - NIR_PASS(_, nir, nir_opt_vectorize, bi_vectorize_filter, &gpu_id); - - /* Prepass to simplify instruction selection */ - bool late_algebraic_progress = true; - while (late_algebraic_progress) { - late_algebraic_progress = false; - NIR_PASS(late_algebraic_progress, nir, bifrost_nir_lower_algebraic_late, - pan_arch(gpu_id)); - late_algebraic |= late_algebraic_progress; - } - - while (late_algebraic) { - late_algebraic = false; - NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); - NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); - NIR_PASS(_, nir, nir_opt_constant_folding); - NIR_PASS(_, nir, nir_opt_copy_prop); - NIR_PASS(_, nir, nir_opt_dce); - NIR_PASS(_, nir, nir_opt_cse); - } - - NIR_PASS(_, nir, nir_lower_load_const_to_scalar); - NIR_PASS(_, nir, nir_opt_dce); - - /* Backend scheduler is purely local, so do some global optimizations - * to reduce register pressure. */ - nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo | - nir_move_load_input | nir_move_load_frag_coord | - nir_move_comparisons | nir_move_copies | - nir_move_load_ssbo; - - NIR_PASS(_, nir, nir_opt_sink, move_all); - NIR_PASS(_, nir, nir_opt_move, move_all); - - /* We might lower attribute, varying, and image indirects. Use the - * gathered info to skip the extra analysis in the happy path. */ - bool any_indirects = - nir->info.inputs_read_indirectly || nir->info.outputs_read_indirectly || - nir->info.outputs_written_indirectly || - nir->info.patch_inputs_read_indirectly || - nir->info.patch_outputs_read_indirectly || - nir->info.patch_outputs_written_indirectly || nir->info.images_used[0]; - - if (any_indirects) { - nir_divergence_analysis(nir); - NIR_PASS(_, nir, bi_lower_divergent_indirects, - pan_subgroup_size(pan_arch(gpu_id))); - } -} - static void bi_opt_post_ra(bi_context *ctx) { @@ -5760,496 +5243,6 @@ bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset) } } -/* - * Build a bit mask of varyings (by location) that are flatshaded. This - * information is needed by lower_mediump_io, as we don't yet support 16-bit - * flat varyings. - * - * Also varyings that are used as texture coordinates should be kept at fp32 so - * the texture instruction may be promoted to VAR_TEX. In general this is a good - * idea, as fp16 texture coordinates are not supported by the hardware and are - * usually inappropriate. (There are both relevant CTS bugs here, even.) - * - * TODO: If we compacted the varyings with some fixup code in the vertex shader, - * we could implement 16-bit flat varyings. Consider if this case matters. - * - * TODO: The texture coordinate handling could be less heavyhanded. - */ -static bool -bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data) -{ - uint64_t *mask = data; - - if (instr->type != nir_instr_type_tex) - return false; - - nir_tex_instr *tex = nir_instr_as_tex(instr); - - int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); - if (coord_idx < 0) - return false; - - nir_src src = tex->src[coord_idx].src; - nir_scalar x = nir_scalar_resolved(src.ssa, 0); - nir_scalar y = nir_scalar_resolved(src.ssa, 1); - - if (x.def != y.def) - return false; - - nir_instr *parent = nir_def_instr(x.def); - - if (parent->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); - - if (intr->intrinsic != nir_intrinsic_load_interpolated_input) - return false; - - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - *mask |= BITFIELD64_BIT(sem.location); - return false; -} - -static uint64_t -bi_fp32_varying_mask(nir_shader *nir) -{ - uint64_t mask = 0; - - assert(nir->info.stage == MESA_SHADER_FRAGMENT); - - nir_foreach_shader_in_variable(var, nir) { - if (var->data.interpolation == INTERP_MODE_FLAT) { - unsigned slots = glsl_count_attribute_slots(var->type, false); - mask |= BITFIELD64_RANGE(var->data.location, slots); - } - } - - nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, - &mask); - - return mask; -} - -static bool -bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr, - UNUSED void *data) -{ - if (intr->intrinsic != nir_intrinsic_load_output) - return false; - - unsigned loc = nir_intrinsic_io_semantics(intr).location; - assert(loc >= FRAG_RESULT_DATA0); - unsigned rt = loc - FRAG_RESULT_DATA0; - - b->cursor = nir_before_instr(&intr->instr); - - nir_def *conversion = nir_load_rt_conversion_pan( - b, .base = rt, .src_type = nir_intrinsic_dest_type(intr)); - - nir_def *lowered = nir_load_tile_pan( - b, intr->def.num_components, intr->def.bit_size, - pan_nir_tile_location_sample(b, loc, nir_imm_int(b, 0)), - pan_nir_tile_default_coverage(b), - conversion, .dest_type = nir_intrinsic_dest_type(intr), - .io_semantics = nir_intrinsic_io_semantics(intr)); - - nir_def_rewrite_uses(&intr->def, lowered); - return true; -} - -static bool -bi_lower_subgroups(nir_builder *b, nir_intrinsic_instr *intr, void *data) -{ - unsigned int gpu_id = *(unsigned int *)data; - unsigned int arch = pan_arch(gpu_id); - - b->cursor = nir_before_instr(&intr->instr); - - nir_def *val = NULL; - switch (intr->intrinsic) { - case nir_intrinsic_vote_any: - val = nir_ine_imm(b, nir_ballot(b, 1, 32, intr->src[0].ssa), 0); - break; - - case nir_intrinsic_vote_all: - val = nir_ieq_imm(b, nir_ballot(b, 1, 32, nir_inot(b, intr->src[0].ssa)), 0); - break; - - case nir_intrinsic_load_subgroup_id: { - nir_def *local_id = nir_load_local_invocation_id(b); - nir_def *local_size = nir_load_workgroup_size(b); - /* local_id.x + local_size.x * (local_id.y + local_size.y * local_id.z) */ - nir_def *flat_local_id = - nir_iadd(b, - nir_channel(b, local_id, 0), - nir_imul(b, - nir_channel(b, local_size, 0), - nir_iadd(b, - nir_channel(b, local_id, 1), - nir_imul(b, - nir_channel(b, local_size, 1), - nir_channel(b, local_id, 2))))); - /* - * nir_udiv_imm with a power of two divisor, which pan_subgroup_size is, - * will construct a right shift instead of an udiv. - */ - val = nir_udiv_imm(b, flat_local_id, pan_subgroup_size(arch)); - break; - } - - case nir_intrinsic_load_subgroup_size: - val = nir_imm_int(b, pan_subgroup_size(arch)); - break; - - case nir_intrinsic_load_num_subgroups: { - uint32_t subgroup_size = pan_subgroup_size(arch); - assert(!b->shader->info.workgroup_size_variable); - uint32_t workgroup_size = - b->shader->info.workgroup_size[0] * - b->shader->info.workgroup_size[1] * - b->shader->info.workgroup_size[2]; - uint32_t num_subgroups = DIV_ROUND_UP(workgroup_size, subgroup_size); - val = nir_imm_int(b, num_subgroups); - break; - } - - default: - return false; - } - - nir_def_rewrite_uses(&intr->def, val); - return true; -} - -static bool -bifrost_nir_lower_load_output(nir_shader *nir) -{ - assert(nir->info.stage == MESA_SHADER_FRAGMENT); - - return nir_shader_intrinsics_pass( - nir, bi_lower_load_output, - nir_metadata_control_flow, NULL); -} - -void -bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id) -{ - MESA_TRACE_FUNC(); - - /* The DISCARD instruction just flags the thread as discarded, but the - * actual termination only happens when all threads in the quad are - * discarded, or when an instruction with a .discard flow is - * encountered (Valhall) or when a clause with a .terminate_discarded_thread - * is reached (Bifrost). - * We could do without nir_lower_terminate_to_demote(), but this allows - * for extra dead-code elimination when code sections are detected as - * being unused after a termination is crossed. - */ - if (nir->info.stage == MESA_SHADER_FRAGMENT) - NIR_PASS(_, nir, nir_lower_terminate_to_demote); - - /* Ensure that halt are translated to returns and get ride of them */ - NIR_PASS(_, nir, nir_lower_halt_to_return); - NIR_PASS(_, nir, nir_lower_returns); - - /* Lower gl_Position pre-optimisation, but after lowering vars to ssa - * (so we don't accidentally duplicate the epilogue since mesa/st has - * messed with our I/O quite a bit already) */ - - NIR_PASS(_, nir, nir_lower_vars_to_ssa); - - if (nir->info.stage == MESA_SHADER_VERTEX) { - if (pan_arch(gpu_id) <= 7) - NIR_PASS(_, nir, pan_nir_lower_vertex_id); - } - - /* Get rid of any global vars before we lower to scratch. */ - NIR_PASS(_, nir, nir_lower_global_vars_to_local); - - /* Valhall introduces packed thread local storage, which improves cache - * locality of TLS access. However, access to packed TLS cannot - * straddle 16-byte boundaries. As such, when packed TLS is in use - * (currently unconditional for Valhall), we force vec4 alignment for - * scratch access. - */ - glsl_type_size_align_func vars_to_scratch_size_align_func = - (pan_arch(gpu_id) >= 9) ? glsl_get_vec4_size_align_bytes - : glsl_get_natural_size_align_bytes; - /* Lower large arrays to scratch and small arrays to bcsel */ - NIR_PASS(_, nir, nir_lower_scratch_to_var); - NIR_PASS(_, nir, nir_lower_vars_to_scratch, 256, - vars_to_scratch_size_align_func, vars_to_scratch_size_align_func); - NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees, - nir_var_function_temp, ~0); - - NIR_PASS(_, nir, nir_split_var_copies); - NIR_PASS(_, nir, nir_lower_var_copies); - NIR_PASS(_, nir, nir_lower_vars_to_ssa); - - bi_optimize_loop_nir(nir, gpu_id, true); - - /* Lower away all variables for smaller shaders */ - NIR_PASS(_, nir, nir_lower_vars_to_ssa); - NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL); -} - -void -bifrost_postprocess_nir(nir_shader *nir, unsigned gpu_id) -{ - MESA_TRACE_FUNC(); - - /* We assume that UBO and SSBO were lowered, let's move things around. */ - nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo | - nir_move_comparisons | nir_move_copies | - nir_move_load_ssbo; - - NIR_PASS(_, nir, nir_opt_sink, move_all); - NIR_PASS(_, nir, nir_opt_move, move_all); - - bifrost_lower_texture_nir(nir, gpu_id); - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS(_, nir, pan_nir_lower_noperspective_fs); - - NIR_PASS(_, nir, nir_lower_mediump_io, - nir_var_shader_in | nir_var_shader_out, - ~bi_fp32_varying_mask(nir), false); - - NIR_PASS(_, nir, bifrost_nir_lower_load_output); - } else if (nir->info.stage == MESA_SHADER_VERTEX) { - NIR_PASS(_, nir, nir_lower_viewport_transform); - NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0); - NIR_PASS(_, nir, pan_nir_lower_noperspective_vs); - } - - nir_lower_mem_access_bit_sizes_options mem_size_options = { - .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo | - nir_var_mem_constant | nir_var_mem_task_payload | - nir_var_shader_temp | nir_var_function_temp | - nir_var_mem_global | nir_var_mem_shared, - .callback = mem_access_size_align_cb, - }; - NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &mem_size_options); - - nir_lower_ssbo_options ssbo_opts = { - .native_loads = pan_arch(gpu_id) >= 9, - .native_offset = pan_arch(gpu_id) >= 9, - }; - NIR_PASS(_, nir, nir_lower_ssbo, &ssbo_opts); - - /* - * Lower subgroups ops before lowering int64: nir_lower_int64 doesn't know - * how to lower imul reductions and scans. - * - * TODO: we can implement certain operations (notably reductions, scans, - * certain shuffles, etc) more efficiently than nir_lower_subgroups. Moreover - * we can implement reductions and scans on f16vec2 values without splitting - * to scalar first. - */ - bool lower_subgroups_progress = false; - NIR_PASS(lower_subgroups_progress, nir, nir_lower_subgroups, - &(nir_lower_subgroups_options) { - .subgroup_size = pan_subgroup_size(pan_arch(gpu_id)), - .ballot_bit_size = 32, - .ballot_components = 1, - .lower_to_scalar = true, - .lower_vote_feq = true, - .lower_vote_ieq = true, - .lower_vote_bool_eq = true, - .lower_first_invocation_to_ballot = true, - .lower_read_first_invocation = true, - .lower_subgroup_masks = true, - .lower_relative_shuffle = true, - .lower_shuffle = true, - .lower_quad = true, - .lower_quad_broadcast_dynamic = true, - .lower_quad_vote = true, - .lower_elect = true, - .lower_rotate_to_shuffle = true, - .lower_rotate_clustered_to_shuffle = true, - .lower_inverse_ballot = true, - .lower_reduce = true, - .lower_boolean_reduce = true, - .lower_boolean_shuffle = true, - }); - /* nir_lower_subgroups creates new vars, clean them up. */ - if (lower_subgroups_progress) - NIR_PASS(_, nir, nir_lower_vars_to_ssa); - - NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_subgroups, - nir_metadata_control_flow, &gpu_id); - - NIR_PASS(_, nir, nir_lower_64bit_phis); - NIR_PASS(_, nir, nir_lower_int64); - NIR_PASS(_, nir, nir_lower_bit_size, bi_lower_bit_size, &gpu_id); - - NIR_PASS(_, nir, nir_opt_idiv_const, 8); - NIR_PASS(_, nir, nir_lower_idiv, - &(nir_lower_idiv_options){.allow_fp16 = true}); - - NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); - NIR_PASS(_, nir, nir_lower_load_const_to_scalar); - NIR_PASS(_, nir, nir_lower_phis_to_scalar, bi_vectorize_filter, &gpu_id); - NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */); - NIR_PASS(_, nir, nir_lower_var_copies); - NIR_PASS(_, nir, nir_lower_alu); - NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord); - NIR_PASS(_, nir, pan_nir_lower_var_special_pan); -} - -void bifrost_lower_texture_nir(nir_shader *nir, unsigned gpu_id) -{ - NIR_PASS(_, nir, nir_lower_image_atomics_to_global, NULL, NULL); - - /* on Bifrost, lower MSAA load/stores to 3D load/stores */ - if (pan_arch(gpu_id) < 9) - NIR_PASS(_, nir, pan_nir_lower_image_ms); - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS(_, nir, nir_lower_is_helper_invocation); - NIR_PASS(_, nir, pan_nir_lower_helper_invocation); - NIR_PASS(_, nir, pan_nir_lower_sample_pos); - } -} - -static bool -lower_texel_buffer_fetch(nir_builder *b, nir_tex_instr *tex, void *data) -{ - if (tex->op != nir_texop_txf || tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) - return false; - - unsigned *arch = data; - b->cursor = nir_before_instr(&tex->instr); - - nir_def *res_handle = nir_imm_int(b, tex->texture_index); - nir_def *buf_index = NULL; - for (unsigned i = 0; i < tex->num_srcs; ++i) { - switch (tex->src[i].src_type) { - case nir_tex_src_coord: - buf_index = tex->src[i].src.ssa; - break; - case nir_tex_src_texture_offset: - /* This should always be 0 as lower_index_to_offset is expected to be - * set */ - assert(tex->texture_index == 0); - res_handle = tex->src[i].src.ssa; - break; - default: - continue; - } - } - - nir_def *loaded_texel_addr = - nir_load_texel_buf_index_address_pan(b, res_handle, buf_index); - nir_def *texel_addr = - nir_pack_64_2x32(b, nir_channels(b, loaded_texel_addr, BITFIELD_MASK(2))); - - nir_def *loaded_mem; - if (*arch >= 9) { - nir_def *icd = nir_load_texel_buf_conv_pan(b, res_handle); - loaded_mem = nir_load_global_cvt_pan(b, tex->def.num_components, - tex->def.bit_size, texel_addr, - icd, tex->dest_type); - } else { - nir_def *icd = nir_channel(b, loaded_texel_addr, 2); - loaded_mem = nir_load_global_cvt_pan(b, tex->def.num_components, - tex->def.bit_size, texel_addr, - icd, tex->dest_type); - } - nir_def_replace(&tex->def, loaded_mem); - return true; -} - -static bool -pan_nir_lower_texel_buffer_fetch(nir_shader *shader, unsigned arch) -{ - return nir_shader_tex_pass(shader, lower_texel_buffer_fetch, - nir_metadata_control_flow, &arch); -} - -static bool -lower_buf_image_access(nir_builder *b, nir_intrinsic_instr *intr, void *data) -{ - switch (intr->intrinsic) { - case nir_intrinsic_image_texel_address: - case nir_intrinsic_image_load: - case nir_intrinsic_image_store: - break; - default: - return false; - } - enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr); - if (dim != GLSL_SAMPLER_DIM_BUF) - return false; - - unsigned *arch = data; - b->cursor = nir_before_instr(&intr->instr); - - nir_def *res_handle = intr->src[0].ssa; - nir_def *buf_index = nir_channel(b, intr->src[1].ssa, 0); - nir_def *loaded_texel_addr = - nir_load_texel_buf_index_address_pan(b, res_handle, buf_index); - nir_def *texel_addr = - nir_pack_64_2x32(b, nir_channels(b, loaded_texel_addr, BITFIELD_MASK(2))); - - switch (intr->intrinsic) { - case nir_intrinsic_image_texel_address: - nir_def_replace(&intr->def, texel_addr); - break; - case nir_intrinsic_image_load: { - nir_def *icd; - if (*arch >= 9) - icd = nir_load_texel_buf_conv_pan(b, res_handle); - else - icd = nir_channel(b, loaded_texel_addr, 2); - nir_def *loaded_mem = nir_load_global_cvt_pan( - b, intr->def.num_components, intr->def.bit_size, texel_addr, icd, - .dest_type = nir_intrinsic_dest_type(intr)); - nir_def_replace(&intr->def, loaded_mem); - break; - } - case nir_intrinsic_image_store: { - /* Due to SPIR-V limitations, the source type is not fully reliable: it - * reports uint32 even for write_imagei. This causes an incorrect - * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32 - * instead, which will match per the OpenCL spec. Of course this does - * not work for 16-bit stores, but those are not available in OpenCL. - */ - ASSERTED nir_alu_type T = nir_intrinsic_src_type(intr); - assert(nir_alu_type_get_type_size(T) == 32); - - nir_def *value = intr->src[3].ssa; - nir_def *icd; - if (*arch >= 9) - icd = nir_load_texel_buf_conv_pan(b, res_handle); - else - icd = nir_channel(b, loaded_texel_addr, 2); - nir_store_global_cvt_pan(b, value, texel_addr, icd, .src_type = 32); - nir_instr_remove(&intr->instr); - break; - } - default: - UNREACHABLE("Unexpected intrinsic"); - } - - return true; -} - -static bool -pan_nir_lower_buf_image_access(nir_shader *shader, unsigned arch) -{ - return nir_shader_intrinsics_pass(shader, lower_buf_image_access, - nir_metadata_control_flow, &arch); -} - -void -bifrost_lower_texture_late_nir(nir_shader *nir, unsigned gpu_id) -{ - NIR_PASS(_, nir, pan_nir_lower_texel_buffer_fetch, pan_arch(gpu_id)); - NIR_PASS(_, nir, pan_nir_lower_buf_image_access, pan_arch(gpu_id)); -} - static int compare_u32(const void* a, const void* b, void* _) { @@ -6573,7 +5566,7 @@ bi_compile_variant_nir(nir_shader *nir, return ctx; } -static void +void bi_compile_variant(nir_shader *nir, const struct pan_compile_inputs *inputs, struct util_dynarray *binary, struct pan_shader_info *info, @@ -6672,142 +5665,6 @@ bi_compile_variant(nir_shader *nir, ralloc_free(ctx); } -/* Decide if Index-Driven Vertex Shading should be used for a given shader */ -static bool -bi_should_idvs(nir_shader *nir, const struct pan_compile_inputs *inputs) -{ - /* Opt-out */ - if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS) - return false; - - /* IDVS splits up vertex shaders, not defined on other shader stages */ - if (nir->info.stage != MESA_SHADER_VERTEX) - return false; - - /* Bifrost cannot write gl_PointSize during IDVS */ - if ((pan_arch(inputs->gpu_id) < 9) && - nir->info.outputs_written & VARYING_BIT_PSIZ) - return false; - - /* Otherwise, IDVS is usually better */ - return true; -} - -void -bifrost_compile_shader_nir(nir_shader *nir, - const struct pan_compile_inputs *inputs, - struct util_dynarray *binary, - struct pan_shader_info *info) -{ - MESA_TRACE_FUNC(); - - bifrost_debug = debug_get_option_bifrost_debug(); - - /* The varying layout (if any) may have different bit sizes for some - * varyings than we have in the shader. For descriptors, this isn't a - * problem as it's handled by the descriptor layout. However, for direct - * loads and stores on Valhall+, we need the right bit sizes in the shader. - * We could do this in the back-end as we emit but it's easier for now to - * lower in NIR. This also handles the case where we do a load from the - * fragment shader of something that isn't written by the vertex shader. - * In that case, we just return zero. - */ - if (pan_arch(inputs->gpu_id) >= 9 && inputs->varying_layout) { - NIR_PASS(_, nir, pan_nir_resize_varying_io, inputs->varying_layout); - - /* pan_nir_resize_varying_io may generate vector conversions which we - * need to clean up so the back-end doesn't see them. - */ - unsigned gpu_id = inputs->gpu_id; - NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); - NIR_PASS(_, nir, nir_lower_load_const_to_scalar); - NIR_PASS(_, nir, nir_opt_copy_prop); - NIR_PASS(_, nir, nir_opt_dce); - } - - if (nir->info.stage == MESA_SHADER_VERTEX) { - info->vs.idvs = bi_should_idvs(nir, inputs); - - if (info->vs.idvs && nir->info.writes_memory) - NIR_PASS(_, nir, bifrost_nir_lower_vs_atomics); - - NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id, - inputs->varying_layout, info->vs.idvs, - &info->vs.needs_extended_fifo); - } - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - /* Blit shaders may not need to run ATEST, since ATEST is not needed if - * early-z is forced, alpha-to-coverage is disabled, and there are no - * writes to the coverage mask. The latter two are satisfied for all - * blit shaders, so we just care about early-z, which blit shaders force - * iff they do not write depth or stencil - */ - const bool emit_zs = - nir->info.outputs_written & (BITFIELD_BIT(FRAG_RESULT_DEPTH) | - BITFIELD_BIT(FRAG_RESULT_STENCIL)); - const bool skip_atest = inputs->is_blit && !emit_zs; - NIR_PASS(_, nir, pan_nir_lower_fs_outputs, skip_atest); - } - - bi_optimize_nir(nir, inputs->gpu_id, inputs->robust_modes); - - { - bool scalar_phis_pass = false; - unsigned gpu_id = inputs->gpu_id; - NIR_PASS(scalar_phis_pass, nir, nir_lower_phis_to_scalar, - bi_vectorize_filter, &gpu_id); - if (scalar_phis_pass) { - NIR_PASS(_, nir, nir_opt_copy_prop); - NIR_PASS(_, nir, nir_opt_dce); - } - } - - info->tls_size = nir->scratch_size; - info->stage = nir->info.stage; - - if (nir->info.stage == MESA_SHADER_VERTEX) { - assert(inputs->varying_layout); - memcpy(&info->varyings.formats, inputs->varying_layout, - sizeof(*inputs->varying_layout)); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - pan_varying_collect_formats(&info->varyings.formats, - nir, inputs->gpu_id, - inputs->trust_varying_flat_highp_types, false); - info->varyings.noperspective = - pan_nir_collect_noperspective_varyings_fs(nir); - - if (!inputs->is_blend) - NIR_PASS(_, nir, pan_nir_lower_fs_inputs, inputs->gpu_id, - inputs->varying_layout, info); - } - - if (nir->info.stage == MESA_SHADER_VERTEX && info->vs.idvs) { - /* On 5th Gen, IDVS is only in one binary */ - if (pan_arch(inputs->gpu_id) >= 12) - bi_compile_variant(nir, inputs, binary, info, BI_IDVS_ALL); - else { - bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION); - bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING); - } - } else { - bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE); - } - - if (mesa_shader_stage_is_compute(nir->info.stage)) { - /* Workgroups may be merged if the structure of the workgroup is - * not software visible. This is true if neither shared memory - * nor barriers are used. The hardware may be able to optimize - * compute shaders that set this flag. - */ - info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) && - !nir->info.uses_control_barrier && - !nir->info.uses_memory_barrier; - } - - info->ubo_mask &= (1 << nir->info.num_ubos) - 1; -} - static void find_all_predecessors(const bi_context *ctx, bi_block *b, BITSET_WORD *out) { diff --git a/src/panfrost/compiler/bifrost/bifrost_nir.c b/src/panfrost/compiler/bifrost/bifrost_nir.c new file mode 100644 index 00000000000..43848659315 --- /dev/null +++ b/src/panfrost/compiler/bifrost/bifrost_nir.c @@ -0,0 +1,1145 @@ +/* + * Copyright (C) 2020,2026 Collabora Ltd. + * Copyright (C) 2022 Alyssa Rosenzweig + * Copyright (C) 2025 Arm Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "compiler/glsl_types.h" +#include "compiler/nir/nir_builder.h" +#include "panfrost/compiler/pan_compiler.h" +#include "panfrost/compiler/pan_nir.h" +#include "util/perf/cpu_trace.h" + +#include "panfrost/model/pan_model.h" +#include "valhall/valhall.h" +#include "bi_debug.h" +#include "bifrost_compile.h" +#include "bifrost_nir.h" +#include "compiler.h" + +/* + * Some operations are only available as 32-bit instructions. 64-bit floats are + * unsupported and ints are lowered with nir_lower_int64. Certain 8-bit and + * 16-bit instructions, however, are lowered here. + */ +static unsigned +bi_lower_bit_size(const nir_instr *instr, void *data) +{ + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + unsigned gpu_id = *((unsigned *)data); + + switch (alu->op) { + case nir_op_fexp2: + case nir_op_flog2: + case nir_op_fpow: + case nir_op_fsin: + case nir_op_fcos: + case nir_op_bit_count: + case nir_op_bitfield_reverse: + return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32; + case nir_op_fround_even: + case nir_op_fceil: + case nir_op_ffloor: + case nir_op_ffract: + case nir_op_ftrunc: + case nir_op_frexp_sig: + case nir_op_frexp_exp: + /* On v11+, FROUND.v2s16 is gone */ + if (pan_arch(gpu_id) < 11) + return 0; + return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32; + case nir_op_iadd: + case nir_op_isub: + case nir_op_iadd_sat: + case nir_op_uadd_sat: + case nir_op_isub_sat: + case nir_op_usub_sat: + case nir_op_ineg: + case nir_op_iabs: + /* On v11+, IABS.v4s8, IADD.v4s8 and ISUB.v4s8 are gone */ + if (pan_arch(gpu_id) < 11) + return 0; + + return (nir_src_bit_size(alu->src[0].src) == 8) ? 16 : 0; + default: + return 0; + } + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + /* We only support ballot on 32-bit types. */ + switch (intr->intrinsic) { + case nir_intrinsic_ballot: + case nir_intrinsic_ballot_relaxed: + return (nir_src_bit_size(intr->src[0]) == 32) ? 0 : 32; + default: + return 0; + } + } + + default: + return 0; + } +} + +/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4, + * transcendentals are an exception. Also shifts because of lane size mismatch + * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need + * to be scalarized due to type size. */ + +static uint8_t +bi_vectorize_filter(const nir_instr *instr, const void *data) +{ + unsigned gpu_id = *((unsigned *)data); + + if (instr->type == nir_instr_type_phi) { + unsigned bit_size = nir_instr_as_phi(instr)->def.bit_size; + if (bit_size == 8) + return 4; + if (bit_size == 16) + return 2; + return 1; + } + + /* Do not vectorize all non-ALU instruction */ + if (instr->type != nir_instr_type_alu) + return 0; + + const nir_alu_instr *alu = nir_instr_as_alu(instr); + + switch (alu->op) { + case nir_op_ball_fequal2: + case nir_op_ball_fequal3: + case nir_op_ball_fequal4: + case nir_op_bany_fnequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_fnequal4: + case nir_op_ball_iequal2: + case nir_op_ball_iequal3: + case nir_op_ball_iequal4: + case nir_op_bany_inequal2: + case nir_op_bany_inequal3: + case nir_op_bany_inequal4: return 1; + case nir_op_pack_uvec2_to_uint: + case nir_op_pack_uvec4_to_uint: + return 0; + case nir_op_frcp: + case nir_op_frsq: + case nir_op_ishl: + case nir_op_ishr: + case nir_op_ushr: + case nir_op_b2f16: + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_f2i8: + case nir_op_f2u8: + case nir_op_f2fmp: + case nir_op_extract_u8: + case nir_op_extract_i8: + case nir_op_extract_u16: + case nir_op_extract_i16: + case nir_op_insert_u16: + return 1; + /* On v11+, we lost all packed F16 conversions */ + case nir_op_f2f16: + case nir_op_f2f16_rtz: + case nir_op_f2f16_rtne: + case nir_op_u2f16: + case nir_op_i2f16: + case nir_op_frexp_sig: + case nir_op_frexp_exp: + if (pan_arch(gpu_id) >= 11) + return 1; + + break; + default: + break; + } + + const uint8_t bit_size = nir_alu_instr_is_comparison(alu) + ? nir_src_bit_size(alu->src[0].src) + : alu->def.bit_size; + if (bit_size == 1) + return 0; + else if (bit_size == 8) + switch (alu->op) { + case nir_op_imul: + case nir_op_i2i8: + case nir_op_u2u8: + return 4; + default: + return 2; + } + else if (bit_size == 16) + return 2; + else + return 1; +} + +static bool +mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size, + unsigned num_components, int64_t hole_size, + nir_intrinsic_instr *low, nir_intrinsic_instr *high, + void *data) +{ + if (hole_size > 0) + return false; + + /* We have a hard limit of at most 4 components */ + if (num_components > 4) + return false; + + const unsigned bytes = num_components * (bit_size / 8); + const unsigned max_bytes = 128u / 8u; /* LOAD.i128 */ + + const unsigned combined_align = nir_combined_align(align_mul, align_offset); + return bytes <= combined_align && bytes <= max_bytes; +} + +static void +bi_optimize_loop_nir(nir_shader *nir, unsigned gpu_id, bool allow_copies) +{ + bool progress; + + do { + progress = false; + + NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, nir, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS(progress, nir, nir_opt_deref); + + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_lower_wrmasks); + + if (allow_copies) { + /* Only run this pass in the first call to bi_optimize_nir. Later + * calls assume that we've lowered away any copy_deref instructions + * and we don't want to introduce any more. + */ + NIR_PASS(progress, nir, nir_opt_find_array_copies); + } + + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_dead_write_vars); + NIR_PASS(progress, nir, nir_opt_combine_stores, nir_var_all); + + NIR_PASS(progress, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); + NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, &gpu_id); + NIR_PASS(progress, nir, nir_opt_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_cse); + + nir_opt_peephole_select_options peephole_select_options = { + .limit = 64, + .expensive_alu_ok = true, + }; + NIR_PASS(progress, nir, nir_opt_peephole_select, + &peephole_select_options); + NIR_PASS(progress, nir, nir_opt_idiv_const, 8); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + + NIR_PASS(progress, nir, nir_opt_dead_cf); + + bool loop_progress = false; + NIR_PASS(loop_progress, nir, nir_opt_loop); + progress |= loop_progress; + + if (loop_progress) { + /* If nir_opt_loop makes progress, then we need to clean things up + * if we want any hope of nir_opt_if or nir_opt_loop_unroll to make + * progress. + */ + NIR_PASS(progress, nir, nir_opt_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + } + + /* XXX: On Bifrost (G52), this cause a failure on + * "dEQP-VK.graphicsfuzz.spv-composite-phi" and is related to an unknown + * scheduling issue */ + if (pan_arch(gpu_id) >= 9) + NIR_PASS( + progress, nir, nir_opt_if, + nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis); + + NIR_PASS(progress, nir, nir_opt_phi_to_bool); + NIR_PASS(progress, nir, nir_opt_loop_unroll); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_undef); + } while (progress); + + NIR_PASS(_, nir, nir_lower_undef_to_zero); + + NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL); +} + +void +bifrost_optimize_nir(nir_shader *nir, unsigned gpu_id) +{ + bi_optimize_loop_nir(nir, gpu_id, true); +} + +static void +bi_optimize_nir(nir_shader *nir, unsigned gpu_id, nir_variable_mode robust_modes) +{ + NIR_PASS(_, nir, nir_opt_shrink_stores, true); + bi_optimize_loop_nir(nir, gpu_id, false); + + NIR_PASS(_, nir, nir_opt_shrink_vectors, false); + + nir_load_store_vectorize_options vectorize_opts = { + .modes = nir_var_mem_global | + nir_var_mem_shared | + nir_var_mem_ubo | + nir_var_shader_temp, + .callback = mem_vectorize_cb, + .robust_modes = robust_modes, + }; + + /* Only allow vectorization of SSBOs when no robustness2 is configured */ + if (!(robust_modes & nir_var_mem_ssbo)) + vectorize_opts.modes |= nir_var_mem_ssbo; + + NIR_PASS(_, nir, nir_opt_load_store_vectorize, &vectorize_opts); + + /* nir_lower_pack can generate split operations, execute algebraic again to + * handle them */ + NIR_PASS(_, nir, nir_opt_algebraic); + + /* TODO: Why is 64-bit getting rematerialized? + * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ + NIR_PASS(_, nir, nir_lower_int64); + + /* Algebraic can materialize instructions with a bit_size that we need to lower */ + NIR_PASS(_, nir, nir_lower_bit_size, bi_lower_bit_size, &gpu_id); + + /* We need to cleanup after each iteration of late algebraic + * optimizations, since otherwise NIR can produce weird edge cases + * (like fneg of a constant) which we don't handle */ + bool late_algebraic = true; + while (late_algebraic) { + late_algebraic = false; + NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); + NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, nir_opt_copy_prop); + NIR_PASS(_, nir, nir_opt_dce); + NIR_PASS(_, nir, nir_opt_cse); + } + + /* This opt currently helps on Bifrost but not Valhall */ + if (pan_arch(gpu_id) < 9) + NIR_PASS(_, nir, bifrost_nir_opt_boolean_bitwise); + + NIR_PASS(_, nir, pan_nir_lower_bool_to_bitsize); + NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); + NIR_PASS(_, nir, nir_opt_vectorize, bi_vectorize_filter, &gpu_id); + + /* Prepass to simplify instruction selection */ + bool late_algebraic_progress = true; + while (late_algebraic_progress) { + late_algebraic_progress = false; + NIR_PASS(late_algebraic_progress, nir, bifrost_nir_lower_algebraic_late, + pan_arch(gpu_id)); + late_algebraic |= late_algebraic_progress; + } + + while (late_algebraic) { + late_algebraic = false; + NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); + NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, nir_opt_copy_prop); + NIR_PASS(_, nir, nir_opt_dce); + NIR_PASS(_, nir, nir_opt_cse); + } + + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_opt_dce); + + /* Backend scheduler is purely local, so do some global optimizations + * to reduce register pressure. */ + nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo | + nir_move_load_input | nir_move_load_frag_coord | + nir_move_comparisons | nir_move_copies | + nir_move_load_ssbo; + + NIR_PASS(_, nir, nir_opt_sink, move_all); + NIR_PASS(_, nir, nir_opt_move, move_all); + + /* We might lower attribute, varying, and image indirects. Use the + * gathered info to skip the extra analysis in the happy path. */ + bool any_indirects = + nir->info.inputs_read_indirectly || nir->info.outputs_read_indirectly || + nir->info.outputs_written_indirectly || + nir->info.patch_inputs_read_indirectly || + nir->info.patch_outputs_read_indirectly || + nir->info.patch_outputs_written_indirectly || nir->info.images_used[0]; + + if (any_indirects) { + nir_divergence_analysis(nir); + NIR_PASS(_, nir, bi_lower_divergent_indirects, + pan_subgroup_size(pan_arch(gpu_id))); + } +} + +void +bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id) +{ + MESA_TRACE_FUNC(); + + /* The DISCARD instruction just flags the thread as discarded, but the + * actual termination only happens when all threads in the quad are + * discarded, or when an instruction with a .discard flow is + * encountered (Valhall) or when a clause with a .terminate_discarded_thread + * is reached (Bifrost). + * We could do without nir_lower_terminate_to_demote(), but this allows + * for extra dead-code elimination when code sections are detected as + * being unused after a termination is crossed. + */ + if (nir->info.stage == MESA_SHADER_FRAGMENT) + NIR_PASS(_, nir, nir_lower_terminate_to_demote); + + /* Ensure that halt are translated to returns and get ride of them */ + NIR_PASS(_, nir, nir_lower_halt_to_return); + NIR_PASS(_, nir, nir_lower_returns); + + /* Lower gl_Position pre-optimisation, but after lowering vars to ssa + * (so we don't accidentally duplicate the epilogue since mesa/st has + * messed with our I/O quite a bit already) */ + + NIR_PASS(_, nir, nir_lower_vars_to_ssa); + + if (nir->info.stage == MESA_SHADER_VERTEX) { + if (pan_arch(gpu_id) <= 7) + NIR_PASS(_, nir, pan_nir_lower_vertex_id); + } + + /* Get rid of any global vars before we lower to scratch. */ + NIR_PASS(_, nir, nir_lower_global_vars_to_local); + + /* Valhall introduces packed thread local storage, which improves cache + * locality of TLS access. However, access to packed TLS cannot + * straddle 16-byte boundaries. As such, when packed TLS is in use + * (currently unconditional for Valhall), we force vec4 alignment for + * scratch access. + */ + glsl_type_size_align_func vars_to_scratch_size_align_func = + (pan_arch(gpu_id) >= 9) ? glsl_get_vec4_size_align_bytes + : glsl_get_natural_size_align_bytes; + /* Lower large arrays to scratch and small arrays to bcsel */ + NIR_PASS(_, nir, nir_lower_scratch_to_var); + NIR_PASS(_, nir, nir_lower_vars_to_scratch, 256, + vars_to_scratch_size_align_func, vars_to_scratch_size_align_func); + NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees, + nir_var_function_temp, ~0); + + NIR_PASS(_, nir, nir_split_var_copies); + NIR_PASS(_, nir, nir_lower_var_copies); + NIR_PASS(_, nir, nir_lower_vars_to_ssa); + + bi_optimize_loop_nir(nir, gpu_id, true); + + /* Lower away all variables for smaller shaders */ + NIR_PASS(_, nir, nir_lower_vars_to_ssa); + NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL); +} + +/* + * Build a bit mask of varyings (by location) that are flatshaded. This + * information is needed by lower_mediump_io, as we don't yet support 16-bit + * flat varyings. + * + * Also varyings that are used as texture coordinates should be kept at fp32 so + * the texture instruction may be promoted to VAR_TEX. In general this is a good + * idea, as fp16 texture coordinates are not supported by the hardware and are + * usually inappropriate. (There are both relevant CTS bugs here, even.) + * + * TODO: If we compacted the varyings with some fixup code in the vertex shader, + * we could implement 16-bit flat varyings. Consider if this case matters. + * + * TODO: The texture coordinate handling could be less heavyhanded. + */ +static bool +bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data) +{ + uint64_t *mask = data; + + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + + int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); + if (coord_idx < 0) + return false; + + nir_src src = tex->src[coord_idx].src; + nir_scalar x = nir_scalar_resolved(src.ssa, 0); + nir_scalar y = nir_scalar_resolved(src.ssa, 1); + + if (x.def != y.def) + return false; + + nir_instr *parent = nir_def_instr(x.def); + + if (parent->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + + if (intr->intrinsic != nir_intrinsic_load_interpolated_input) + return false; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + *mask |= BITFIELD64_BIT(sem.location); + return false; +} + +static uint64_t +bi_fp32_varying_mask(nir_shader *nir) +{ + uint64_t mask = 0; + + assert(nir->info.stage == MESA_SHADER_FRAGMENT); + + nir_foreach_shader_in_variable(var, nir) { + if (var->data.interpolation == INTERP_MODE_FLAT) { + unsigned slots = glsl_count_attribute_slots(var->type, false); + mask |= BITFIELD64_RANGE(var->data.location, slots); + } + } + + nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, + &mask); + + return mask; +} + +static bool +bi_lower_subgroups(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + unsigned int gpu_id = *(unsigned int *)data; + unsigned int arch = pan_arch(gpu_id); + + b->cursor = nir_before_instr(&intr->instr); + + nir_def *val = NULL; + switch (intr->intrinsic) { + case nir_intrinsic_vote_any: + val = nir_ine_imm(b, nir_ballot(b, 1, 32, intr->src[0].ssa), 0); + break; + + case nir_intrinsic_vote_all: + val = nir_ieq_imm(b, nir_ballot(b, 1, 32, nir_inot(b, intr->src[0].ssa)), 0); + break; + + case nir_intrinsic_load_subgroup_id: { + nir_def *local_id = nir_load_local_invocation_id(b); + nir_def *local_size = nir_load_workgroup_size(b); + /* local_id.x + local_size.x * (local_id.y + local_size.y * local_id.z) */ + nir_def *flat_local_id = + nir_iadd(b, + nir_channel(b, local_id, 0), + nir_imul(b, + nir_channel(b, local_size, 0), + nir_iadd(b, + nir_channel(b, local_id, 1), + nir_imul(b, + nir_channel(b, local_size, 1), + nir_channel(b, local_id, 2))))); + /* + * nir_udiv_imm with a power of two divisor, which pan_subgroup_size is, + * will construct a right shift instead of an udiv. + */ + val = nir_udiv_imm(b, flat_local_id, pan_subgroup_size(arch)); + break; + } + + case nir_intrinsic_load_subgroup_size: + val = nir_imm_int(b, pan_subgroup_size(arch)); + break; + + case nir_intrinsic_load_num_subgroups: { + uint32_t subgroup_size = pan_subgroup_size(arch); + assert(!b->shader->info.workgroup_size_variable); + uint32_t workgroup_size = + b->shader->info.workgroup_size[0] * + b->shader->info.workgroup_size[1] * + b->shader->info.workgroup_size[2]; + uint32_t num_subgroups = DIV_ROUND_UP(workgroup_size, subgroup_size); + val = nir_imm_int(b, num_subgroups); + break; + } + + default: + return false; + } + + nir_def_rewrite_uses(&intr->def, val); + return true; +} + +static bool +bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr, + UNUSED void *data) +{ + if (intr->intrinsic != nir_intrinsic_load_output) + return false; + + unsigned loc = nir_intrinsic_io_semantics(intr).location; + assert(loc >= FRAG_RESULT_DATA0); + unsigned rt = loc - FRAG_RESULT_DATA0; + + b->cursor = nir_before_instr(&intr->instr); + + nir_def *conversion = nir_load_rt_conversion_pan( + b, .base = rt, .src_type = nir_intrinsic_dest_type(intr)); + + nir_def *lowered = nir_load_tile_pan( + b, intr->def.num_components, intr->def.bit_size, + pan_nir_tile_location_sample(b, loc, nir_imm_int(b, 0)), + pan_nir_tile_default_coverage(b), + conversion, .dest_type = nir_intrinsic_dest_type(intr), + .io_semantics = nir_intrinsic_io_semantics(intr)); + + nir_def_rewrite_uses(&intr->def, lowered); + return true; +} + +static bool +bifrost_nir_lower_load_output(nir_shader *nir) +{ + assert(nir->info.stage == MESA_SHADER_FRAGMENT); + + return nir_shader_intrinsics_pass( + nir, bi_lower_load_output, + nir_metadata_control_flow, NULL); +} + +static nir_mem_access_size_align +mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes, + uint8_t bit_size, uint32_t align_mul, + uint32_t align_offset, bool offset_is_const, + enum gl_access_qualifier access, const void *cb_data) +{ + uint32_t align = nir_combined_align(align_mul, align_offset); + assert(util_is_power_of_two_nonzero(align)); + + /* No more than 16 bytes at a time. */ + bytes = MIN2(bytes, 16); + + /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's + * a multiple of 2, use 16-bit loads. Else use 8-bit loads. + * + * But if we're only aligned to 1 byte, use 8-bit loads. If we're only + * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to + * the size. + */ + if ((bytes & 1) || (align == 1)) + bit_size = 8; + else if ((bytes & 2) || (align == 2)) + bit_size = 16; + else if (bit_size >= 32) + bit_size = 32; + + unsigned num_comps = MIN2(bytes / (bit_size / 8), 4); + + /* Push constants require 32-bit loads. */ + if (intrin == nir_intrinsic_load_push_constant) { + if (align_mul >= 4) { + /* If align_mul is bigger than 4 we can use align_offset to find + * the exact number of words we need to read. + */ + num_comps = DIV_ROUND_UP((align_offset % 4) + bytes, 4); + } else { + /* If bytes is aligned on 32-bit, the access might still cross one + * word at the beginning, and one word at the end. If bytes is not + * aligned on 32-bit, the extra two words should cover for both the + * size and offset mis-alignment. + */ + num_comps = (bytes / 4) + 2; + } + + bit_size = MAX2(bit_size, 32); + align = 4; + } else { + align = bit_size / 8; + } + + return (nir_mem_access_size_align){ + .num_components = num_comps, + .bit_size = bit_size, + .align = align, + .shift = nir_mem_access_shift_method_scalar, + }; +} + +void +bifrost_postprocess_nir(nir_shader *nir, unsigned gpu_id) +{ + MESA_TRACE_FUNC(); + + /* We assume that UBO and SSBO were lowered, let's move things around. */ + nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo | + nir_move_comparisons | nir_move_copies | + nir_move_load_ssbo; + + NIR_PASS(_, nir, nir_opt_sink, move_all); + NIR_PASS(_, nir, nir_opt_move, move_all); + + bifrost_lower_texture_nir(nir, gpu_id); + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, pan_nir_lower_noperspective_fs); + + NIR_PASS(_, nir, nir_lower_mediump_io, + nir_var_shader_in | nir_var_shader_out, + ~bi_fp32_varying_mask(nir), false); + + NIR_PASS(_, nir, bifrost_nir_lower_load_output); + } else if (nir->info.stage == MESA_SHADER_VERTEX) { + NIR_PASS(_, nir, nir_lower_viewport_transform); + NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0); + NIR_PASS(_, nir, pan_nir_lower_noperspective_vs); + } + + nir_lower_mem_access_bit_sizes_options mem_size_options = { + .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo | + nir_var_mem_constant | nir_var_mem_task_payload | + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_global | nir_var_mem_shared, + .callback = mem_access_size_align_cb, + }; + NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &mem_size_options); + + nir_lower_ssbo_options ssbo_opts = { + .native_loads = pan_arch(gpu_id) >= 9, + .native_offset = pan_arch(gpu_id) >= 9, + }; + NIR_PASS(_, nir, nir_lower_ssbo, &ssbo_opts); + + /* + * Lower subgroups ops before lowering int64: nir_lower_int64 doesn't know + * how to lower imul reductions and scans. + * + * TODO: we can implement certain operations (notably reductions, scans, + * certain shuffles, etc) more efficiently than nir_lower_subgroups. Moreover + * we can implement reductions and scans on f16vec2 values without splitting + * to scalar first. + */ + bool lower_subgroups_progress = false; + NIR_PASS(lower_subgroups_progress, nir, nir_lower_subgroups, + &(nir_lower_subgroups_options) { + .subgroup_size = pan_subgroup_size(pan_arch(gpu_id)), + .ballot_bit_size = 32, + .ballot_components = 1, + .lower_to_scalar = true, + .lower_vote_feq = true, + .lower_vote_ieq = true, + .lower_vote_bool_eq = true, + .lower_first_invocation_to_ballot = true, + .lower_read_first_invocation = true, + .lower_subgroup_masks = true, + .lower_relative_shuffle = true, + .lower_shuffle = true, + .lower_quad = true, + .lower_quad_broadcast_dynamic = true, + .lower_quad_vote = true, + .lower_elect = true, + .lower_rotate_to_shuffle = true, + .lower_rotate_clustered_to_shuffle = true, + .lower_inverse_ballot = true, + .lower_reduce = true, + .lower_boolean_reduce = true, + .lower_boolean_shuffle = true, + }); + /* nir_lower_subgroups creates new vars, clean them up. */ + if (lower_subgroups_progress) + NIR_PASS(_, nir, nir_lower_vars_to_ssa); + + NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_subgroups, + nir_metadata_control_flow, &gpu_id); + + NIR_PASS(_, nir, nir_lower_64bit_phis); + NIR_PASS(_, nir, nir_lower_int64); + NIR_PASS(_, nir, nir_lower_bit_size, bi_lower_bit_size, &gpu_id); + + NIR_PASS(_, nir, nir_opt_idiv_const, 8); + NIR_PASS(_, nir, nir_lower_idiv, + &(nir_lower_idiv_options){.allow_fp16 = true}); + + NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_lower_phis_to_scalar, bi_vectorize_filter, &gpu_id); + NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */); + NIR_PASS(_, nir, nir_lower_var_copies); + NIR_PASS(_, nir, nir_lower_alu); + NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord); + NIR_PASS(_, nir, pan_nir_lower_var_special_pan); +} + +void bifrost_lower_texture_nir(nir_shader *nir, unsigned gpu_id) +{ + NIR_PASS(_, nir, nir_lower_image_atomics_to_global, NULL, NULL); + + /* on Bifrost, lower MSAA load/stores to 3D load/stores */ + if (pan_arch(gpu_id) < 9) + NIR_PASS(_, nir, pan_nir_lower_image_ms); + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_is_helper_invocation); + NIR_PASS(_, nir, pan_nir_lower_helper_invocation); + NIR_PASS(_, nir, pan_nir_lower_sample_pos); + } +} + +static bool +lower_texel_buffer_fetch(nir_builder *b, nir_tex_instr *tex, void *data) +{ + if (tex->op != nir_texop_txf || tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) + return false; + + unsigned *arch = data; + b->cursor = nir_before_instr(&tex->instr); + + nir_def *res_handle = nir_imm_int(b, tex->texture_index); + nir_def *buf_index = NULL; + for (unsigned i = 0; i < tex->num_srcs; ++i) { + switch (tex->src[i].src_type) { + case nir_tex_src_coord: + buf_index = tex->src[i].src.ssa; + break; + case nir_tex_src_texture_offset: + /* This should always be 0 as lower_index_to_offset is expected to be + * set */ + assert(tex->texture_index == 0); + res_handle = tex->src[i].src.ssa; + break; + default: + continue; + } + } + + nir_def *loaded_texel_addr = + nir_load_texel_buf_index_address_pan(b, res_handle, buf_index); + nir_def *texel_addr = + nir_pack_64_2x32(b, nir_channels(b, loaded_texel_addr, BITFIELD_MASK(2))); + + nir_def *loaded_mem; + if (*arch >= 9) { + nir_def *icd = nir_load_texel_buf_conv_pan(b, res_handle); + loaded_mem = nir_load_global_cvt_pan(b, tex->def.num_components, + tex->def.bit_size, texel_addr, + icd, tex->dest_type); + } else { + nir_def *icd = nir_channel(b, loaded_texel_addr, 2); + loaded_mem = nir_load_global_cvt_pan(b, tex->def.num_components, + tex->def.bit_size, texel_addr, + icd, tex->dest_type); + } + nir_def_replace(&tex->def, loaded_mem); + return true; +} + +static bool +pan_nir_lower_texel_buffer_fetch(nir_shader *shader, unsigned arch) +{ + return nir_shader_tex_pass(shader, lower_texel_buffer_fetch, + nir_metadata_control_flow, &arch); +} + +static bool +lower_buf_image_access(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_image_texel_address: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + break; + default: + return false; + } + enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr); + if (dim != GLSL_SAMPLER_DIM_BUF) + return false; + + unsigned *arch = data; + b->cursor = nir_before_instr(&intr->instr); + + nir_def *res_handle = intr->src[0].ssa; + nir_def *buf_index = nir_channel(b, intr->src[1].ssa, 0); + nir_def *loaded_texel_addr = + nir_load_texel_buf_index_address_pan(b, res_handle, buf_index); + nir_def *texel_addr = + nir_pack_64_2x32(b, nir_channels(b, loaded_texel_addr, BITFIELD_MASK(2))); + + switch (intr->intrinsic) { + case nir_intrinsic_image_texel_address: + nir_def_replace(&intr->def, texel_addr); + break; + case nir_intrinsic_image_load: { + nir_def *icd; + if (*arch >= 9) + icd = nir_load_texel_buf_conv_pan(b, res_handle); + else + icd = nir_channel(b, loaded_texel_addr, 2); + nir_def *loaded_mem = nir_load_global_cvt_pan( + b, intr->def.num_components, intr->def.bit_size, texel_addr, icd, + .dest_type = nir_intrinsic_dest_type(intr)); + nir_def_replace(&intr->def, loaded_mem); + break; + } + case nir_intrinsic_image_store: { + /* Due to SPIR-V limitations, the source type is not fully reliable: it + * reports uint32 even for write_imagei. This causes an incorrect + * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32 + * instead, which will match per the OpenCL spec. Of course this does + * not work for 16-bit stores, but those are not available in OpenCL. + */ + ASSERTED nir_alu_type T = nir_intrinsic_src_type(intr); + assert(nir_alu_type_get_type_size(T) == 32); + + nir_def *value = intr->src[3].ssa; + nir_def *icd; + if (*arch >= 9) + icd = nir_load_texel_buf_conv_pan(b, res_handle); + else + icd = nir_channel(b, loaded_texel_addr, 2); + nir_store_global_cvt_pan(b, value, texel_addr, icd, .src_type = 32); + nir_instr_remove(&intr->instr); + break; + } + default: + UNREACHABLE("Unexpected intrinsic"); + } + + return true; +} + +static bool +pan_nir_lower_buf_image_access(nir_shader *shader, unsigned arch) +{ + return nir_shader_intrinsics_pass(shader, lower_buf_image_access, + nir_metadata_control_flow, &arch); +} + +void +bifrost_lower_texture_late_nir(nir_shader *nir, unsigned gpu_id) +{ + NIR_PASS(_, nir, pan_nir_lower_texel_buffer_fetch, pan_arch(gpu_id)); + NIR_PASS(_, nir, pan_nir_lower_buf_image_access, pan_arch(gpu_id)); +} + +/* Decide if Index-Driven Vertex Shading should be used for a given shader */ +static bool +bi_should_idvs(nir_shader *nir, const struct pan_compile_inputs *inputs) +{ + /* Opt-out */ + if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS) + return false; + + /* IDVS splits up vertex shaders, not defined on other shader stages */ + if (nir->info.stage != MESA_SHADER_VERTEX) + return false; + + /* Bifrost cannot write gl_PointSize during IDVS */ + if ((pan_arch(inputs->gpu_id) < 9) && + nir->info.outputs_written & VARYING_BIT_PSIZ) + return false; + + /* Otherwise, IDVS is usually better */ + return true; +} + +/* Atomics and memory write on the vertex stage have implementation-defined + * behaviors on how many invocations will happen. However for some reasons, + * atomic counters on GL/GLES specs are quite ambigous here and even have tests + * counting how many invocations have been made on VS.... This pass detects + * atomics that result in a direct store output of one specific IDVS stage + * and ensure it's only executed for said stage. + * + * This allows + * "dEQP-GLES31.functional.shaders.opaque_type_indexing.atomic_counter.*" to + * pass under ANGLE. + */ + +static bool +bifrost_nir_lower_vs_atomics_impl(nir_builder *b, nir_intrinsic_instr *intr, + UNUSED void *data) +{ + if (intr->intrinsic != nir_intrinsic_global_atomic) + return false; + + unsigned output_mask = 0; + nir_foreach_use(use, &intr->def) { + nir_instr *parent = nir_src_parent_instr(use); + if (parent->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *parent_intr = nir_instr_as_intrinsic(parent); + if (parent_intr->intrinsic != nir_intrinsic_store_output && + parent_intr->intrinsic != nir_intrinsic_store_per_view_output) + continue; + + nir_io_semantics sem = nir_intrinsic_io_semantics(parent_intr); + output_mask |= BITFIELD_BIT(va_shader_output_from_loc(sem.location)); + } + + /* In case they are not written to any outputs, we default to only output in + * the position stage */ + if (output_mask == 0) + output_mask |= VA_SHADER_OUTPUT_POSITION_BIT; + + /* In case they are not written to both IDVS stages, we just do not try + * lowering it */ + if (((output_mask & VA_SHADER_OUTPUT_VARY_BIT) && + (output_mask & (VA_SHADER_OUTPUT_POSITION_BIT | + VA_SHADER_OUTPUT_ATTRIB_BIT)))) + return false; + + /* In case we know we have only outputs to a certain type, we can make the + * atomic exclusive to this */ + b->cursor = nir_before_instr(&intr->instr); + nir_def *res = nir_undef(b, intr->def.num_components, intr->def.bit_size); + + nir_def *shader_output = nir_load_shader_output_pan(b); + nir_push_if(b, nir_i2b(b, nir_iand_imm(b, shader_output, output_mask))); + nir_instr *new_instr = nir_instr_clone(b->shader, &intr->instr); + nir_intrinsic_instr *new_intr = nir_instr_as_intrinsic(new_instr); + nir_builder_instr_insert(b, new_instr); + nir_pop_if(b, NULL); + + res = nir_if_phi(b, &new_intr->def, res); + nir_def_replace(&intr->def, res); + + return true; +} + +static bool +bifrost_nir_lower_vs_atomics(nir_shader *shader) +{ + assert(shader->info.stage == MESA_SHADER_VERTEX); + return nir_shader_intrinsics_pass(shader, bifrost_nir_lower_vs_atomics_impl, + nir_metadata_none, NULL); +} + +void +bifrost_compile_shader_nir(nir_shader *nir, + const struct pan_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info) +{ + MESA_TRACE_FUNC(); + + bifrost_init_debug_options(); + + /* The varying layout (if any) may have different bit sizes for some + * varyings than we have in the shader. For descriptors, this isn't a + * problem as it's handled by the descriptor layout. However, for direct + * loads and stores on Valhall+, we need the right bit sizes in the shader. + * We could do this in the back-end as we emit but it's easier for now to + * lower in NIR. This also handles the case where we do a load from the + * fragment shader of something that isn't written by the vertex shader. + * In that case, we just return zero. + */ + if (pan_arch(inputs->gpu_id) >= 9 && inputs->varying_layout) { + NIR_PASS(_, nir, pan_nir_resize_varying_io, inputs->varying_layout); + + /* pan_nir_resize_varying_io may generate vector conversions which we + * need to clean up so the back-end doesn't see them. + */ + unsigned gpu_id = inputs->gpu_id; + NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id); + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_opt_copy_prop); + NIR_PASS(_, nir, nir_opt_dce); + } + + if (nir->info.stage == MESA_SHADER_VERTEX) { + info->vs.idvs = bi_should_idvs(nir, inputs); + + if (info->vs.idvs && nir->info.writes_memory) + NIR_PASS(_, nir, bifrost_nir_lower_vs_atomics); + + NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id, + inputs->varying_layout, info->vs.idvs, + &info->vs.needs_extended_fifo); + } + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + /* Blit shaders may not need to run ATEST, since ATEST is not needed if + * early-z is forced, alpha-to-coverage is disabled, and there are no + * writes to the coverage mask. The latter two are satisfied for all + * blit shaders, so we just care about early-z, which blit shaders force + * iff they do not write depth or stencil + */ + const bool emit_zs = + nir->info.outputs_written & (BITFIELD_BIT(FRAG_RESULT_DEPTH) | + BITFIELD_BIT(FRAG_RESULT_STENCIL)); + const bool skip_atest = inputs->is_blit && !emit_zs; + NIR_PASS(_, nir, pan_nir_lower_fs_outputs, skip_atest); + } + + bi_optimize_nir(nir, inputs->gpu_id, inputs->robust_modes); + + { + bool scalar_phis_pass = false; + unsigned gpu_id = inputs->gpu_id; + NIR_PASS(scalar_phis_pass, nir, nir_lower_phis_to_scalar, + bi_vectorize_filter, &gpu_id); + if (scalar_phis_pass) { + NIR_PASS(_, nir, nir_opt_copy_prop); + NIR_PASS(_, nir, nir_opt_dce); + } + } + + info->tls_size = nir->scratch_size; + info->stage = nir->info.stage; + + if (nir->info.stage == MESA_SHADER_VERTEX) { + assert(inputs->varying_layout); + memcpy(&info->varyings.formats, inputs->varying_layout, + sizeof(*inputs->varying_layout)); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + pan_varying_collect_formats(&info->varyings.formats, + nir, inputs->gpu_id, + inputs->trust_varying_flat_highp_types, false); + info->varyings.noperspective = + pan_nir_collect_noperspective_varyings_fs(nir); + + if (!inputs->is_blend) + NIR_PASS(_, nir, pan_nir_lower_fs_inputs, inputs->gpu_id, + inputs->varying_layout, info); + } + + if (nir->info.stage == MESA_SHADER_VERTEX && info->vs.idvs) { + /* On 5th Gen, IDVS is only in one binary */ + if (pan_arch(inputs->gpu_id) >= 12) + bi_compile_variant(nir, inputs, binary, info, BI_IDVS_ALL); + else { + bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION); + bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING); + } + } else { + bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE); + } + + if (mesa_shader_stage_is_compute(nir->info.stage)) { + /* Workgroups may be merged if the structure of the workgroup is + * not software visible. This is true if neither shared memory + * nor barriers are used. The hardware may be able to optimize + * compute shaders that set this flag. + */ + info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) && + !nir->info.uses_control_barrier && + !nir->info.uses_memory_barrier; + } + + info->ubo_mask &= (1 << nir->info.num_ubos) - 1; +} diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h index c8f999e6e0b..87d2d5e9fcd 100644 --- a/src/panfrost/compiler/bifrost/compiler.h +++ b/src/panfrost/compiler/bifrost/compiler.h @@ -1529,6 +1529,11 @@ bi_is_terminal_block(bi_block *block) unsigned bi_pack(bi_context *ctx, struct util_dynarray *emission); void bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission); +void bi_compile_variant(nir_shader *nir, + const struct pan_compile_inputs *inputs, + struct util_dynarray *binary, struct pan_shader_info *info, + enum bi_idvs_mode idvs); + struct bi_packed_tuple { uint64_t lo; uint64_t hi; diff --git a/src/panfrost/compiler/bifrost/meson.build b/src/panfrost/compiler/bifrost/meson.build index 77c2e76fc84..a8fa63328df 100644 --- a/src/panfrost/compiler/bifrost/meson.build +++ b/src/panfrost/compiler/bifrost/meson.build @@ -34,6 +34,7 @@ libpanfrost_bifrost_files = files( 'bi_debug.c', 'bir.c', 'bifrost_compile.c', + 'bifrost_nir.c', 'bifrost/bi_opt_message_preload.c', 'bifrost/bi_pack.c', 'bifrost/bi_schedule.c',