mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-20 22:08:10 +02:00
The goal here is to generate addresses that are a right-shifted version of the actual byte address and record the shift amount in the offset_shift index. While we could just insert a ushr at the end of deref chains, this would prevent the shift from being optimized away in many cases. Instead, we try to extract the shift from the array strides and struct offsets that make up the deref chain, and only insert a ushr when absolutely necessary (i.e., for casts). This means we have to walk the entire deref chain at once for accesses that support offset_shift and we don't use the standard algorithm of replacing each deref one at a time. To be able to legally right-shift casts, we use the alignment information and never shift more than what the alignment could support. It should also be noted that casts generally have two sources: something provided by the driver (e.g., a Vulkan resource index) or a variable pointer coming from a phi/bcsel. For the latter, the entire access chain consists of multiple parts that are ended by either a phi/bcsel or an access. Only the part that ends in an access is handled by this new algorithm; the other parts are handled as usual. This is necessary because we have no way to encode the offset shift or to even know how much we would be able to shift without knowing how it is accessed. This commit adds the general implementation for lowering accesses using offset_shift and adds a compiler option for drivers to enable it for SSBO accesses. Signed-off-by: Job Noorman <jnoorman@igalia.com> Reviewed-by: Emma Anholt <emma@anholt.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35092>
860 lines
28 KiB
C
860 lines
28 KiB
C
/*
 * Copyright © 2014 Connor Abbott
 * SPDX-License-Identifier: MIT
 */
|
|
|
|
#ifndef NIR_SHADER_COMPILER_OPTIONS_H
|
|
#define NIR_SHADER_COMPILER_OPTIONS_H
|
|
|
|
#include "util/macros.h"
|
|
#include "nir_defines.h"
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/**
 * Bit flags (one bit per entry) naming groups of 64-bit integer operations.
 *
 * Values are OR-able; the combined mask is stored in
 * nir_shader_compiler_options::lower_int64_options.
 * NOTE(review): presumably a set bit requests lowering of that operation
 * group -- confirm against the int64 lowering pass.
 */
typedef enum {
   nir_lower_imul64 = (1 << 0),
   nir_lower_isign64 = (1 << 1),
   /** Lower all int64 modulus and division opcodes */
   nir_lower_divmod64 = (1 << 2),
   /** Lower all 64-bit umul_high and imul_high opcodes */
   nir_lower_imul_high64 = (1 << 3),
   nir_lower_bcsel64 = (1 << 4),
   nir_lower_icmp64 = (1 << 5),
   nir_lower_iadd64 = (1 << 6),
   nir_lower_iabs64 = (1 << 7),
   nir_lower_ineg64 = (1 << 8),
   nir_lower_logic64 = (1 << 9),
   nir_lower_minmax64 = (1 << 10),
   nir_lower_shift64 = (1 << 11),
   nir_lower_imul_2x32_64 = (1 << 12),
   nir_lower_extract64 = (1 << 13),
   nir_lower_ufind_msb64 = (1 << 14),
   nir_lower_bit_count64 = (1 << 15),
   nir_lower_subgroup_shuffle64 = (1 << 16),
   nir_lower_scan_reduce_bitwise64 = (1 << 17),
   nir_lower_scan_reduce_iadd64 = (1 << 18),
   nir_lower_vote_ieq64 = (1 << 19),
   nir_lower_usub_sat64 = (1 << 20),
   nir_lower_iadd_sat64 = (1 << 21),
   nir_lower_find_lsb64 = (1 << 22),
   nir_lower_conv64 = (1 << 23),
   nir_lower_uadd_sat64 = (1 << 24),
   nir_lower_iadd3_64 = (1 << 25),
   nir_lower_bitfield_reverse64 = (1 << 26),
   nir_lower_bitfield_extract64 = (1 << 27),
} nir_lower_int64_options;
|
|
|
|
/**
 * Bit flags (one bit per entry) naming double-precision operations.
 *
 * Values are OR-able; the combined mask is stored in
 * nir_shader_compiler_options::lower_doubles_options.
 * NOTE(review): presumably a set bit requests lowering of that operation --
 * confirm against the doubles lowering pass.
 */
typedef enum {
   nir_lower_drcp = (1 << 0),
   nir_lower_dsqrt = (1 << 1),
   nir_lower_drsq = (1 << 2),
   nir_lower_dtrunc = (1 << 3),
   nir_lower_dfloor = (1 << 4),
   nir_lower_dceil = (1 << 5),
   nir_lower_dfract = (1 << 6),
   nir_lower_dround_even = (1 << 7),
   nir_lower_dmod = (1 << 8),
   nir_lower_dsub = (1 << 9),
   nir_lower_ddiv = (1 << 10),
   nir_lower_dsign = (1 << 11),
   nir_lower_dminmax = (1 << 12),
   nir_lower_dsat = (1 << 13),
   nir_lower_fp64_full_software = (1 << 14),
} nir_lower_doubles_options;
|
|
|
|
/**
 * Bit flags (one bit per entry) configuring divergence analysis.
 *
 * Values are OR-able; the combined mask is stored in
 * nir_shader_compiler_options::divergence_analysis_options.
 * Bits 9 and 10 are not assigned by this enum.
 */
typedef enum {
   nir_divergence_single_prim_per_subgroup = (1 << 0),
   nir_divergence_single_patch_per_tcs_subgroup = (1 << 1),
   nir_divergence_single_patch_per_tes_subgroup = (1 << 2),
   nir_divergence_view_index_uniform = (1 << 3),
   nir_divergence_single_frag_shading_rate_per_subgroup = (1 << 4),
   nir_divergence_multiple_workgroup_per_compute_subgroup = (1 << 5),
   nir_divergence_shader_record_ptr_uniform = (1 << 6),
   nir_divergence_uniform_load_tears = (1 << 7),
   /* If used, this allows phis for divergent merges with undef and a uniform
    * source to be considered uniform.
    */
   nir_divergence_ignore_undef_if_phi_srcs = (1 << 8),
   /* Whether to compute vertex divergence (meaning between vertices
    * of the same primitive) instead of subgroup invocation divergence
    * (between invocations of the same subgroup). For example, patch input
    * loads are always convergent, while subgroup intrinsics are divergent.
    */
   nir_divergence_vertex = (1 << 11),
} nir_divergence_options;
|
|
|
|
/** An instruction filtering callback
|
|
*
|
|
* Returns true if the instruction should be processed and false otherwise.
|
|
*/
|
|
typedef bool (*nir_instr_filter_cb)(const nir_instr *, const void *);
|
|
|
|
typedef enum {
|
|
/**
|
|
* Whether a fragment shader can interpolate the same input multiple times
|
|
* with different modes (smooth, noperspective) and locations (pixel,
|
|
* centroid, sample, at_offset, at_sample), excluding the flat mode.
|
|
*
|
|
* This matches AMD GPU flexibility and limitations and is a superset of
|
|
* the GL4 requirement that each input can be interpolated at its specified
|
|
* location, and then also as centroid, at_offset, and at_sample.
|
|
*/
|
|
nir_io_has_flexible_input_interpolation_except_flat = BITFIELD_BIT(0),
|
|
|
|
/**
|
|
* nir_opt_varyings compacts (relocates) components of varyings by
|
|
* rewriting their locations completely, effectively moving components of
|
|
* varyings between slots. This option forces nir_opt_varyings to make
|
|
* VARYING_SLOT_POS unused by moving its contents to VARn if the consumer
|
|
* is not FS. If this option is not set and POS is unused, it moves
|
|
* components of VARn to POS until it's fully used.
|
|
*/
|
|
nir_io_dont_use_pos_for_non_fs_varyings = BITFIELD_BIT(1),
|
|
|
|
nir_io_16bit_input_output_support = BITFIELD_BIT(2),
|
|
|
|
/**
|
|
* Implement mediump inputs and outputs as normal 32-bit IO.
|
|
* Causes the mediump flag to be not set for IO semantics, essentially
|
|
* destroying any mediump-related IO information in the shader.
|
|
*/
|
|
nir_io_mediump_is_32bit = BITFIELD_BIT(3),
|
|
|
|
/**
|
|
* Whether nir_opt_vectorize_io should ignore FS inputs.
|
|
*/
|
|
nir_io_prefer_scalar_fs_inputs = BITFIELD_BIT(4),
|
|
|
|
/**
|
|
* Whether interpolated fragment shader vec4 slots can use load_input for
|
|
* a subset of its components to skip interpolation for those components.
|
|
* The result of such load_input is a value from a random (not necessarily
|
|
* provoking) vertex. If a value from the provoking vertex is required,
|
|
* the vec4 slot should have no load_interpolated_input instructions.
|
|
*
|
|
* This exposes the AMD capability that allows packing flat inputs with
|
|
* interpolated inputs in a limited number of cases. Normally, flat
|
|
* components must be in a separate vec4 slot to get the value from
|
|
* the provoking vertex. If the compiler can prove that all per-vertex
|
|
* values are equal (convergent, i.e. the provoking vertex doesn't matter),
|
|
* it can put such flat components into any interpolated vec4 slot.
|
|
*
|
|
* It should also be set if the hw can mix flat and interpolated components
|
|
* in the same vec4 slot.
|
|
*
|
|
* This causes nir_opt_varyings to skip interpolation for all varyings
|
|
* that are convergent, and enables better compaction and inter-shader code
|
|
* motion for convergent varyings.
|
|
*/
|
|
nir_io_mix_convergent_flat_with_interpolated = BITFIELD_BIT(5),
|
|
|
|
/**
|
|
* Whether src_type and dest_type of IO intrinsics are irrelevant and
|
|
* should be ignored by nir_opt_vectorize_io. All drivers that always treat
|
|
* load_input and store_output as untyped and load_interpolated_input as
|
|
* float##bit_size should set this.
|
|
*/
|
|
nir_io_vectorizer_ignores_types = BITFIELD_BIT(6),
|
|
|
|
/**
|
|
* Whether nir_opt_varyings should never promote convergent FS inputs
|
|
* to flat.
|
|
*/
|
|
nir_io_always_interpolate_convergent_fs_inputs = BITFIELD_BIT(7),
|
|
|
|
/**
|
|
* Whether the first assigned color channel component should be equal to
|
|
* the first unused VARn component.
|
|
*
|
|
* For example, if the first unused VARn channel is VAR0.z, color channels
|
|
* are assigned in this order:
|
|
* COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y
|
|
*
|
|
* This allows certain drivers to merge outputs if each output sets
|
|
* different components, for example 2 outputs writing VAR0.xy and COL0.z
|
|
* will only use 1 HW output.
|
|
*/
|
|
nir_io_compaction_rotates_color_channels = BITFIELD_BIT(8),
|
|
|
|
/**
|
|
* Whether to group TES inputs as follows:
|
|
* - inputs used to compute only POS/CLIP outputs are first
|
|
* - inputs used to compute both POS/CLIP outputs and other outputs are next
|
|
* - inputs used to compute only other outputs are last
|
|
*/
|
|
nir_io_compaction_groups_tes_inputs_into_pos_and_var_groups = BITFIELD_BIT(9),
|
|
|
|
/**
|
|
* RADV expects that high 16 bits of outputs set component >= 4. That's not
|
|
* legal in NIR, but RADV unfortunately relies on it because it's not
|
|
* validated.
|
|
*/
|
|
nir_io_radv_intrinsic_component_workaround = BITFIELD_BIT(10),
|
|
|
|
/* Options affecting the GLSL compiler or Gallium are below. */
|
|
|
|
/**
|
|
* Lower load_deref/store_deref to load_input/store_output/etc. intrinsics.
|
|
* This is only affects GLSL compilation and Gallium.
|
|
*/
|
|
nir_io_has_intrinsics = BITFIELD_BIT(16),
|
|
|
|
/**
|
|
* Whether clip and cull distance arrays should be separate. If this is not
|
|
* set, cull distances will be moved into VARYING_SLOT_CLIP_DISTn after clip
|
|
* distances, and shader_info::clip_distance_array_size will be the index
|
|
* of the first cull distance. nir_lower_clip_cull_distance_array_vars does
|
|
* that.
|
|
*/
|
|
nir_io_separate_clip_cull_distance_arrays = BITFIELD_BIT(17),
|
|
} nir_io_options;
|
|
|
|
/**
 * Sequentially numbered identifiers for pack/unpack operations, starting
 * at 0. nir_lower_packing_num_ops is the count of real entries, not an
 * operation itself.
 *
 * nir_shader_compiler_options::skip_lower_packing_ops is documented as a
 * bit mask of these values.
 */
typedef enum {
   nir_lower_packing_op_pack_64_2x32,
   nir_lower_packing_op_unpack_64_2x32,
   nir_lower_packing_op_pack_64_4x16,
   nir_lower_packing_op_unpack_64_4x16,
   nir_lower_packing_op_pack_32_2x16,
   nir_lower_packing_op_unpack_32_2x16,
   nir_lower_packing_op_pack_32_4x8,
   nir_lower_packing_op_unpack_32_4x8,
   nir_lower_packing_num_ops,
} nir_lower_packing_op;
|
|
|
|
typedef struct nir_shader_compiler_options {
|
|
bool lower_fdiv;
|
|
bool lower_ffma16;
|
|
bool lower_ffma32;
|
|
bool lower_ffma64;
|
|
bool fuse_ffma16;
|
|
bool fuse_ffma32;
|
|
bool fuse_ffma64;
|
|
bool lower_flrp16;
|
|
bool lower_flrp32;
|
|
/** Lowers flrp when it does not support doubles */
|
|
bool lower_flrp64;
|
|
bool lower_fpow;
|
|
bool lower_fsat;
|
|
bool lower_fsqrt;
|
|
bool lower_sincos;
|
|
bool lower_fmod;
|
|
/** Lowers ibitfield_extract/ubitfield_extract for 8, 16 & 32 bits. */
|
|
bool lower_bitfield_extract8;
|
|
bool lower_bitfield_extract16;
|
|
bool lower_bitfield_extract;
|
|
/** Lowers bitfield_insert. */
|
|
bool lower_bitfield_insert;
|
|
/** Lowers bitfield_reverse to shifts. */
|
|
bool lower_bitfield_reverse;
|
|
/** Lowers bit_count to shifts. */
|
|
bool lower_bit_count;
|
|
/** Lowers ifind_msb. */
|
|
bool lower_ifind_msb;
|
|
/** Lowers ufind_msb. */
|
|
bool lower_ufind_msb;
|
|
/** Lowers find_lsb to ufind_msb and logic ops */
|
|
bool lower_find_lsb;
|
|
bool lower_uadd_carry;
|
|
bool lower_usub_borrow;
|
|
/** Lowers imul_high/umul_high to 16-bit multiplies and carry operations. */
|
|
bool lower_mul_high;
|
|
bool lower_mul_high16;
|
|
/** lowers fneg to fmul(x, -1.0). Driver must call nir_opt_algebraic_late() */
|
|
bool lower_fneg;
|
|
/** lowers ineg to isub. Driver must call nir_opt_algebraic_late(). */
|
|
bool lower_ineg;
|
|
/** lowers fisnormal to alu ops. */
|
|
bool lower_fisnormal;
|
|
|
|
/* lower {slt,sge,seq,sne} to {flt,fge,feq,fneu} + b2f: */
|
|
bool lower_scmp;
|
|
|
|
/* lower b/fall_equalN/b/fany_nequalN (ex:fany_nequal4 to sne+fdot4+fsat) */
|
|
bool lower_vector_cmp;
|
|
|
|
/** enable rules to avoid bit ops */
|
|
bool lower_bitops;
|
|
|
|
/** enables rules to lower isign to imin+imax */
|
|
bool lower_isign;
|
|
|
|
/** enables rules to lower fsign to fsub and flt */
|
|
bool lower_fsign;
|
|
|
|
/** enables rules to lower iabs to ineg+imax */
|
|
bool lower_iabs;
|
|
|
|
/** enable rules that avoid generating umax from signed integer ops */
|
|
bool lower_umax;
|
|
|
|
/** enable rules that avoid generating umin from signed integer ops */
|
|
bool lower_umin;
|
|
|
|
/* lower fmin/fmax with signed zero preserve to fmin/fmax with
|
|
* no_signed_zero, for backends whose fmin/fmax implementations do not
|
|
* implement IEEE-754-2019 semantics for signed zero.
|
|
*/
|
|
bool lower_fminmax_signed_zero;
|
|
|
|
/* lower fdph to fdot4 */
|
|
bool lower_fdph;
|
|
|
|
/* Does the native fdot instruction replicate its result for four
|
|
* components? If so, then opt_algebraic_late will turn all fdotN
|
|
* instructions into fdotN_replicated instructions.
|
|
*/
|
|
bool fdot_replicates;
|
|
|
|
/** lowers ffloor to fsub+ffract: */
|
|
bool lower_ffloor;
|
|
|
|
/** lowers ffract to fsub+ffloor: */
|
|
bool lower_ffract;
|
|
|
|
/** lowers fceil to fneg+ffloor+fneg: */
|
|
bool lower_fceil;
|
|
|
|
bool lower_ftrunc;
|
|
|
|
/** Lowers fround_even to ffract+feq+csel.
|
|
*
|
|
* Not correct in that it doesn't correctly handle the "_even" part of the
|
|
* rounding, but good enough for DX9 array indexing handling on DX9-class
|
|
* hardware.
|
|
*/
|
|
bool lower_fround_even;
|
|
|
|
bool lower_ldexp;
|
|
|
|
bool lower_pack_half_2x16;
|
|
bool lower_pack_unorm_2x16;
|
|
bool lower_pack_snorm_2x16;
|
|
bool lower_pack_unorm_4x8;
|
|
bool lower_pack_snorm_4x8;
|
|
bool lower_pack_64_2x32;
|
|
bool lower_pack_64_4x16;
|
|
bool lower_pack_32_2x16;
|
|
bool lower_pack_64_2x32_split;
|
|
bool lower_pack_32_2x16_split;
|
|
bool lower_unpack_half_2x16;
|
|
bool lower_unpack_unorm_2x16;
|
|
bool lower_unpack_snorm_2x16;
|
|
bool lower_unpack_unorm_4x8;
|
|
bool lower_unpack_snorm_4x8;
|
|
bool lower_unpack_64_2x32_split;
|
|
bool lower_unpack_32_2x16_split;
|
|
|
|
bool lower_pack_split;
|
|
|
|
bool lower_extract_byte;
|
|
bool lower_extract_word;
|
|
bool lower_insert_byte;
|
|
bool lower_insert_word;
|
|
|
|
/* Indicates that the driver only has zero-based vertex id */
|
|
bool vertex_id_zero_based;
|
|
|
|
/**
|
|
* If enabled, gl_BaseVertex will be lowered as:
|
|
* is_indexed_draw (~0/0) & firstvertex
|
|
*/
|
|
bool lower_base_vertex;
|
|
|
|
/* Indicates that gl_InstanceIndex already includes base index
|
|
* and doesn't require further lowering.
|
|
*/
|
|
bool instance_id_includes_base_index;
|
|
|
|
/**
|
|
* If enabled, gl_HelperInvocation will be lowered as:
|
|
*
|
|
* - non-sample-shading: sample_mask_in == 0.
|
|
* - sample shading: !((1 << sample_id) & sample_mask_in))
|
|
*
|
|
* For this to be correct, it requires that fs.uses_sample_shading is set to
|
|
* true when sample shading is enabled. This means that you need shader
|
|
* variants to set the flag when Vulkan's
|
|
* VkPipelineMultisampleStateCreateInfo->sampleShadingEnable or GL's
|
|
* glMinSampleshading() are enabled.
|
|
*
|
|
* This depends on some possibly hw implementation details, which may
|
|
* not be true for all hw. In particular that the FS is only executed
|
|
* for covered samples or for helper invocations. So, do not blindly
|
|
* enable this option.
|
|
*
|
|
* Note: See also issue #22 in ARB_shader_image_load_store
|
|
*/
|
|
bool lower_helper_invocation;
|
|
|
|
/**
|
|
* Convert gl_SampleMaskIn to gl_HelperInvocation as follows:
|
|
*
|
|
* gl_SampleMaskIn == 0 ---> gl_HelperInvocation
|
|
* gl_SampleMaskIn != 0 ---> !gl_HelperInvocation
|
|
*/
|
|
bool optimize_sample_mask_in;
|
|
|
|
/**
|
|
* Optimize load_front_face ? a : -a to load_front_face_fsign * a
|
|
*/
|
|
bool optimize_load_front_face_fsign;
|
|
|
|
/**
|
|
* Optimize boolean reductions of quad broadcasts. This should only be enabled if
|
|
* nir_intrinsic_reduce supports INCLUDE_HELPERS.
|
|
*/
|
|
bool optimize_quad_vote_to_reduce;
|
|
|
|
bool lower_cs_local_index_to_id;
|
|
bool lower_cs_local_id_to_index;
|
|
|
|
/* Prevents lowering global_invocation_id to be in terms of workgroup_id */
|
|
bool has_cs_global_id;
|
|
|
|
bool lower_device_index_to_zero;
|
|
|
|
/* Set if nir_lower_pntc_ytransform() should invert gl_PointCoord.
|
|
* Either when frame buffer is flipped or GL_POINT_SPRITE_COORD_ORIGIN
|
|
* is GL_LOWER_LEFT.
|
|
*/
|
|
bool lower_wpos_pntc;
|
|
|
|
/**
|
|
* Set if nir_op_[iu]hadd and nir_op_[iu]rhadd instructions should be
|
|
* lowered to simple arithmetic.
|
|
*
|
|
* If this flag is set, the lowering will be applied to all bit-sizes of
|
|
* these instructions.
|
|
*
|
|
* :c:member:`lower_hadd64`
|
|
*/
|
|
bool lower_hadd;
|
|
|
|
/**
|
|
* Set if only 64-bit nir_op_[iu]hadd and nir_op_[iu]rhadd instructions
|
|
* should be lowered to simple arithmetic.
|
|
*
|
|
* If this flag is set, the lowering will be applied to only 64-bit
|
|
* versions of these instructions.
|
|
*
|
|
* :c:member:`lower_hadd`
|
|
*/
|
|
bool lower_hadd64;
|
|
|
|
/**
|
|
* Set if nir_op_uadd_sat should be lowered to simple arithmetic.
|
|
*
|
|
* If this flag is set, the lowering will be applied to all bit-sizes of
|
|
* these instructions.
|
|
*/
|
|
bool lower_uadd_sat;
|
|
|
|
/**
|
|
* Set if nir_op_usub_sat should be lowered to simple arithmetic.
|
|
*
|
|
* If this flag is set, the lowering will be applied to all bit-sizes of
|
|
* these instructions.
|
|
*/
|
|
bool lower_usub_sat;
|
|
|
|
/**
|
|
* Set if nir_op_iadd_sat and nir_op_isub_sat should be lowered to simple
|
|
* arithmetic.
|
|
*
|
|
* If this flag is set, the lowering will be applied to all bit-sizes of
|
|
* these instructions.
|
|
*/
|
|
bool lower_iadd_sat;
|
|
|
|
/**
|
|
* Set if imul_32x16 and umul_32x16 should be lowered to simple
|
|
* arithmetic.
|
|
*/
|
|
bool lower_mul_32x16;
|
|
|
|
/**
|
|
* Set if bf2f and f2bf should be lowered to arithmetic.
|
|
*/
|
|
bool lower_bfloat16_conversions;
|
|
|
|
bool vectorize_tess_levels;
|
|
bool lower_to_scalar;
|
|
nir_instr_filter_cb lower_to_scalar_filter;
|
|
|
|
/**
|
|
* Disables potentially harmful algebraic transformations for architectures
|
|
* with SIMD-within-a-register semantics.
|
|
*
|
|
* Note, to actually vectorize 16bit instructions, use nir_opt_vectorize()
|
|
* with a suitable callback function.
|
|
*/
|
|
bool vectorize_vec2_16bit;
|
|
|
|
/**
|
|
* Should the linker unify inputs_read/outputs_written between adjacent
|
|
* shader stages which are linked into a single program?
|
|
*/
|
|
bool unify_interfaces;
|
|
|
|
/**
|
|
* Whether nir_lower_io() will lower interpolateAt functions to
|
|
* load_interpolated_input intrinsics.
|
|
*
|
|
* Unlike nir_lower_io_use_interpolated_input_intrinsics this will only
|
|
* lower these functions and leave input load intrinsics untouched.
|
|
*/
|
|
bool lower_interpolate_at;
|
|
|
|
/* Lowers when 32x32->64 bit multiplication is not supported */
|
|
bool lower_mul_2x32_64;
|
|
|
|
/* Indicates that urol and uror are supported */
|
|
bool has_rotate8;
|
|
bool has_rotate16;
|
|
bool has_rotate32;
|
|
|
|
/** Backend supports shfr */
|
|
bool has_shfr32;
|
|
|
|
/** Backend supports ternary addition */
|
|
bool has_iadd3;
|
|
|
|
/**
|
|
* Backend supports amul and would like them generated whenever
|
|
* possible. This is stronger than has_imul24 for amul, but does not imply
|
|
* support for imul24.
|
|
*/
|
|
bool has_amul;
|
|
|
|
/**
|
|
* Backend supports imul24, and would like to use it (when possible)
|
|
* for address/offset calculation. If true, driver should call
|
|
* nir_lower_amul(). (If not set, amul will automatically be lowered
|
|
* to imul.)
|
|
*/
|
|
bool has_imul24;
|
|
|
|
/** Backend supports umul24, if not set umul24 will automatically be lowered
|
|
* to imul with masked inputs */
|
|
bool has_umul24;
|
|
|
|
/** Backend supports imul24_relaxed and umul24_relaxed, if not set they will be lowered
|
|
* to imul24, umul24 or imul.
|
|
*/
|
|
bool has_mul24_relaxed;
|
|
|
|
/** Backend supports 32-bit imad */
|
|
bool has_imad32;
|
|
|
|
/** Backend supports umad24, if not set umad24 will automatically be lowered
|
|
* to imul with masked inputs and iadd */
|
|
bool has_umad24;
|
|
|
|
/* Backend supports fused compare against zero and csel */
|
|
bool has_fused_comp_and_csel;
|
|
/* Backend supports fused int eq/ne against zero and csel. */
|
|
bool has_icsel_eqz64;
|
|
bool has_icsel_eqz32;
|
|
bool has_icsel_eqz16;
|
|
|
|
/* Backend supports fneo, fequ, fltu, fgeu. */
|
|
bool has_fneo_fcmpu;
|
|
|
|
/* Backend supports ford and funord. */
|
|
bool has_ford_funord;
|
|
|
|
/** Backend supports fsub, if not set fsub will automatically be lowered to
|
|
* fadd(x, fneg(y)). If true, driver should call nir_opt_algebraic_late(). */
|
|
bool has_fsub;
|
|
|
|
/** Backend supports isub, if not set isub will automatically be lowered to
|
|
* iadd(x, ineg(y)). If true, driver should call nir_opt_algebraic_late(). */
|
|
bool has_isub;
|
|
|
|
/** Backend supports pack_32_4x8 or pack_32_4x8_split. */
|
|
bool has_pack_32_4x8;
|
|
|
|
/** Backend supports nir_load_texture_scale and prefers it over txs for nir
|
|
* lowerings. */
|
|
bool has_texture_scaling;
|
|
|
|
/** Backend supports sdot_4x8_iadd. */
|
|
bool has_sdot_4x8;
|
|
|
|
/** Backend supports udot_4x8_uadd. */
|
|
bool has_udot_4x8;
|
|
|
|
/** Backend supports sudot_4x8_iadd. */
|
|
bool has_sudot_4x8;
|
|
|
|
/** Backend supports sdot_4x8_iadd_sat. */
|
|
bool has_sdot_4x8_sat;
|
|
|
|
/** Backend supports udot_4x8_uadd_sat. */
|
|
bool has_udot_4x8_sat;
|
|
|
|
/** Backend supports sudot_4x8_iadd_sat. */
|
|
bool has_sudot_4x8_sat;
|
|
|
|
/** Backend supports sdot_2x16 and udot_2x16 opcodes. */
|
|
bool has_dot_2x16;
|
|
|
|
/** Backend supports bfdot2_bfadd opcode. */
|
|
bool has_bfdot2_bfadd;
|
|
|
|
/** Backend supports fmulz (and ffmaz if lower_ffma32=false) */
|
|
bool has_fmulz;
|
|
|
|
/**
|
|
* Backend supports fmulz (and ffmaz if lower_ffma32=false) but only if
|
|
* FLOAT_CONTROLS_DENORM_PRESERVE_FP32 is not set
|
|
*/
|
|
bool has_fmulz_no_denorms;
|
|
|
|
/** Backend supports 32bit ufind_msb_rev and ifind_msb_rev. */
|
|
bool has_find_msb_rev;
|
|
|
|
/** Backend supports pack_half_2x16_rtz_split. */
|
|
bool has_pack_half_2x16_rtz;
|
|
|
|
/** Backend supports bitz/bitnz. */
|
|
bool has_bit_test;
|
|
|
|
/** Backend supports ubfe/ibfe. */
|
|
bool has_bfe;
|
|
|
|
/** Backend supports bfm. */
|
|
bool has_bfm;
|
|
|
|
/** Backend supports bfi. */
|
|
bool has_bfi;
|
|
|
|
/** Backend supports bitfield_select. */
|
|
bool has_bitfield_select;
|
|
|
|
/** Backend supports uclz. */
|
|
bool has_uclz;
|
|
|
|
/** Backend support msad_u4x8. */
|
|
bool has_msad;
|
|
|
|
/** Backend supports f2e4m3fn_satfn */
|
|
bool has_f2e4m3fn_satfn;
|
|
|
|
/** Backend supports load_global_bounded intrinsics. */
|
|
bool has_load_global_bounded;
|
|
|
|
/**
|
|
* Is this the Intel vec4 backend?
|
|
*
|
|
* Used to inhibit algebraic optimizations that are known to be harmful on
|
|
* the Intel vec4 backend. This is generally applicable to any
|
|
* optimization that might cause more immediate values to be used in
|
|
* 3-source (e.g., ffma and flrp) instructions.
|
|
*/
|
|
bool intel_vec4;
|
|
|
|
/**
|
|
* For most Intel GPUs, all ternary operations such as FMA and BFE cannot
|
|
* have immediates, so two to three instructions may eventually be needed.
|
|
*/
|
|
bool avoid_ternary_with_two_constants;
|
|
|
|
/** Whether 8-bit ALU is supported. */
|
|
bool support_8bit_alu;
|
|
|
|
/** Whether 16-bit ALU is supported. */
|
|
bool support_16bit_alu;
|
|
|
|
unsigned max_unroll_iterations;
|
|
unsigned max_unroll_iterations_aggressive;
|
|
unsigned max_unroll_iterations_fp64;
|
|
|
|
bool lower_uniforms_to_ubo;
|
|
|
|
/* Specifies if indirect sampler array access will trigger forced loop
|
|
* unrolling.
|
|
*/
|
|
bool force_indirect_unrolling_sampler;
|
|
|
|
/* Some older drivers don't support GLSL versions with the concept of flat
|
|
* varyings and also don't support integers. This setting helps us avoid
|
|
* marking varyings as flat and potentially having them changed to ints via
|
|
* varying packing.
|
|
*/
|
|
bool no_integers;
|
|
|
|
/**
|
|
* Specifies which type of indirectly accessed variables should force
|
|
* loop unrolling.
|
|
*/
|
|
nir_variable_mode force_indirect_unrolling;
|
|
|
|
bool driver_functions;
|
|
|
|
/**
|
|
* If true, the driver will call nir_lower_int64 itself and the frontend
|
|
* should not do so. This may enable better optimization around address
|
|
* modes.
|
|
*/
|
|
bool late_lower_int64;
|
|
nir_lower_int64_options lower_int64_options;
|
|
nir_lower_doubles_options lower_doubles_options;
|
|
nir_divergence_options divergence_analysis_options;
|
|
|
|
/**
|
|
* The masks of shader stages that support indirect indexing with
|
|
* load_input and store_output intrinsics. It's used by
|
|
* nir_lower_io_passes.
|
|
*/
|
|
uint8_t support_indirect_inputs;
|
|
uint8_t support_indirect_outputs;
|
|
|
|
/** store the variable offset into the instrinsic range_base instead
|
|
* of adding it to the image index.
|
|
*/
|
|
bool lower_image_offset_to_range_base;
|
|
|
|
/** store the variable offset into the instrinsic range_base instead
|
|
* of adding it to the atomic source
|
|
*/
|
|
bool lower_atomic_offset_to_range_base;
|
|
|
|
/** Don't convert medium-precision casts (e.g. f2fmp) into concrete
|
|
* type casts (e.g. f2f16).
|
|
*/
|
|
bool preserve_mediump;
|
|
|
|
/** lowers fquantize2f16 to alu ops. */
|
|
bool lower_fquantize2f16;
|
|
|
|
/** Lower f2f16 to f2f16_rtz when execution mode is not rtne. */
|
|
bool force_f2f16_rtz;
|
|
|
|
/** Lower VARYING_SLOT_LAYER in FS to SYSTEM_VALUE_LAYER_ID. */
|
|
bool lower_layer_fs_input_to_sysval;
|
|
|
|
/** clip/cull distance and tess level arrays use compact semantics */
|
|
bool compact_arrays;
|
|
|
|
/**
|
|
* Whether discard gets emitted as nir_intrinsic_demote.
|
|
* Otherwise, nir_intrinsic_terminate is being used.
|
|
*/
|
|
bool discard_is_demote;
|
|
|
|
/**
|
|
* Whether the new-style derivative intrinsics are supported. If false,
|
|
* legacy ALU derivative ops will be emitted. This transitional option will
|
|
* be removed once all drivers are converted to derivative intrinsics.
|
|
*/
|
|
bool has_ddx_intrinsics;
|
|
|
|
/** Whether derivative intrinsics must be scalarized. */
|
|
bool scalarize_ddx;
|
|
|
|
/**
|
|
* Assign a range of driver locations to per-view outputs, with unique
|
|
* slots for each view. If unset, per-view outputs will be treated
|
|
* similarly to other arrayed IO, and only slots for one view will be
|
|
* assigned. Regardless of this setting, per-view outputs are only assigned
|
|
* slots for one value in var->data.location.
|
|
*/
|
|
bool per_view_unique_driver_locations;
|
|
|
|
/**
|
|
* Emit nir_intrinsic_store_per_view_output with compacted view indices
|
|
* rather than absolute view indices. When using compacted indices, the Nth
|
|
* index refers to the Nth enabled view, not the Nth absolute view. For
|
|
* example, with view mask 0b1010, compacted index 0 is absolute index 1,
|
|
* and compacted index 1 is absolute index 3. Note that compacted view
|
|
* indices do not correspond directly to gl_ViewIndex.
|
|
*
|
|
* If compact_view_index is unset, per-view indices must be constant before
|
|
* nir_lower_io. This can be guaranteed by calling nir_lower_io_temporaries
|
|
* first.
|
|
*/
|
|
bool compact_view_index;
|
|
|
|
/** Options determining lowering and behavior of inputs and outputs. */
|
|
nir_io_options io_options;
|
|
|
|
/**
|
|
* Bit mask of nir_lower_packing_op to skip lowering some nir ops in
|
|
* nir_lower_packing().
|
|
*/
|
|
unsigned skip_lower_packing_ops;
|
|
|
|
/* In case the exact subgroup size is not known, subgroup_size should be
|
|
* set to 0. In that case, the maximum subgroup size will be calculated by
|
|
* ballot_components * ballot_bit_size.
|
|
*/
|
|
uint8_t subgroup_size;
|
|
uint8_t ballot_bit_size;
|
|
uint8_t ballot_components;
|
|
|
|
/** Driver callback where drivers can define how to lower mediump.
|
|
* Used by nir_lower_io_passes.
|
|
*/
|
|
void (*lower_mediump_io)(struct nir_shader *nir);
|
|
|
|
/**
|
|
* Return the maximum cost of an expression that's written to a shader
|
|
* output that can be moved into the next shader to remove that output.
|
|
*
|
|
* Currently only uniform expressions are moved. A uniform expression is
|
|
* any ALU expression sourcing only constants, uniforms, and UBO loads.
|
|
*
|
|
* Set to NULL or return 0 if you only want to propagate constants from
|
|
* outputs to inputs.
|
|
*
|
|
* Drivers can set the maximum cost based on the types of consecutive
|
|
* shaders or shader SHA1s.
|
|
*
|
|
* Drivers should also set "varying_estimate_instr_cost".
|
|
*/
|
|
unsigned (*varying_expression_max_cost)(struct nir_shader *consumer,
|
|
struct nir_shader *producer);
|
|
|
|
/**
|
|
* Return the cost of an instruction that could be moved into the next
|
|
* shader. If the cost of all instructions in an expression is <=
|
|
* varying_expression_max_cost(), the instruction is moved.
|
|
*
|
|
* When this callback isn't set, nir_opt_varyings uses its own version.
|
|
*/
|
|
unsigned (*varying_estimate_instr_cost)(struct nir_instr *instr);
|
|
|
|
/**
|
|
* When the varying_expression_max_cost callback isn't set, this specifies
|
|
* the maximum cost of a uniform expression that is allowed to be moved
|
|
* from output stores into the next shader stage to eliminate those output
|
|
* stores and corresponding inputs.
|
|
*
|
|
* 0 only allows propagating constants written to output stores to
|
|
* the next shader.
|
|
*
|
|
* At least 2 is required for moving a uniform stored in an output into
|
|
* the next shader according to default_varying_estimate_instr_cost.
|
|
*/
|
|
unsigned max_varying_expression_cost;
|
|
|
|
/**
|
|
* Used by nir_lower_explicit_io to determine the maximum offset_shift to
|
|
* use when lowering the deref address of the given intrinsic.
|
|
*/
|
|
unsigned (*max_offset_shift)(nir_intrinsic_instr *, const void *);
|
|
|
|
/**
|
|
* Passed to the callbacks that accept a data pointer.
|
|
*/
|
|
const void *cb_data;
|
|
} nir_shader_compiler_options;
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* NIR_SHADER_COMPILER_OPTIONS_H */
|