/*
 * Copyright © 2014 Connor Abbott
 * SPDX-License-Identifier: MIT
 */

#ifndef NIR_SHADER_COMPILER_OPTIONS_H
#define NIR_SHADER_COMPILER_OPTIONS_H

#include "util/macros.h"
#include "nir_defines.h"
#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef enum {
   nir_lower_imul64 = (1 << 0),
   nir_lower_isign64 = (1 << 1),
   /** Lower all int64 modulus and division opcodes */
   nir_lower_divmod64 = (1 << 2),
   /** Lower all 64-bit umul_high and imul_high opcodes */
   nir_lower_imul_high64 = (1 << 3),
   nir_lower_bcsel64 = (1 << 4),
   nir_lower_icmp64 = (1 << 5),
   nir_lower_iadd64 = (1 << 6),
   nir_lower_iabs64 = (1 << 7),
   nir_lower_ineg64 = (1 << 8),
   nir_lower_logic64 = (1 << 9),
   nir_lower_minmax64 = (1 << 10),
   nir_lower_shift64 = (1 << 11),
   nir_lower_imul_2x32_64 = (1 << 12),
   nir_lower_extract64 = (1 << 13),
   nir_lower_ufind_msb64 = (1 << 14),
   nir_lower_bit_count64 = (1 << 15),
   nir_lower_subgroup_shuffle64 = (1 << 16),
   nir_lower_scan_reduce_bitwise64 = (1 << 17),
   nir_lower_scan_reduce_iadd64 = (1 << 18),
   nir_lower_vote_ieq64 = (1 << 19),
   nir_lower_usub_sat64 = (1 << 20),
   nir_lower_iadd_sat64 = (1 << 21),
   nir_lower_find_lsb64 = (1 << 22),
   nir_lower_conv64 = (1 << 23),
   nir_lower_uadd_sat64 = (1 << 24),
   nir_lower_iadd3_64 = (1 << 25),
   nir_lower_bitfield_reverse64 = (1 << 26),
   nir_lower_bitfield_extract64 = (1 << 27),
} nir_lower_int64_options;
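
/*
 * Illustrative sketch (not taken from any particular driver): a backend
 * without native 64-bit integer multiply or divide hardware might OR these
 * flags together when filling in nir_shader_compiler_options, e.g.:
 *
 *    .lower_int64_options = nir_lower_imul64 |
 *                           nir_lower_imul_high64 |
 *                           nir_lower_divmod64,
 */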

typedef enum {
   nir_lower_drcp = (1 << 0),
   nir_lower_dsqrt = (1 << 1),
   nir_lower_drsq = (1 << 2),
   nir_lower_dtrunc = (1 << 3),
   nir_lower_dfloor = (1 << 4),
   nir_lower_dceil = (1 << 5),
   nir_lower_dfract = (1 << 6),
   nir_lower_dround_even = (1 << 7),
   nir_lower_dmod = (1 << 8),
   nir_lower_dsub = (1 << 9),
   nir_lower_ddiv = (1 << 10),
   nir_lower_dsign = (1 << 11),
   nir_lower_dminmax = (1 << 12),
   nir_lower_dsat = (1 << 13),
   nir_lower_fp64_full_software = (1 << 14),
} nir_lower_doubles_options;

typedef enum {
   nir_divergence_single_prim_per_subgroup = (1 << 0),
   nir_divergence_single_patch_per_tcs_subgroup = (1 << 1),
   nir_divergence_single_patch_per_tes_subgroup = (1 << 2),
   nir_divergence_view_index_uniform = (1 << 3),
   nir_divergence_single_frag_shading_rate_per_subgroup = (1 << 4),
   nir_divergence_multiple_workgroup_per_compute_subgroup = (1 << 5),
   nir_divergence_shader_record_ptr_uniform = (1 << 6),
   nir_divergence_uniform_load_tears = (1 << 7),
   /* If set, a phi at a divergent merge whose sources are undef and a
    * uniform value may be considered uniform.
    */
   nir_divergence_ignore_undef_if_phi_srcs = (1 << 8),
   /* Whether to compute vertex divergence (meaning between vertices
    * of the same primitive) instead of subgroup invocation divergence
    * (between invocations of the same subgroup). For example, patch input
    * loads are always convergent, while subgroup intrinsics are divergent.
    */
   nir_divergence_vertex = (1 << 11),
} nir_divergence_options;

/** An instruction filtering callback
 *
 * Returns true if the instruction should be processed and false otherwise.
 */
typedef bool (*nir_instr_filter_cb)(const nir_instr *, const void *);
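
/*
 * A minimal sketch of a filter callback, for illustration only (the function
 * name is hypothetical). A driver might use something like this as
 * lower_to_scalar_filter to restrict scalarization to ALU instructions:
 *
 *    static bool
 *    filter_alu_only(const nir_instr *instr, const void *data)
 *    {
 *       return instr->type == nir_instr_type_alu;
 *    }
 */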

typedef enum {
   /**
    * Whether a fragment shader can interpolate the same input multiple times
    * with different modes (smooth, noperspective) and locations (pixel,
    * centroid, sample, at_offset, at_sample), excluding the flat mode.
    *
    * This matches AMD GPU flexibility and limitations and is a superset of
    * the GL4 requirement that each input can be interpolated at its specified
    * location, and then also as centroid, at_offset, and at_sample.
    */
   nir_io_has_flexible_input_interpolation_except_flat = BITFIELD_BIT(0),

   /**
    * nir_opt_varyings compacts (relocates) components of varyings by
    * rewriting their locations completely, effectively moving components of
    * varyings between slots. This option forces nir_opt_varyings to make
    * VARYING_SLOT_POS unused by moving its contents to VARn if the consumer
    * is not FS. If this option is not set and POS is unused, it moves
    * components of VARn to POS until it's fully used.
    */
   nir_io_dont_use_pos_for_non_fs_varyings = BITFIELD_BIT(1),

   nir_io_16bit_input_output_support = BITFIELD_BIT(2),

   /**
    * Implement mediump inputs and outputs as normal 32-bit IO.
    * Causes the mediump flag not to be set for IO semantics, essentially
    * destroying any mediump-related IO information in the shader.
    */
   nir_io_mediump_is_32bit = BITFIELD_BIT(3),

   /**
    * Whether nir_opt_vectorize_io should ignore FS inputs.
    */
   nir_io_prefer_scalar_fs_inputs = BITFIELD_BIT(4),

   /**
    * Whether interpolated fragment shader vec4 slots can use load_input for
    * a subset of their components to skip interpolation for those components.
    * The result of such load_input is a value from a random (not necessarily
    * provoking) vertex. If a value from the provoking vertex is required,
    * the vec4 slot should have no load_interpolated_input instructions.
    *
    * This exposes the AMD capability that allows packing flat inputs with
    * interpolated inputs in a limited number of cases. Normally, flat
    * components must be in a separate vec4 slot to get the value from
    * the provoking vertex. If the compiler can prove that all per-vertex
    * values are equal (convergent, i.e. the provoking vertex doesn't matter),
    * it can put such flat components into any interpolated vec4 slot.
    *
    * It should also be set if the hw can mix flat and interpolated components
    * in the same vec4 slot.
    *
    * This causes nir_opt_varyings to skip interpolation for all varyings
    * that are convergent, and enables better compaction and inter-shader code
    * motion for convergent varyings.
    */
   nir_io_mix_convergent_flat_with_interpolated = BITFIELD_BIT(5),

   /**
    * Whether src_type and dest_type of IO intrinsics are irrelevant and
    * should be ignored by nir_opt_vectorize_io. All drivers that always treat
    * load_input and store_output as untyped and load_interpolated_input as
    * float##bit_size should set this.
    */
   nir_io_vectorizer_ignores_types = BITFIELD_BIT(6),

   /**
    * Whether nir_opt_varyings should never promote convergent FS inputs
    * to flat.
    */
   nir_io_always_interpolate_convergent_fs_inputs = BITFIELD_BIT(7),

   /**
    * Whether the first assigned color channel component should be equal to
    * the first unused VARn component.
    *
    * For example, if the first unused VARn channel is VAR0.z, color channels
    * are assigned in this order:
    * COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y
    *
    * This allows certain drivers to merge outputs if each output sets
    * different components, for example 2 outputs writing VAR0.xy and COL0.z
    * will only use 1 HW output.
    */
   nir_io_compaction_rotates_color_channels = BITFIELD_BIT(8),

   /**
    * Whether to group TES inputs as follows:
    * - inputs used to compute only POS/CLIP outputs are first
    * - inputs used to compute both POS/CLIP outputs and other outputs are next
    * - inputs used to compute only other outputs are last
    */
   nir_io_compaction_groups_tes_inputs_into_pos_and_var_groups = BITFIELD_BIT(9),

   /**
    * RADV expects that high 16 bits of outputs set component >= 4. That's not
    * legal in NIR, but RADV unfortunately relies on it because it's not
    * validated.
    */
   nir_io_radv_intrinsic_component_workaround = BITFIELD_BIT(10),

   /* Options affecting the GLSL compiler or Gallium are below. */

   /**
    * Lower load_deref/store_deref to load_input/store_output/etc. intrinsics.
    * This only affects GLSL compilation and Gallium.
    */
   nir_io_has_intrinsics = BITFIELD_BIT(16),

   /**
    * Whether clip and cull distance arrays should be separate. If this is not
    * set, cull distances will be moved into VARYING_SLOT_CLIP_DISTn after clip
    * distances, and shader_info::clip_distance_array_size will be the index
    * of the first cull distance. nir_lower_clip_cull_distance_array_vars does
    * that.
    */
   nir_io_separate_clip_cull_distance_arrays = BITFIELD_BIT(17),
} nir_io_options;
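
/*
 * Hypothetical example of combining these flags; the actual set is entirely
 * driver-specific:
 *
 *    .io_options = nir_io_has_intrinsics |
 *                  nir_io_16bit_input_output_support |
 *                  nir_io_prefer_scalar_fs_inputs,
 */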

typedef enum {
   nir_lower_packing_op_pack_64_2x32,
   nir_lower_packing_op_unpack_64_2x32,
   nir_lower_packing_op_pack_64_4x16,
   nir_lower_packing_op_unpack_64_4x16,
   nir_lower_packing_op_pack_32_2x16,
   nir_lower_packing_op_unpack_32_2x16,
   nir_lower_packing_op_pack_32_4x8,
   nir_lower_packing_op_unpack_32_4x8,
   nir_lower_packing_num_ops,
} nir_lower_packing_op;
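
/*
 * skip_lower_packing_ops in nir_shader_compiler_options is a bit mask indexed
 * by these enum values. As a sketch (not from any real driver), a backend
 * with native 64-bit pack/unpack might set, using BITFIELD_BIT from
 * util/macros.h:
 *
 *    .skip_lower_packing_ops =
 *       BITFIELD_BIT(nir_lower_packing_op_pack_64_2x32) |
 *       BITFIELD_BIT(nir_lower_packing_op_unpack_64_2x32),
 */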

typedef struct nir_shader_compiler_options {
   bool lower_fdiv;
   bool lower_ffma16;
   bool lower_ffma32;
   bool lower_ffma64;
   bool fuse_ffma16;
   bool fuse_ffma32;
   bool fuse_ffma64;
   bool lower_flrp16;
   bool lower_flrp32;
   /** Lowers 64-bit flrp when the backend does not support doubles */
   bool lower_flrp64;
   bool lower_fpow;
   bool lower_fsat;
   bool lower_fsqrt;
   bool lower_sincos;
   bool lower_fmod;
   /** Lowers ibitfield_extract/ubitfield_extract for 8, 16 & 32 bits. */
   bool lower_bitfield_extract8;
   bool lower_bitfield_extract16;
   bool lower_bitfield_extract;
   /** Lowers bitfield_insert. */
   bool lower_bitfield_insert;
   /** Lowers bitfield_reverse to shifts. */
   bool lower_bitfield_reverse;
   /** Lowers bit_count to shifts. */
   bool lower_bit_count;
   /** Lowers ifind_msb. */
   bool lower_ifind_msb;
   /** Lowers ufind_msb. */
   bool lower_ufind_msb;
   /** Lowers find_lsb to ufind_msb and logic ops */
   bool lower_find_lsb;
   bool lower_uadd_carry;
   bool lower_usub_borrow;
   /** Lowers imul_high/umul_high to 16-bit multiplies and carry operations. */
   bool lower_mul_high;
   bool lower_mul_high16;
   /** lowers fneg to fmul(x, -1.0). Driver must call nir_opt_algebraic_late() */
   bool lower_fneg;
   /** lowers ineg to isub. Driver must call nir_opt_algebraic_late(). */
   bool lower_ineg;
   /** lowers fisnormal to alu ops. */
   bool lower_fisnormal;

   /* lower {slt,sge,seq,sne} to {flt,fge,feq,fneu} + b2f: */
   bool lower_scmp;
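   /* For example, with lower_scmp set, slt(a, b) becomes b2f(flt(a, b)). */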

   /* lower b/fall_equalN/b/fany_nequalN (ex:fany_nequal4 to sne+fdot4+fsat) */
   bool lower_vector_cmp;

   /** enable rules to avoid bit ops */
   bool lower_bitops;

   /** enables rules to lower isign to imin+imax */
   bool lower_isign;

   /** enables rules to lower fsign to fsub and flt */
   bool lower_fsign;

   /** enables rules to lower iabs to ineg+imax */
   bool lower_iabs;

   /** enable rules that avoid generating umax from signed integer ops */
   bool lower_umax;

   /** enable rules that avoid generating umin from signed integer ops */
   bool lower_umin;

   /* lower fmin/fmax with signed zero preserve to fmin/fmax with
    * no_signed_zero, for backends whose fmin/fmax implementations do not
    * implement IEEE-754-2019 semantics for signed zero.
    */
   bool lower_fminmax_signed_zero;

   /* lower fdph to fdot4 */
   bool lower_fdph;
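   /* i.e. fdph(a, b) is lowered to fdot4(vec4(a.x, a.y, a.z, 1.0), b)
    * (illustrative sketch of the homogeneous dot product expansion).
    */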

   /* Does the native fdot instruction replicate its result for four
    * components? If so, then opt_algebraic_late will turn all fdotN
    * instructions into fdotN_replicated instructions.
    */
   bool fdot_replicates;

   /** lowers ffloor to fsub+ffract: */
   bool lower_ffloor;

   /** lowers ffract to fsub+ffloor: */
   bool lower_ffract;

   /** lowers fceil to fneg+ffloor+fneg: */
   bool lower_fceil;

   bool lower_ftrunc;

   /** Lowers fround_even to ffract+feq+csel.
    *
    * Not correct in that it doesn't correctly handle the "_even" part of the
    * rounding, but good enough for DX9 array indexing handling on DX9-class
    * hardware.
    */
   bool lower_fround_even;

   bool lower_ldexp;

   bool lower_pack_half_2x16;
   bool lower_pack_unorm_2x16;
   bool lower_pack_snorm_2x16;
   bool lower_pack_unorm_4x8;
   bool lower_pack_snorm_4x8;
   bool lower_pack_64_2x32;
   bool lower_pack_64_4x16;
   bool lower_pack_32_2x16;
   bool lower_pack_64_2x32_split;
   bool lower_pack_32_2x16_split;
   bool lower_unpack_half_2x16;
   bool lower_unpack_unorm_2x16;
   bool lower_unpack_snorm_2x16;
   bool lower_unpack_unorm_4x8;
   bool lower_unpack_snorm_4x8;
   bool lower_unpack_64_2x32_split;
   bool lower_unpack_32_2x16_split;

   bool lower_pack_split;

   bool lower_extract_byte;
   bool lower_extract_word;
   bool lower_insert_byte;
   bool lower_insert_word;

   /* Indicates that the driver only has zero-based vertex id */
   bool vertex_id_zero_based;

   /**
    * If enabled, gl_BaseVertex will be lowered as:
    * is_indexed_draw (~0/0) & firstvertex
    */
   bool lower_base_vertex;

   /* Indicates that gl_InstanceIndex already includes base index
    * and doesn't require further lowering.
    */
   bool instance_id_includes_base_index;

   /**
    * If enabled, gl_HelperInvocation will be lowered as:
    *
    * - non-sample-shading: sample_mask_in == 0
    * - sample shading: !((1 << sample_id) & sample_mask_in)
    *
    * For this to be correct, fs.uses_sample_shading must be set to true when
    * sample shading is enabled. This means that you need shader variants to
    * set the flag when Vulkan's
    * VkPipelineMultisampleStateCreateInfo->sampleShadingEnable or GL's
    * glMinSampleShading() is enabled.
    *
    * This depends on hardware implementation details that may not hold for
    * all hardware, in particular that the FS is only executed for covered
    * samples or for helper invocations. So, do not blindly enable this
    * option.
    *
    * Note: See also issue #22 in ARB_shader_image_load_store
    */
   bool lower_helper_invocation;

   /**
    * Convert gl_SampleMaskIn to gl_HelperInvocation as follows:
    *
    * gl_SampleMaskIn == 0 ---> gl_HelperInvocation
    * gl_SampleMaskIn != 0 ---> !gl_HelperInvocation
    */
   bool optimize_sample_mask_in;

   /**
    * Optimize load_front_face ? a : -a to load_front_face_fsign * a
    */
   bool optimize_load_front_face_fsign;

   /**
    * Optimize boolean reductions of quad broadcasts. This should only be
    * enabled if nir_intrinsic_reduce supports INCLUDE_HELPERS.
    */
   bool optimize_quad_vote_to_reduce;

   bool lower_cs_local_index_to_id;
   bool lower_cs_local_id_to_index;
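   /* A sketch of the relationship these two lowerings are built on (the
    * actual code is emitted by the compute system-value lowering pass):
    *    local_index = id.x + id.y * size.x + id.z * size.x * size.y
    */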

   /* Prevents lowering global_invocation_id to be in terms of workgroup_id */
   bool has_cs_global_id;

   bool lower_device_index_to_zero;

   /* Set if nir_lower_pntc_ytransform() should invert gl_PointCoord.
    * Either when frame buffer is flipped or GL_POINT_SPRITE_COORD_ORIGIN
    * is GL_LOWER_LEFT.
    */
   bool lower_wpos_pntc;

   /**
    * Set if nir_op_[iu]hadd and nir_op_[iu]rhadd instructions should be
    * lowered to simple arithmetic.
    *
    * If this flag is set, the lowering will be applied to all bit-sizes of
    * these instructions.
    *
    * :c:member:`lower_hadd64`
    */
   bool lower_hadd;

   /**
    * Set if only 64-bit nir_op_[iu]hadd and nir_op_[iu]rhadd instructions
    * should be lowered to simple arithmetic.
    *
    * If this flag is set, the lowering will be applied to only 64-bit
    * versions of these instructions.
    *
    * :c:member:`lower_hadd`
    */
   bool lower_hadd64;

   /**
    * Set if nir_op_uadd_sat should be lowered to simple arithmetic.
    *
    * If this flag is set, the lowering will be applied to all bit-sizes of
    * these instructions.
    */
   bool lower_uadd_sat;
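   /* One common lowering (a sketch, not necessarily the exact pass output):
    *    uadd_sat(a, b) -> bcsel(ult(iadd(a, b), a), ~0, iadd(a, b))
    */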

   /**
    * Set if nir_op_usub_sat should be lowered to simple arithmetic.
    *
    * If this flag is set, the lowering will be applied to all bit-sizes of
    * these instructions.
    */
   bool lower_usub_sat;

   /**
    * Set if nir_op_iadd_sat and nir_op_isub_sat should be lowered to simple
    * arithmetic.
    *
    * If this flag is set, the lowering will be applied to all bit-sizes of
    * these instructions.
    */
   bool lower_iadd_sat;

   /**
    * Set if imul_32x16 and umul_32x16 should be lowered to simple
    * arithmetic.
    */
   bool lower_mul_32x16;

   /**
    * Set if bf2f and f2bf should be lowered to arithmetic.
    */
   bool lower_bfloat16_conversions;

   bool vectorize_tess_levels;
   bool lower_to_scalar;
   nir_instr_filter_cb lower_to_scalar_filter;

   /**
    * Disables potentially harmful algebraic transformations for architectures
    * with SIMD-within-a-register semantics.
    *
    * Note, to actually vectorize 16bit instructions, use nir_opt_vectorize()
    * with a suitable callback function.
    */
   bool vectorize_vec2_16bit;

   /**
    * Should the linker unify inputs_read/outputs_written between adjacent
    * shader stages which are linked into a single program?
    */
   bool unify_interfaces;

   /**
    * Whether nir_lower_io() will lower interpolateAt functions to
    * load_interpolated_input intrinsics.
    *
    * Unlike nir_lower_io_use_interpolated_input_intrinsics this will only
    * lower these functions and leave input load intrinsics untouched.
    */
   bool lower_interpolate_at;

   /* Lowers when 32x32->64 bit multiplication is not supported */
   bool lower_mul_2x32_64;

   /* Indicates that urol and uror are supported */
   bool has_rotate8;
   bool has_rotate16;
   bool has_rotate32;

   /** Backend supports shfr */
   bool has_shfr32;

   /** Backend supports ternary addition */
   bool has_iadd3;

   /**
    * Backend supports amul and would like them generated whenever
    * possible. This is stronger than has_imul24 for amul, but does not imply
    * support for imul24.
    */
   bool has_amul;

   /**
    * Backend supports imul24, and would like to use it (when possible)
    * for address/offset calculation. If true, driver should call
    * nir_lower_amul(). (If not set, amul will automatically be lowered
    * to imul.)
    */
   bool has_imul24;

   /** Backend supports umul24, if not set umul24 will automatically be lowered
    * to imul with masked inputs */
   bool has_umul24;

   /** Backend supports imul24_relaxed and umul24_relaxed, if not set they will
    * be lowered to imul24, umul24 or imul.
    */
   bool has_mul24_relaxed;

   /** Backend supports 32-bit imad */
   bool has_imad32;

   /** Backend supports umad24, if not set umad24 will automatically be lowered
    * to imul with masked inputs and iadd */
   bool has_umad24;

   /* Backend supports fused compare against zero and csel */
   bool has_fused_comp_and_csel;
   /* Backend supports fused int eq/ne against zero and csel. */
   bool has_icsel_eqz64;
   bool has_icsel_eqz32;
   bool has_icsel_eqz16;

   /* Backend supports fneo, fequ, fltu, fgeu. */
   bool has_fneo_fcmpu;

   /* Backend supports ford and funord. */
   bool has_ford_funord;

   /** Backend supports fsub, if not set fsub will automatically be lowered to
    * fadd(x, fneg(y)). If true, driver should call nir_opt_algebraic_late(). */
   bool has_fsub;

   /** Backend supports isub, if not set isub will automatically be lowered to
    * iadd(x, ineg(y)). If true, driver should call nir_opt_algebraic_late(). */
   bool has_isub;

   /** Backend supports pack_32_4x8 or pack_32_4x8_split. */
   bool has_pack_32_4x8;

   /** Backend supports nir_load_texture_scale and prefers it over txs for nir
    * lowerings. */
   bool has_texture_scaling;

   /** Backend supports sdot_4x8_iadd. */
   bool has_sdot_4x8;

   /** Backend supports udot_4x8_uadd. */
   bool has_udot_4x8;

   /** Backend supports sudot_4x8_iadd. */
   bool has_sudot_4x8;

   /** Backend supports sdot_4x8_iadd_sat. */
   bool has_sdot_4x8_sat;

   /** Backend supports udot_4x8_uadd_sat. */
   bool has_udot_4x8_sat;

   /** Backend supports sudot_4x8_iadd_sat. */
   bool has_sudot_4x8_sat;

   /** Backend supports sdot_2x16 and udot_2x16 opcodes. */
   bool has_dot_2x16;

   /** Backend supports bfdot2_bfadd opcode. */
   bool has_bfdot2_bfadd;

   /** Backend supports fmulz (and ffmaz if lower_ffma32=false) */
   bool has_fmulz;

   /**
    * Backend supports fmulz (and ffmaz if lower_ffma32=false) but only if
    * FLOAT_CONTROLS_DENORM_PRESERVE_FP32 is not set
    */
   bool has_fmulz_no_denorms;

   /** Backend supports 32bit ufind_msb_rev and ifind_msb_rev. */
   bool has_find_msb_rev;

   /** Backend supports pack_half_2x16_rtz_split. */
   bool has_pack_half_2x16_rtz;

   /** Backend supports bitz/bitnz. */
   bool has_bit_test;

   /** Backend supports ubfe/ibfe. */
   bool has_bfe;

   /** Backend supports bfm. */
   bool has_bfm;

   /** Backend supports bfi. */
   bool has_bfi;

   /** Backend supports bitfield_select. */
   bool has_bitfield_select;

   /** Backend supports uclz. */
   bool has_uclz;

   /** Backend supports msad_u4x8. */
   bool has_msad;

   /** Backend supports f2e4m3fn_satfn */
   bool has_f2e4m3fn_satfn;

   /** Backend supports load_global_bounded intrinsics. */
   bool has_load_global_bounded;

   /**
    * Is this the Intel vec4 backend?
    *
    * Used to inhibit algebraic optimizations that are known to be harmful on
    * the Intel vec4 backend. This is generally applicable to any
    * optimization that might cause more immediate values to be used in
    * 3-source (e.g., ffma and flrp) instructions.
    */
   bool intel_vec4;

   /**
    * For most Intel GPUs, all ternary operations such as FMA and BFE cannot
    * have immediates, so two to three instructions may eventually be needed.
    */
   bool avoid_ternary_with_two_constants;

   /** Whether 8-bit ALU is supported. */
   bool support_8bit_alu;

   /** Whether 16-bit ALU is supported. */
   bool support_16bit_alu;

   unsigned max_unroll_iterations;
   unsigned max_unroll_iterations_aggressive;
   unsigned max_unroll_iterations_fp64;

   bool lower_uniforms_to_ubo;

   /* Specifies if indirect sampler array access will trigger forced loop
    * unrolling.
    */
   bool force_indirect_unrolling_sampler;

   /* Some older drivers don't support GLSL versions with the concept of flat
    * varyings and also don't support integers. This setting helps us avoid
    * marking varyings as flat and potentially having them changed to ints via
    * varying packing.
    */
   bool no_integers;

   /**
    * Specifies which type of indirectly accessed variables should force
    * loop unrolling.
    */
   nir_variable_mode force_indirect_unrolling;

   bool driver_functions;

   /**
    * If true, the driver will call nir_lower_int64 itself and the frontend
    * should not do so. This may enable better optimization around address
    * modes.
    */
   bool late_lower_int64;
   nir_lower_int64_options lower_int64_options;
   nir_lower_doubles_options lower_doubles_options;
   nir_divergence_options divergence_analysis_options;

   /**
    * The masks of shader stages that support indirect indexing with
    * load_input and store_output intrinsics. It's used by
    * nir_lower_io_passes.
    */
   uint8_t support_indirect_inputs;
   uint8_t support_indirect_outputs;

   /** Store the variable offset into the intrinsic range_base instead
    * of adding it to the image index.
    */
   bool lower_image_offset_to_range_base;

   /** Store the variable offset into the intrinsic range_base instead
    * of adding it to the atomic source.
    */
   bool lower_atomic_offset_to_range_base;

   /** Don't convert medium-precision casts (e.g. f2fmp) into concrete
    * type casts (e.g. f2f16).
    */
   bool preserve_mediump;

   /** lowers fquantize2f16 to alu ops. */
   bool lower_fquantize2f16;

   /** Lower f2f16 to f2f16_rtz when execution mode is not rtne. */
   bool force_f2f16_rtz;

   /** Lower VARYING_SLOT_LAYER in FS to SYSTEM_VALUE_LAYER_ID. */
   bool lower_layer_fs_input_to_sysval;

   /** clip/cull distance and tess level arrays use compact semantics */
   bool compact_arrays;

   /**
    * Whether discard gets emitted as nir_intrinsic_demote.
    * Otherwise, nir_intrinsic_terminate is being used.
    */
   bool discard_is_demote;

   /**
    * Whether the new-style derivative intrinsics are supported. If false,
    * legacy ALU derivative ops will be emitted. This transitional option will
    * be removed once all drivers are converted to derivative intrinsics.
    */
   bool has_ddx_intrinsics;

   /** Whether derivative intrinsics must be scalarized. */
   bool scalarize_ddx;

   /**
    * Assign a range of driver locations to per-view outputs, with unique
    * slots for each view. If unset, per-view outputs will be treated
    * similarly to other arrayed IO, and only slots for one view will be
    * assigned. Regardless of this setting, per-view outputs are only assigned
    * slots for one value in var->data.location.
    */
   bool per_view_unique_driver_locations;

   /**
    * Emit nir_intrinsic_store_per_view_output with compacted view indices
    * rather than absolute view indices. When using compacted indices, the Nth
    * index refers to the Nth enabled view, not the Nth absolute view. For
    * example, with view mask 0b1010, compacted index 0 is absolute index 1,
    * and compacted index 1 is absolute index 3. Note that compacted view
    * indices do not correspond directly to gl_ViewIndex.
    *
    * If compact_view_index is unset, per-view indices must be constant before
    * nir_lower_io. This can be guaranteed by calling nir_lower_io_temporaries
    * first.
    */
   bool compact_view_index;

   /** Options determining lowering and behavior of inputs and outputs. */
   nir_io_options io_options;

   /**
    * Bit mask of nir_lower_packing_op to skip lowering some nir ops in
    * nir_lower_packing().
    */
   unsigned skip_lower_packing_ops;

   /** Driver callback where drivers can define how to lower mediump.
    * Used by nir_lower_io_passes.
    */
   void (*lower_mediump_io)(struct nir_shader *nir);

   /**
    * Return the maximum cost of an expression that's written to a shader
    * output that can be moved into the next shader to remove that output.
    *
    * Currently only uniform expressions are moved. A uniform expression is
    * any ALU expression sourcing only constants, uniforms, and UBO loads.
    *
    * Set to NULL or return 0 if you only want to propagate constants from
    * outputs to inputs.
    *
    * Drivers can set the maximum cost based on the types of consecutive
    * shaders or shader SHA1s.
    *
    * Drivers should also set "varying_estimate_instr_cost".
    */
   unsigned (*varying_expression_max_cost)(struct nir_shader *consumer,
                                           struct nir_shader *producer);

   /**
    * Return the cost of an instruction that could be moved into the next
    * shader. If the cost of all instructions in an expression is <=
    * varying_expression_max_cost(), the instruction is moved.
    *
    * When this callback isn't set, nir_opt_varyings uses its own version.
    */
   unsigned (*varying_estimate_instr_cost)(struct nir_instr *instr);

   /**
    * When the varying_expression_max_cost callback isn't set, this specifies
    * the maximum cost of a uniform expression that is allowed to be moved
    * from output stores into the next shader stage to eliminate those output
    * stores and corresponding inputs.
    *
    * 0 only allows propagating constants written to output stores to
    * the next shader.
    *
    * At least 2 is required for moving a uniform stored in an output into
    * the next shader according to default_varying_estimate_instr_cost.
    */
   unsigned max_varying_expression_cost;
} nir_shader_compiler_options;
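
/*
 * Purely illustrative usage sketch (the variable names are hypothetical and
 * the options are chosen arbitrarily): a backend typically defines one static
 * options struct and passes it to the frontend when creating shaders, e.g.:
 *
 *    static const nir_shader_compiler_options example_options = {
 *       .lower_fdiv = true,
 *       .lower_scmp = true,
 *       .fuse_ffma32 = true,
 *       .max_unroll_iterations = 32,
 *    };
 *
 *    nir_shader *s = nir_shader_create(mem_ctx, MESA_SHADER_FRAGMENT,
 *                                      &example_options, NULL);
 */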

#ifdef __cplusplus
}
#endif

#endif /* NIR_SHADER_COMPILER_OPTIONS_H */