/* * Copyright © 2014 Connor Abbott * SPDX-License-Identifier: MIT */ #ifndef NIR_SHADER_COMPILER_OPTIONS_H #define NIR_SHADER_COMPILER_OPTIONS_H #include "util/macros.h" #include "nir_defines.h" #include #include #ifdef __cplusplus extern "C" { #endif typedef enum { nir_lower_imul64 = (1 << 0), nir_lower_isign64 = (1 << 1), /** Lower all int64 modulus and division opcodes */ nir_lower_divmod64 = (1 << 2), /** Lower all 64-bit umul_high and imul_high opcodes */ nir_lower_imul_high64 = (1 << 3), nir_lower_bcsel64 = (1 << 4), nir_lower_icmp64 = (1 << 5), nir_lower_iadd64 = (1 << 6), nir_lower_iabs64 = (1 << 7), nir_lower_ineg64 = (1 << 8), nir_lower_logic64 = (1 << 9), nir_lower_minmax64 = (1 << 10), nir_lower_shift64 = (1 << 11), nir_lower_imul_2x32_64 = (1 << 12), nir_lower_extract64 = (1 << 13), nir_lower_ufind_msb64 = (1 << 14), nir_lower_bit_count64 = (1 << 15), nir_lower_subgroup_shuffle64 = (1 << 16), nir_lower_scan_reduce_bitwise64 = (1 << 17), nir_lower_scan_reduce_iadd64 = (1 << 18), nir_lower_vote_ieq64 = (1 << 19), nir_lower_usub_sat64 = (1 << 20), nir_lower_iadd_sat64 = (1 << 21), nir_lower_find_lsb64 = (1 << 22), nir_lower_conv64 = (1 << 23), nir_lower_uadd_sat64 = (1 << 24), nir_lower_iadd3_64 = (1 << 25), nir_lower_bitfield_reverse64 = (1 << 26), nir_lower_bitfield_extract64 = (1 << 27), } nir_lower_int64_options; typedef enum { nir_lower_drcp = (1 << 0), nir_lower_dsqrt = (1 << 1), nir_lower_drsq = (1 << 2), nir_lower_dtrunc = (1 << 3), nir_lower_dfloor = (1 << 4), nir_lower_dceil = (1 << 5), nir_lower_dfract = (1 << 6), nir_lower_dround_even = (1 << 7), nir_lower_dmod = (1 << 8), nir_lower_dsub = (1 << 9), nir_lower_ddiv = (1 << 10), nir_lower_dsign = (1 << 11), nir_lower_dminmax = (1 << 12), nir_lower_dsat = (1 << 13), nir_lower_fp64_full_software = (1 << 14), } nir_lower_doubles_options; typedef enum { nir_divergence_single_prim_per_subgroup = (1 << 0), nir_divergence_single_patch_per_tcs_subgroup = (1 << 1), nir_divergence_single_patch_per_tes_subgroup = (1 << 2), nir_divergence_view_index_uniform = (1 << 3), nir_divergence_single_frag_shading_rate_per_subgroup = (1 << 4), nir_divergence_multiple_workgroup_per_compute_subgroup = (1 << 5), nir_divergence_shader_record_ptr_uniform = (1 << 6), nir_divergence_uniform_load_tears = (1 << 7), /* If used, this allows phis for divergent merges with undef and a uniform source to be considered uniform */ nir_divergence_ignore_undef_if_phi_srcs = (1 << 8), /* Whether to compute vertex divergence (meaning between vertices * of the same primitive) instead of subgroup invocation divergence * (between invocations of the same subgroup). For example, patch input * loads are always convergent, while subgroup intrinsics are divergent. */ nir_divergence_vertex = (1 << 11), } nir_divergence_options; /** An instruction filtering callback * * Returns true if the instruction should be processed and false otherwise. */ typedef bool (*nir_instr_filter_cb)(const nir_instr *, const void *); typedef enum { /** * Whether a fragment shader can interpolate the same input multiple times * with different modes (smooth, noperspective) and locations (pixel, * centroid, sample, at_offset, at_sample), excluding the flat mode. * * This matches AMD GPU flexibility and limitations and is a superset of * the GL4 requirement that each input can be interpolated at its specified * location, and then also as centroid, at_offset, and at_sample. */ nir_io_has_flexible_input_interpolation_except_flat = BITFIELD_BIT(0), /** * nir_opt_varyings compacts (relocates) components of varyings by * rewriting their locations completely, effectively moving components of * varyings between slots. This option forces nir_opt_varyings to make * VARYING_SLOT_POS unused by moving its contents to VARn if the consumer * is not FS. If this option is not set and POS is unused, it moves * components of VARn to POS until it's fully used. */ nir_io_dont_use_pos_for_non_fs_varyings = BITFIELD_BIT(1), nir_io_16bit_input_output_support = BITFIELD_BIT(2), /** * Implement mediump inputs and outputs as normal 32-bit IO. * Causes the mediump flag to be not set for IO semantics, essentially * destroying any mediump-related IO information in the shader. */ nir_io_mediump_is_32bit = BITFIELD_BIT(3), /** * Whether nir_opt_vectorize_io should ignore FS inputs. */ nir_io_prefer_scalar_fs_inputs = BITFIELD_BIT(4), /** * Whether interpolated fragment shader vec4 slots can use load_input for * a subset of its components to skip interpolation for those components. * The result of such load_input is a value from a random (not necessarily * provoking) vertex. If a value from the provoking vertex is required, * the vec4 slot should have no load_interpolated_input instructions. * * This exposes the AMD capability that allows packing flat inputs with * interpolated inputs in a limited number of cases. Normally, flat * components must be in a separate vec4 slot to get the value from * the provoking vertex. If the compiler can prove that all per-vertex * values are equal (convergent, i.e. the provoking vertex doesn't matter), * it can put such flat components into any interpolated vec4 slot. * * It should also be set if the hw can mix flat and interpolated components * in the same vec4 slot. * * This causes nir_opt_varyings to skip interpolation for all varyings * that are convergent, and enables better compaction and inter-shader code * motion for convergent varyings. */ nir_io_mix_convergent_flat_with_interpolated = BITFIELD_BIT(5), /** * Whether src_type and dest_type of IO intrinsics are irrelevant and * should be ignored by nir_opt_vectorize_io. All drivers that always treat * load_input and store_output as untyped and load_interpolated_input as * float##bit_size should set this. */ nir_io_vectorizer_ignores_types = BITFIELD_BIT(6), /** * Whether nir_opt_varyings should never promote convergent FS inputs * to flat. */ nir_io_always_interpolate_convergent_fs_inputs = BITFIELD_BIT(7), /** * Whether the first assigned color channel component should be equal to * the first unused VARn component. * * For example, if the first unused VARn channel is VAR0.z, color channels * are assigned in this order: * COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y * * This allows certain drivers to merge outputs if each output sets * different components, for example 2 outputs writing VAR0.xy and COL0.z * will only use 1 HW output. */ nir_io_compaction_rotates_color_channels = BITFIELD_BIT(8), /** * Whether to group TES inputs as follows: * - inputs used to compute only POS/CLIP outputs are first * - inputs used to compute both POS/CLIP outputs and other outputs are next * - inputs used to compute only other outputs are last */ nir_io_compaction_groups_tes_inputs_into_pos_and_var_groups = BITFIELD_BIT(9), /** * RADV expects that high 16 bits of outputs set component >= 4. That's not * legal in NIR, but RADV unfortunately relies on it because it's not * validated. */ nir_io_radv_intrinsic_component_workaround = BITFIELD_BIT(10), /* Options affecting the GLSL compiler or Gallium are below. */ /** * Lower load_deref/store_deref to load_input/store_output/etc. intrinsics. * This is only affects GLSL compilation and Gallium. */ nir_io_has_intrinsics = BITFIELD_BIT(16), /** * Whether clip and cull distance arrays should be separate. If this is not * set, cull distances will be moved into VARYING_SLOT_CLIP_DISTn after clip * distances, and shader_info::clip_distance_array_size will be the index * of the first cull distance. nir_lower_clip_cull_distance_array_vars does * that. */ nir_io_separate_clip_cull_distance_arrays = BITFIELD_BIT(17), } nir_io_options; typedef enum { nir_lower_packing_op_pack_64_2x32, nir_lower_packing_op_unpack_64_2x32, nir_lower_packing_op_pack_64_4x16, nir_lower_packing_op_unpack_64_4x16, nir_lower_packing_op_pack_32_2x16, nir_lower_packing_op_unpack_32_2x16, nir_lower_packing_op_pack_32_4x8, nir_lower_packing_op_unpack_32_4x8, nir_lower_packing_num_ops, } nir_lower_packing_op; typedef struct nir_shader_compiler_options { bool lower_fdiv; bool lower_ffma16; bool lower_ffma32; bool lower_ffma64; bool fuse_ffma16; bool fuse_ffma32; bool fuse_ffma64; bool lower_flrp16; bool lower_flrp32; /** Lowers flrp when it does not support doubles */ bool lower_flrp64; bool lower_fpow; bool lower_fsat; bool lower_fsqrt; bool lower_sincos; bool lower_fmod; /** Lowers ibitfield_extract/ubitfield_extract for 8, 16 & 32 bits. */ bool lower_bitfield_extract8; bool lower_bitfield_extract16; bool lower_bitfield_extract; /** Lowers bitfield_insert. */ bool lower_bitfield_insert; /** Lowers bitfield_reverse to shifts. */ bool lower_bitfield_reverse; /** Lowers bit_count to shifts. */ bool lower_bit_count; /** Lowers ifind_msb. */ bool lower_ifind_msb; /** Lowers ufind_msb. */ bool lower_ufind_msb; /** Lowers find_lsb to ufind_msb and logic ops */ bool lower_find_lsb; bool lower_uadd_carry; bool lower_usub_borrow; /** Lowers imul_high/umul_high to 16-bit multiplies and carry operations. */ bool lower_mul_high; bool lower_mul_high16; /** lowers fneg to fmul(x, -1.0). Driver must call nir_opt_algebraic_late() */ bool lower_fneg; /** lowers ineg to isub. Driver must call nir_opt_algebraic_late(). */ bool lower_ineg; /** lowers fisnormal to alu ops. */ bool lower_fisnormal; /* lower {slt,sge,seq,sne} to {flt,fge,feq,fneu} + b2f: */ bool lower_scmp; /* lower b/fall_equalN/b/fany_nequalN (ex:fany_nequal4 to sne+fdot4+fsat) */ bool lower_vector_cmp; /** enable rules to avoid bit ops */ bool lower_bitops; /** enables rules to lower isign to imin+imax */ bool lower_isign; /** enables rules to lower fsign to fsub and flt */ bool lower_fsign; /** enables rules to lower iabs to ineg+imax */ bool lower_iabs; /** enable rules that avoid generating umax from signed integer ops */ bool lower_umax; /** enable rules that avoid generating umin from signed integer ops */ bool lower_umin; /* lower fmin/fmax with signed zero preserve to fmin/fmax with * no_signed_zero, for backends whose fmin/fmax implementations do not * implement IEEE-754-2019 semantics for signed zero. */ bool lower_fminmax_signed_zero; /* lower fdph to fdot4 */ bool lower_fdph; /* Does the native fdot instruction replicate its result for four * components? If so, then opt_algebraic_late will turn all fdotN * instructions into fdotN_replicated instructions. */ bool fdot_replicates; /** lowers ffloor to fsub+ffract: */ bool lower_ffloor; /** lowers ffract to fsub+ffloor: */ bool lower_ffract; /** lowers fceil to fneg+ffloor+fneg: */ bool lower_fceil; bool lower_ftrunc; /** Lowers fround_even to ffract+feq+csel. * * Not correct in that it doesn't correctly handle the "_even" part of the * rounding, but good enough for DX9 array indexing handling on DX9-class * hardware. */ bool lower_fround_even; bool lower_ldexp; bool lower_pack_half_2x16; bool lower_pack_unorm_2x16; bool lower_pack_snorm_2x16; bool lower_pack_unorm_4x8; bool lower_pack_snorm_4x8; bool lower_pack_64_2x32; bool lower_pack_64_4x16; bool lower_pack_32_2x16; bool lower_pack_64_2x32_split; bool lower_pack_32_2x16_split; bool lower_unpack_half_2x16; bool lower_unpack_unorm_2x16; bool lower_unpack_snorm_2x16; bool lower_unpack_unorm_4x8; bool lower_unpack_snorm_4x8; bool lower_unpack_64_2x32_split; bool lower_unpack_32_2x16_split; bool lower_pack_split; bool lower_extract_byte; bool lower_extract_word; bool lower_insert_byte; bool lower_insert_word; /* Indicates that the driver only has zero-based vertex id */ bool vertex_id_zero_based; /** * If enabled, gl_BaseVertex will be lowered as: * is_indexed_draw (~0/0) & firstvertex */ bool lower_base_vertex; /* Indicates that gl_InstanceIndex already includes base index * and doesn't require further lowering. */ bool instance_id_includes_base_index; /** * If enabled, gl_HelperInvocation will be lowered as: * * - non-sample-shading: sample_mask_in == 0. * - sample shading: !((1 << sample_id) & sample_mask_in)) * * For this to be correct, it requires that fs.uses_sample_shading is set to * true when sample shading is enabled. This means that you need shader * variants to set the flag when Vulkan's * VkPipelineMultisampleStateCreateInfo->sampleShadingEnable or GL's * glMinSampleshading() are enabled. * * This depends on some possibly hw implementation details, which may * not be true for all hw. In particular that the FS is only executed * for covered samples or for helper invocations. So, do not blindly * enable this option. * * Note: See also issue #22 in ARB_shader_image_load_store */ bool lower_helper_invocation; /** * Convert gl_SampleMaskIn to gl_HelperInvocation as follows: * * gl_SampleMaskIn == 0 ---> gl_HelperInvocation * gl_SampleMaskIn != 0 ---> !gl_HelperInvocation */ bool optimize_sample_mask_in; /** * Optimize load_front_face ? a : -a to load_front_face_fsign * a */ bool optimize_load_front_face_fsign; /** * Optimize boolean reductions of quad broadcasts. This should only be enabled if * nir_intrinsic_reduce supports INCLUDE_HELPERS. */ bool optimize_quad_vote_to_reduce; bool lower_cs_local_index_to_id; bool lower_cs_local_id_to_index; /* Prevents lowering global_invocation_id to be in terms of workgroup_id */ bool has_cs_global_id; bool lower_device_index_to_zero; /* Set if nir_lower_pntc_ytransform() should invert gl_PointCoord. * Either when frame buffer is flipped or GL_POINT_SPRITE_COORD_ORIGIN * is GL_LOWER_LEFT. */ bool lower_wpos_pntc; /** * Set if nir_op_[iu]hadd and nir_op_[iu]rhadd instructions should be * lowered to simple arithmetic. * * If this flag is set, the lowering will be applied to all bit-sizes of * these instructions. * * :c:member:`lower_hadd64` */ bool lower_hadd; /** * Set if only 64-bit nir_op_[iu]hadd and nir_op_[iu]rhadd instructions * should be lowered to simple arithmetic. * * If this flag is set, the lowering will be applied to only 64-bit * versions of these instructions. * * :c:member:`lower_hadd` */ bool lower_hadd64; /** * Set if nir_op_uadd_sat should be lowered to simple arithmetic. * * If this flag is set, the lowering will be applied to all bit-sizes of * these instructions. */ bool lower_uadd_sat; /** * Set if nir_op_usub_sat should be lowered to simple arithmetic. * * If this flag is set, the lowering will be applied to all bit-sizes of * these instructions. */ bool lower_usub_sat; /** * Set if nir_op_iadd_sat and nir_op_isub_sat should be lowered to simple * arithmetic. * * If this flag is set, the lowering will be applied to all bit-sizes of * these instructions. */ bool lower_iadd_sat; /** * Set if imul_32x16 and umul_32x16 should be lowered to simple * arithmetic. */ bool lower_mul_32x16; /** * Set if bf2f and f2bf should be lowered to arithmetic. */ bool lower_bfloat16_conversions; bool vectorize_tess_levels; bool lower_to_scalar; nir_instr_filter_cb lower_to_scalar_filter; /** * Disables potentially harmful algebraic transformations for architectures * with SIMD-within-a-register semantics. * * Note, to actually vectorize 16bit instructions, use nir_opt_vectorize() * with a suitable callback function. */ bool vectorize_vec2_16bit; /** * Should the linker unify inputs_read/outputs_written between adjacent * shader stages which are linked into a single program? */ bool unify_interfaces; /** * Whether nir_lower_io() will lower interpolateAt functions to * load_interpolated_input intrinsics. * * Unlike nir_lower_io_use_interpolated_input_intrinsics this will only * lower these functions and leave input load intrinsics untouched. */ bool lower_interpolate_at; /* Lowers when 32x32->64 bit multiplication is not supported */ bool lower_mul_2x32_64; /* Indicates that urol and uror are supported */ bool has_rotate8; bool has_rotate16; bool has_rotate32; /** Backend supports shfr */ bool has_shfr32; /** Backend supports ternary addition */ bool has_iadd3; /** * Backend supports amul and would like them generated whenever * possible. This is stronger than has_imul24 for amul, but does not imply * support for imul24. */ bool has_amul; /** * Backend supports imul24, and would like to use it (when possible) * for address/offset calculation. If true, driver should call * nir_lower_amul(). (If not set, amul will automatically be lowered * to imul.) */ bool has_imul24; /** Backend supports umul24, if not set umul24 will automatically be lowered * to imul with masked inputs */ bool has_umul24; /** Backend supports imul24_relaxed and umul24_relaxed, if not set they will be lowered * to imul24, umul24 or imul. */ bool has_mul24_relaxed; /** Backend supports 32-bit imad */ bool has_imad32; /** Backend supports umad24, if not set umad24 will automatically be lowered * to imul with masked inputs and iadd */ bool has_umad24; /* Backend supports fused compare against zero and csel */ bool has_fused_comp_and_csel; /* Backend supports fused int eq/ne against zero and csel. */ bool has_icsel_eqz64; bool has_icsel_eqz32; bool has_icsel_eqz16; /* Backend supports fneo, fequ, fltu, fgeu. */ bool has_fneo_fcmpu; /* Backend supports ford and funord. */ bool has_ford_funord; /** Backend supports fsub, if not set fsub will automatically be lowered to * fadd(x, fneg(y)). If true, driver should call nir_opt_algebraic_late(). */ bool has_fsub; /** Backend supports isub, if not set isub will automatically be lowered to * iadd(x, ineg(y)). If true, driver should call nir_opt_algebraic_late(). */ bool has_isub; /** Backend supports pack_32_4x8 or pack_32_4x8_split. */ bool has_pack_32_4x8; /** Backend supports nir_load_texture_scale and prefers it over txs for nir * lowerings. */ bool has_texture_scaling; /** Backend supports sdot_4x8_iadd. */ bool has_sdot_4x8; /** Backend supports udot_4x8_uadd. */ bool has_udot_4x8; /** Backend supports sudot_4x8_iadd. */ bool has_sudot_4x8; /** Backend supports sdot_4x8_iadd_sat. */ bool has_sdot_4x8_sat; /** Backend supports udot_4x8_uadd_sat. */ bool has_udot_4x8_sat; /** Backend supports sudot_4x8_iadd_sat. */ bool has_sudot_4x8_sat; /** Backend supports sdot_2x16 and udot_2x16 opcodes. */ bool has_dot_2x16; /** Backend supports bfdot2_bfadd opcode. */ bool has_bfdot2_bfadd; /** Backend supports fmulz (and ffmaz if lower_ffma32=false) */ bool has_fmulz; /** * Backend supports fmulz (and ffmaz if lower_ffma32=false) but only if * FLOAT_CONTROLS_DENORM_PRESERVE_FP32 is not set */ bool has_fmulz_no_denorms; /** Backend supports 32bit ufind_msb_rev and ifind_msb_rev. */ bool has_find_msb_rev; /** Backend supports pack_half_2x16_rtz_split. */ bool has_pack_half_2x16_rtz; /** Backend supports bitz/bitnz. */ bool has_bit_test; /** Backend supports ubfe/ibfe. */ bool has_bfe; /** Backend supports bfm. */ bool has_bfm; /** Backend supports bfi. */ bool has_bfi; /** Backend supports bitfield_select. */ bool has_bitfield_select; /** Backend supports uclz. */ bool has_uclz; /** Backend support msad_u4x8. */ bool has_msad; /** Backend supports f2e4m3fn_satfn */ bool has_f2e4m3fn_satfn; /** Backend supports load_global_bounded intrinsics. */ bool has_load_global_bounded; /** * Is this the Intel vec4 backend? * * Used to inhibit algebraic optimizations that are known to be harmful on * the Intel vec4 backend. This is generally applicable to any * optimization that might cause more immediate values to be used in * 3-source (e.g., ffma and flrp) instructions. */ bool intel_vec4; /** * For most Intel GPUs, all ternary operations such as FMA and BFE cannot * have immediates, so two to three instructions may eventually be needed. */ bool avoid_ternary_with_two_constants; /** Whether 8-bit ALU is supported. */ bool support_8bit_alu; /** Whether 16-bit ALU is supported. */ bool support_16bit_alu; unsigned max_unroll_iterations; unsigned max_unroll_iterations_aggressive; unsigned max_unroll_iterations_fp64; bool lower_uniforms_to_ubo; /* Specifies if indirect sampler array access will trigger forced loop * unrolling. */ bool force_indirect_unrolling_sampler; /* Some older drivers don't support GLSL versions with the concept of flat * varyings and also don't support integers. This setting helps us avoid * marking varyings as flat and potentially having them changed to ints via * varying packing. */ bool no_integers; /** * Specifies which type of indirectly accessed variables should force * loop unrolling. */ nir_variable_mode force_indirect_unrolling; bool driver_functions; /** * If true, the driver will call nir_lower_int64 itself and the frontend * should not do so. This may enable better optimization around address * modes. */ bool late_lower_int64; nir_lower_int64_options lower_int64_options; nir_lower_doubles_options lower_doubles_options; nir_divergence_options divergence_analysis_options; /** * The masks of shader stages that support indirect indexing with * load_input and store_output intrinsics. It's used by * nir_lower_io_passes. */ uint8_t support_indirect_inputs; uint8_t support_indirect_outputs; /** store the variable offset into the instrinsic range_base instead * of adding it to the image index. */ bool lower_image_offset_to_range_base; /** store the variable offset into the instrinsic range_base instead * of adding it to the atomic source */ bool lower_atomic_offset_to_range_base; /** Don't convert medium-precision casts (e.g. f2fmp) into concrete * type casts (e.g. f2f16). */ bool preserve_mediump; /** lowers fquantize2f16 to alu ops. */ bool lower_fquantize2f16; /** Lower f2f16 to f2f16_rtz when execution mode is not rtne. */ bool force_f2f16_rtz; /** Lower VARYING_SLOT_LAYER in FS to SYSTEM_VALUE_LAYER_ID. */ bool lower_layer_fs_input_to_sysval; /** clip/cull distance and tess level arrays use compact semantics */ bool compact_arrays; /** * Whether discard gets emitted as nir_intrinsic_demote. * Otherwise, nir_intrinsic_terminate is being used. */ bool discard_is_demote; /** Whether derivative intrinsics must be scalarized. */ bool scalarize_ddx; /** * Assign a range of driver locations to per-view outputs, with unique * slots for each view. If unset, per-view outputs will be treated * similarly to other arrayed IO, and only slots for one view will be * assigned. Regardless of this setting, per-view outputs are only assigned * slots for one value in var->data.location. */ bool per_view_unique_driver_locations; /** * Emit nir_intrinsic_store_per_view_output with compacted view indices * rather than absolute view indices. When using compacted indices, the Nth * index refers to the Nth enabled view, not the Nth absolute view. For * example, with view mask 0b1010, compacted index 0 is absolute index 1, * and compacted index 1 is absolute index 3. Note that compacted view * indices do not correspond directly to gl_ViewIndex. * * If compact_view_index is unset, per-view indices must be constant before * nir_lower_io. This can be guaranteed by calling nir_lower_io_temporaries * first. */ bool compact_view_index; /** Options determining lowering and behavior of inputs and outputs. */ nir_io_options io_options; /** * Bit mask of nir_lower_packing_op to skip lowering some nir ops in * nir_lower_packing(). */ unsigned skip_lower_packing_ops; /** Driver callback where drivers can define how to lower mediump. * Used by nir_lower_io_passes. */ void (*lower_mediump_io)(struct nir_shader *nir); /** * Return the maximum cost of an expression that's written to a shader * output that can be moved into the next shader to remove that output. * * Currently only uniform expressions are moved. A uniform expression is * any ALU expression sourcing only constants, uniforms, and UBO loads. * * Set to NULL or return 0 if you only want to propagate constants from * outputs to inputs. * * Drivers can set the maximum cost based on the types of consecutive * shaders or shader SHA1s. * * Drivers should also set "varying_estimate_instr_cost". */ unsigned (*varying_expression_max_cost)(struct nir_shader *consumer, struct nir_shader *producer); /** * Return the cost of an instruction that could be moved into the next * shader. If the cost of all instructions in an expression is <= * varying_expression_max_cost(), the instruction is moved. * * When this callback isn't set, nir_opt_varyings uses its own version. */ unsigned (*varying_estimate_instr_cost)(struct nir_instr *instr); /** * When the varying_expression_max_cost callback isn't set, this specifies * the maximum cost of a uniform expression that is allowed to be moved * from output stores into the next shader stage to eliminate those output * stores and corresponding inputs. * * 0 only allows propagating constants written to output stores to * the next shader. * * At least 2 is required for moving a uniform stored in an output into * the next shader according to default_varying_estimate_instr_cost. */ unsigned max_varying_expression_cost; /** * Used by nir_lower_explicit_io to determine the maximum offset_shift to * use when lowering the deref address of the given intrinsic. */ unsigned (*max_offset_shift)(nir_intrinsic_instr *, const void *); /** * Passed to the callbacks that accept a data pointer. */ const void *cb_data; /** Maximum amount of invocations per workgroup. */ unsigned max_workgroup_invocations; /** Maximum compute shader / kernel dispatchable work size. */ unsigned max_workgroup_count[3]; } nir_shader_compiler_options; #ifdef __cplusplus } #endif #endif /* NIR_SHADER_COMPILER_OPTIONS_H */