/* * Copyright 2024 Intel Corporation * SPDX-License-Identifier: MIT */ #pragma once #ifndef __OPENCL_VERSION__ #include #include "util/bitscan.h" #endif #include "compiler/shader_enums.h" #include "util/enum_operators.h" #ifdef __cplusplus extern "C" { #endif /** A tri-state value to track states that are potentially dynamic */ enum intel_sometimes { INTEL_NEVER = 0, INTEL_SOMETIMES, INTEL_ALWAYS }; static inline enum intel_sometimes intel_sometimes_invert(enum intel_sometimes x) { return (enum intel_sometimes)((int)INTEL_ALWAYS - (int)x); } #define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET (20) #define INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_MESH (32) enum intel_msaa_flags { /** Must be set whenever any dynamic MSAA is used * * This flag mostly exists to let us assert that the driver understands * dynamic MSAA so we don't run into trouble with drivers that don't. */ INTEL_MSAA_FLAG_ENABLE_DYNAMIC = (1 << 0), /** True if the framebuffer is multisampled */ INTEL_MSAA_FLAG_MULTISAMPLE_FBO = (1 << 1), /** True if this shader has been dispatched per-sample */ INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH = (1 << 2), /** True if inputs should be interpolated per-sample by default */ INTEL_MSAA_FLAG_PERSAMPLE_INTERP = (1 << 3), /** True if this shader has been dispatched with alpha-to-coverage */ INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE = (1 << 4), /** True if this shader has been dispatched coarse * * This is intentionally chose to be bit 15 to correspond to the coarse bit * in the pixel interpolator messages. */ INTEL_MSAA_FLAG_COARSE_PI_MSG = (1 << 15), /** True if this shader has been dispatched coarse * * This is intentionally chose to be bit 18 to correspond to the coarse bit * in the render target messages. */ INTEL_MSAA_FLAG_COARSE_RT_WRITES = (1 << 18), /** Index of the PrimitiveID attribute relative to the first read * attribute. * * This is not a flag but a value that cover bits 20:31. Value 32 means the * PrimitiveID is coming from the PerPrimitive block, written by the Mesh * shader. */ INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX = (1 << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET), }; MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(intel_msaa_flags) /** * @defgroup Tessellator parameter enumerations. * * These correspond to the hardware values in 3DSTATE_TE, and are provided * as part of the tessellation evaluation shader. * * @{ */ enum intel_tess_partitioning { INTEL_TESS_PARTITIONING_INTEGER = 0, INTEL_TESS_PARTITIONING_ODD_FRACTIONAL = 1, INTEL_TESS_PARTITIONING_EVEN_FRACTIONAL = 2, }; enum intel_tess_output_topology { INTEL_TESS_OUTPUT_TOPOLOGY_POINT = 0, INTEL_TESS_OUTPUT_TOPOLOGY_LINE = 1, INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2, INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3, }; enum intel_tess_domain { INTEL_TESS_DOMAIN_QUAD = 0, INTEL_TESS_DOMAIN_TRI = 1, INTEL_TESS_DOMAIN_ISOLINE = 2, }; /** @} */ enum intel_shader_dispatch_mode { INTEL_DISPATCH_MODE_4X1_SINGLE = 0, INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT = 2, INTEL_DISPATCH_MODE_SIMD8 = 3, INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH = 0, INTEL_DISPATCH_MODE_TCS_MULTI_PATCH = 2, }; enum intel_barycentric_mode { INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL = 0, INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID = 1, INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE = 2, INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL = 3, INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4, INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE = 5, INTEL_BARYCENTRIC_MODE_COUNT = 6 }; #define INTEL_BARYCENTRIC_PERSPECTIVE_BITS \ ((1 << INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL) | \ (1 << INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID) | \ (1 << INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE)) #define INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS \ ((1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \ (1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \ (1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)) enum intel_vue_layout { /** * Layout is fixed and shared by producer/consumer, allowing for tigh * packing */ INTEL_VUE_LAYOUT_FIXED = 0, /** * Layout is separate, works for ARB_separate_shader_objects but without * Mesh support. */ INTEL_VUE_LAYOUT_SEPARATE, /** * Layout is separate and works with Mesh shaders. */ INTEL_VUE_LAYOUT_SEPARATE_MESH, }; /** * Data structure recording the relationship between the gl_varying_slot enum * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a * single octaword within the VUE (128 bits). * * Note that each BRW register contains 256 bits (2 octawords), so when * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as * in a vertex shader), each register corresponds to a single VUE slot, since * it contains data for two separate vertices. */ struct intel_vue_map { /** * Bitfield representing all varying slots that are (a) stored in this VUE * map, and (b) actually written by the shader. Does not include any of * the additional varying slots defined in brw_varying_slot. */ uint64_t slots_valid; /** * The layout of the VUE * * Separable programs (GL_ARB_separate_shader_objects) can be mixed and * matched without the linker having a chance to dead code eliminate unused * varyings. * * This means that we have to use a fixed slot layout, based on the output's * location field, rather than assigning slots in a compact contiguous block. * * When using Mesh, another constraint arises which is the HW limits for * loading per-primitive & per-vertex data, limited to 32 varying in total. * This requires us to be quite inventive with the way we lay things out. * Take a fragment shader loading the following data : * * float gl_ClipDistance[]; * uint gl_PrimitiveID; * vec4 someAppValue[29]; * * According to the Vulkan spec, someAppValue will occupy 29 slots, * gl_PrimitiveID 1 slot, gl_ClipDistance[] up to 2 slots. If the input is * coming from a VS/DS/GS shader, we can load all of this through a single * block using 3DSTATE_SBE::VertexURBEntryReadLength = 16 (maximum * programmable value) and the layout with * BRW_VUE_MAP_LAYOUT_FIXED/BRW_VUE_MAP_LAYOUT_SEPARATE will be this : * * ----------------------- * | gl_ClipDistance 0-3 | * |---------------------| * | gl_ClipDistance 4-7 | * |---------------------| * | gl_PrimitiveID | * |---------------------| * | someAppValue[] | * |---------------------| * * This works nicely as everything is coming from the same location in the * URB. * * When mesh shaders are involved, gl_PrimitiveID is located in a different * place in the URB (the per-primitive block) and requires programming * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength to load some * additional data. The HW has a limit such that * 3DSTATE_SBE_MESH::PerPrimitiveURBEntryOutputReadLength + * 3DSTATE_SBE_MESH::PerVertexURBEntryOutputReadLength <= 16. With the * layout above, we would not be able to accomodate that HW limit. * * The solution to this is to lay the built-in varyings out * (gl_ClipDistance omitted since it's part of the VUE header and cannot * live any other place) at the end of the VUE like this : * * ----------------------- * | gl_ClipDistance 0-3 | * |---------------------| * | gl_ClipDistance 4-7 | * |---------------------| * | someAppValue[] | * |---------------------| * | gl_PrimitiveID | * |---------------------| * * This layout adds another challenge because with separate shader * compilations, we cannot tell in the consumer shader how many outputs the * producer has, so we don't know where the gl_PrimitiveID lives. The * solution to this other problem is to read the built-in with a * MOV_INDIRECT and have the offset of the MOV_INDIRECT loaded through a * push constant. */ enum intel_vue_layout layout; /** * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are * not stored in a slot (because they are not written, or because * additional processing is applied before storing them in the VUE), the * value is -1. */ signed char varying_to_slot[VARYING_SLOT_TESS_MAX]; /** * Map from VUE slot to gl_varying_slot value. For slots that do not * directly correspond to a gl_varying_slot, the value comes from * brw_varying_slot. * * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD. */ signed char slot_to_varying[VARYING_SLOT_TESS_MAX]; /** * Total number of VUE slots in use */ int num_slots; /** * Number of position VUE slots. If num_pos_slots > 1, primitive * replication is being used. */ int num_pos_slots; /** * Number of per-patch VUE slots. Only valid for tessellation control * shader outputs and tessellation evaluation shader inputs. */ int num_per_patch_slots; /** * Number of per-vertex VUE slots. Only valid for tessellation control * shader outputs and tessellation evaluation shader inputs. */ int num_per_vertex_slots; }; struct intel_cs_dispatch_info { uint32_t group_size; uint32_t simd_size; uint32_t threads; /* RightExecutionMask field used in GPGPU_WALKER. */ uint32_t right_mask; }; enum intel_compute_walk_order { INTEL_WALK_ORDER_XYZ = 0, INTEL_WALK_ORDER_XZY = 1, INTEL_WALK_ORDER_YXZ = 2, INTEL_WALK_ORDER_YZX = 3, INTEL_WALK_ORDER_ZXY = 4, INTEL_WALK_ORDER_ZYX = 5, }; static inline bool intel_fs_is_persample(enum intel_sometimes shader_persample_dispatch, bool shader_per_sample_shading, enum intel_msaa_flags pushed_msaa_flags) { if (shader_persample_dispatch != INTEL_SOMETIMES) return shader_persample_dispatch; assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC); if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO)) return false; if (shader_per_sample_shading) assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH); return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0; } static inline uint32_t intel_fs_barycentric_modes(enum intel_sometimes shader_persample_dispatch, uint32_t shader_barycentric_modes, enum intel_msaa_flags pushed_msaa_flags) { /* In the non dynamic case, we can just return the computed shader_barycentric_modes from * compilation time. */ if (shader_persample_dispatch != INTEL_SOMETIMES) return shader_barycentric_modes; uint32_t modes = shader_barycentric_modes; assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC); if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) { assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH); /* Making dynamic per-sample interpolation work is a bit tricky. The * hardware will hang if SAMPLE is requested but per-sample dispatch is * not enabled. This means we can't preemptively add SAMPLE to the * barycentrics bitfield. Instead, we have to add it late and only * on-demand. Annoyingly, changing the number of barycentrics requested * changes the whole PS shader payload so we very much don't want to do * that. Instead, if the dynamic per-sample interpolation flag is set, * we check to see if SAMPLE was requested and, if not, replace the * highest barycentric bit in the [non]perspective grouping (CENTROID, * if it exists, else PIXEL) with SAMPLE. The shader will stomp all the * barycentrics in the shader with SAMPLE so it really doesn't matter * which one we replace. The important thing is that we keep the number * of barycentrics in each [non]perspective grouping the same. */ if ((modes & INTEL_BARYCENTRIC_PERSPECTIVE_BITS) && !(modes & BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE))) { int sample_mode = util_last_bit(modes & INTEL_BARYCENTRIC_PERSPECTIVE_BITS) - 1; assert(modes & BITFIELD_BIT(sample_mode)); modes &= ~BITFIELD_BIT(sample_mode); modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE); } if ((modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) && !(modes & BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) { int sample_mode = util_last_bit(modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1; assert(modes & BITFIELD_BIT(sample_mode)); modes &= ~BITFIELD_BIT(sample_mode); modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE); } } else { /* If we're not using per-sample interpolation, we need to disable the * per-sample bits. * * SKL PRMs, Volume 2a: Command Reference: Instructions, * 3DSTATE_WM:Barycentric Interpolation Mode: * "MSDISPMODE_PERSAMPLE is required in order to select Perspective * Sample or Non-perspective Sample barycentric coordinates." */ uint32_t sample_bits = (BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE) | BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)); uint32_t requested_sample = modes & sample_bits; modes &= ~sample_bits; /* * If the shader requested some sample modes and we have to disable * them, make sure we add back the pixel variant back to not mess up the * thread payload. * * Why does this works out? Because of the ordering in the thread payload : * * R7:10 Perspective Centroid Barycentric * R11:14 Perspective Sample Barycentric * R15:18 Linear Pixel Location Barycentric * * In the backend when persample dispatch is dynamic, we always select * the sample barycentric and turn off the pixel location (even if * requested through intrinsics). That way when we dynamically select * pixel or sample dispatch, the barycentric always match, since the * pixel location barycentric register offset will align with the sample * barycentric. */ if (requested_sample) { if (requested_sample & BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE)) modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL); if (requested_sample & BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)) modes |= BITFIELD_BIT(INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL); } } return modes; } static inline bool intel_fs_is_coarse(enum intel_sometimes shader_coarse_pixel_dispatch, enum intel_msaa_flags pushed_msaa_flags) { if (shader_coarse_pixel_dispatch != INTEL_SOMETIMES) return shader_coarse_pixel_dispatch; assert(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC); assert((pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES) ? shader_coarse_pixel_dispatch != INTEL_NEVER : shader_coarse_pixel_dispatch != INTEL_ALWAYS); return (pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES) != 0; } struct intel_fs_params { bool shader_sample_shading; float shader_min_sample_shading; bool state_sample_shading; uint32_t rasterization_samples; bool coarse_pixel; bool alpha_to_coverage; uint32_t primitive_id_index; }; static inline enum intel_msaa_flags intel_fs_msaa_flags(struct intel_fs_params params) { enum intel_msaa_flags fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC; if (params.rasterization_samples > 1) { fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO; if (params.shader_sample_shading) fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH; if (params.shader_sample_shading || (params.state_sample_shading && (params.shader_min_sample_shading * params.rasterization_samples) > 1)) { fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH | INTEL_MSAA_FLAG_PERSAMPLE_INTERP; } } if (!(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) && params.coarse_pixel) { fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG | INTEL_MSAA_FLAG_COARSE_RT_WRITES; } if (params.alpha_to_coverage) fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE; fs_msaa_flags |= (enum intel_msaa_flags)( params.primitive_id_index << INTEL_MSAA_FLAG_PRIMITIVE_ID_INDEX_OFFSET); return fs_msaa_flags; } #ifdef __cplusplus } /* extern "C" */ #endif